OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringOps.h"
18 #include "Shared/base64.h"
19 
20 #include <rapidjson/document.h>
21 #include <boost/algorithm/string/predicate.hpp>
22 
23 namespace StringOps_Namespace {
24 
25 boost::regex StringOp::generateRegex(const std::string& op_name,
26  const std::string& regex_pattern,
27  const std::string& regex_params,
28  const bool supports_sub_matches) {
29  bool is_case_sensitive = false;
30  bool is_case_insensitive = false;
31 
32  for (const auto& c : regex_params) {
33  switch (c) {
34  case 'c':
35  is_case_sensitive = true;
36  break;
37  case 'i':
38  is_case_insensitive = true;
39  break;
40  case 'e': {
41  if (!supports_sub_matches) {
42  throw std::runtime_error(op_name +
43  " does not support 'e' (sub-matches) option.");
44  }
45  // We use e to set sub-expression group in a separate initializer
46  // but need to have this entry to not error on the default path
47  break;
48  }
49  default: {
50  if (supports_sub_matches) {
51  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
52  ", expected either 'c' 'i', or 'e'.");
53  }
54  throw std::runtime_error("Unrecognized regex parameter for " + op_name +
55  ", expected either 'c' or 'i'.");
56  }
57  }
58  }
59  if (!is_case_sensitive && !is_case_insensitive) {
60  throw std::runtime_error(op_name +
61  " params must either specify case-sensitivity ('c') or "
62  "case-insensitivity ('i').");
63  }
64  if (is_case_sensitive && is_case_insensitive) {
65  throw std::runtime_error(op_name +
66  " params cannot specify both case-sensitivity ('c') and "
67  "case-insensitivity ('i').");
68  }
69  if (is_case_insensitive) {
70  return boost::regex(regex_pattern,
71  boost::regex_constants::extended |
72  boost::regex_constants::optimize |
73  boost::regex_constants::icase);
74  } else {
75  return boost::regex(
76  regex_pattern,
77  boost::regex_constants::extended | boost::regex_constants::optimize);
78  }
79 }
80 
81 NullableStrType TryStringCast::operator()(const std::string& str) const {
82  UNREACHABLE() << "Invalid string output for TryStringCast";
83  return NullableStrType();
84 }
85 
86 Datum TryStringCast::numericEval(const std::string_view str) const {
87  if (str.empty()) {
88  return NullDatum(return_ti_);
89  }
90  // Need to make copy for now b/c StringToDatum can mod SQLTypeInfo arg
91  SQLTypeInfo return_ti(return_ti_);
92  try {
93  return StringToDatum(str, return_ti);
94  } catch (std::runtime_error& e) {
95  return NullDatum(return_ti);
96  }
97 }
98 
99 NullableStrType Position::operator()(const std::string& str) const {
100  UNREACHABLE() << "Invalid string output for Position";
101  return {};
102 }
103 
104 Datum Position::numericEval(const std::string_view str) const {
105  if (str.empty()) {
106  return NullDatum(return_ti_);
107  } else {
108  const int64_t str_len = str.size();
109  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
110  Datum return_datum;
111  const auto search_index = str.find(search_str_, wrapped_start);
112  if (search_index == std::string::npos) {
113  return_datum.bigintval = 0;
114  } else {
115  return_datum.bigintval = static_cast<int64_t>(search_index) + 1;
116  }
117  return return_datum;
118  }
119 }
120 
121 // Prefix length to consider for the Jaro-Winkler score (number of leading
// characters; presumably the cap on the common-prefix bonus — confirm at the
// use site in compute_jaro_winkler_score).
122 constexpr int winkler_k_prefix_length = 4;
123 
124 // Scaling factor for the adjustment of the score: each matching prefix
// character adds this fraction of the remaining (1 - jaro_score) gap.
125 constexpr double winkler_k_scaling_factor = 0.1;
126 
// Computes the Jaro similarity of s1 and s2 as a value in [0.0, 1.0].
//
// Two characters "match" when they are equal and their positions differ by at
// most match_distance. The score is the mean of
//   matches / |s1|,  matches / |s2|,  (matches - transpositions / 2) / matches
// where transpositions counts matched characters that appear in a different
// order in the two strings. Returns 0.0 when either string is empty or when
// there are no matches.
double compute_jaro_score(std::string_view s1, std::string_view s2) {
  const int s1_len = static_cast<int>(s1.size());
  const int s2_len = static_cast<int>(s2.size());

  if (s1_len == 0 || s2_len == 0) {
    return 0.0;
  }

  // Clamp at zero: for two one-character strings max/2 - 1 evaluates to -1,
  // which produced an empty search window and scored identical strings as 0.
  const int match_distance = std::max(0, std::max(s1_len, s2_len) / 2 - 1);
  std::vector<bool> s1_match(s1_len, false);
  std::vector<bool> s2_match(s2_len, false);

  // Pass 1: greedily pair each s1 character with the first unused equal
  // character of s2 inside the window.
  int matches = 0;
  for (int i = 0; i < s1_len; ++i) {
    const int window_begin = std::max(0, i - match_distance);
    const int window_end = std::min(i + match_distance + 1, s2_len);
    for (int j = window_begin; j < window_end; ++j) {
      if (s2_match[j] || s1[i] != s2[j]) {
        continue;
      }
      s1_match[i] = true;
      s2_match[j] = true;
      ++matches;
      break;
    }
  }

  if (matches == 0) {
    return 0.0;
  }

  // Pass 2: walk the matched characters of both strings in order and count
  // the positions at which they disagree. Both strings have exactly `matches`
  // flagged characters, so k cannot run past the end of s2_match.
  int transpositions = 0;
  int k = 0;
  for (int i = 0; i < s1_len; ++i) {
    if (!s1_match[i]) {
      continue;
    }
    while (!s2_match[k]) {
      ++k;
    }
    if (s1[i] != s2[k]) {
      ++transpositions;
    }
    ++k;
  }

  const double m = matches;
  return ((m / static_cast<double>(s1_len)) + (m / static_cast<double>(s2_len)) +
          ((m - transpositions / 2.0) / m)) /
         3.0;
}
184 
185 double compute_jaro_winkler_score(std::string_view s1, std::string_view s2) {
186  double jaro_score = compute_jaro_score(s1, s2);
187 
188  int l = 0;
189  int n = std::min({static_cast<int>(s1.size()),
190  static_cast<int>(s2.size()),
192 
193  for (; l < n; ++l) {
194  if (s1[l] != s2[l]) {
195  break;
196  }
197  }
198 
199  double winkler_adjustment = l * winkler_k_scaling_factor * (1 - jaro_score);
200  double jaro_winkler_score = jaro_score + winkler_adjustment;
201 
202  return jaro_winkler_score * 100;
203 }
204 
205 NullableStrType JarowinklerSimilarity::operator()(const std::string& str) const {
206  UNREACHABLE() << "Invalid string output for Jarowinkler Similarity";
207  return {};
208 }
209 
210 Datum JarowinklerSimilarity::numericEval(const std::string_view str) const {
211  if (str.empty()) {
212  return NullDatum(return_ti_);
213  }
214  const double jaro_winkler_score = compute_jaro_winkler_score(str, str_literal_);
215  Datum return_datum;
216  return_datum.bigintval = static_cast<int64_t>(std::round(jaro_winkler_score));
217  return return_datum;
218 }
219 
220 Datum JarowinklerSimilarity::numericEval(const std::string_view str1,
221  const std::string_view str2) const {
222  if (str1.empty() || str2.empty()) {
223  return NullDatum(return_ti_);
224  }
225  const double jaro_winkler_score = compute_jaro_winkler_score(str1, str2);
226  Datum return_datum;
227  return_datum.bigintval = static_cast<int64_t>(std::round(jaro_winkler_score));
228  return return_datum;
229 }
230 
231 // int64_t compute_levenshtein_distance(std::string_view s1, std::string_view s2) {
232 // const size_t len1 = s1.size(), len2 = s2.size();
233 // std::vector<std::vector<size_t>> d(len1 + 1, std::vector<size_t>(len2 + 1));
234 //
235 // d[0][0] = 0;
236 // for (size_t i = 1; i <= len1; ++i) {
237 // d[i][0] = i;
238 // }
239 // for (size_t i = 1; i <= len2; ++i) {
240 // d[0][i] = i;
241 // }
242 //
243 // for (size_t i = 1; i <= len1; ++i) {
244 // for (size_t j = 1; j <= len2; ++j) {
245 // d[i][j] = std::min({d[i - 1][j] + 1,
246 // d[i][j - 1] + 1,
247 // d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1)});
248 // }
249 // }
250 //
251 // return d[len1][len2];
252 // }
253 
// Computes the Levenshtein (edit) distance between s1 and s2: the minimum
// number of single-character insertions, deletions and substitutions needed to
// turn one into the other.
//
// T is the integer type used to store table cells and the result; the caller
// must pick a T that can hold max(|s1|, |s2|), the largest possible distance.
//
// Only two rows of the dynamic-programming table are kept alive, so memory is
// O(|s2|) rather than O(|s1| * |s2|). Each minimum is taken in 64-bit
// arithmetic before narrowing to T, so an intermediate "+ 1" cannot wrap a
// narrow T.
template <typename T>
T compute_levenshtein_distance_template(std::string_view s1, std::string_view s2) {
  const size_t len1 = s1.size();
  const size_t len2 = s2.size();

  // prev_row[j]: distance between the first (i - 1) chars of s1 and the first j
  // chars of s2. curr_row is the row being filled for i.
  std::vector<T> prev_row(len2 + 1);
  std::vector<T> curr_row(len2 + 1);
  for (size_t j = 0; j <= len2; ++j) {
    prev_row[j] = static_cast<T>(j);
  }

  for (size_t i = 1; i <= len1; ++i) {
    curr_row[0] = static_cast<T>(i);
    for (size_t j = 1; j <= len2; ++j) {
      const uint64_t deletion = static_cast<uint64_t>(prev_row[j]) + 1;
      const uint64_t insertion = static_cast<uint64_t>(curr_row[j - 1]) + 1;
      const uint64_t substitution =
          static_cast<uint64_t>(prev_row[j - 1]) + (s1[i - 1] == s2[j - 1] ? 0 : 1);
      curr_row[j] = static_cast<T>(std::min({deletion, insertion, substitution}));
    }
    prev_row.swap(curr_row);
  }

  // After the final swap the last completed row lives in prev_row.
  return prev_row[len2];
}
277 
278 int64_t compute_levenshtein_distance(std::string_view s1, std::string_view s2) {
279  const size_t max_len = std::max(s1.size(), s2.size());
280 
281  if (max_len < 256) {
282  return compute_levenshtein_distance_template<uint8_t>(s1, s2);
283  } else if (max_len < 65536) {
284  return compute_levenshtein_distance_template<uint16_t>(s1, s2);
285  } else if (max_len < std::numeric_limits<uint32_t>::max()) {
286  return compute_levenshtein_distance_template<uint32_t>(s1, s2);
287  } else {
288  return compute_levenshtein_distance_template<uint64_t>(s1, s2);
289  }
290 }
291 
292 NullableStrType LevenshteinDistance::operator()(const std::string& str) const {
293  UNREACHABLE() << "Invalid string output for Levenshtein Distance";
294  return {};
295 }
296 
297 Datum LevenshteinDistance::numericEval(const std::string_view str) const {
298  if (str.empty()) {
299  return NullDatum(return_ti_);
300  }
301  const double levenshtein_distance = compute_levenshtein_distance(str, str_literal_);
302  Datum return_datum;
303  return_datum.bigintval = static_cast<int64_t>(std::round(levenshtein_distance));
304  return return_datum;
305 }
306 
307 Datum LevenshteinDistance::numericEval(const std::string_view str1,
308  const std::string_view str2) const {
309  if (str1.empty() || str2.empty()) {
310  return NullDatum(return_ti_);
311  }
312  const double levenshtein_distance = compute_levenshtein_distance(str1, str2);
313  Datum return_datum;
314  return_datum.bigintval = static_cast<int64_t>(std::round(levenshtein_distance));
315  return return_datum;
316 }
317 
318 NullableStrType Hash::operator()(const std::string& str) const {
319  UNREACHABLE() << "Invalid string output for Hash";
320  return {};
321 }
322 
323 Datum Hash::numericEval(const std::string_view str) const {
324  if (str.empty()) {
325  return NullDatum(return_ti_);
326  } else {
327  uint64_t str_hash = 1;
328  // rely on fact that unsigned overflow is defined and wraps
329  for (size_t i = 0; i < str.size(); ++i) {
330  str_hash = str_hash * 997u + static_cast<unsigned char>(str[i]);
331  }
332  Datum return_datum;
333  return_datum.bigintval = static_cast<int64_t>(str_hash);
334  return return_datum;
335  }
336 }
337 
338 NullableStrType Lower::operator()(const std::string& str) const {
339  std::string output_str(str);
341  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
342  return std::tolower(c);
343  });
344  return output_str;
345 }
346 
347 NullableStrType Upper::operator()(const std::string& str) const {
348  std::string output_str(str);
350  output_str.begin(), output_str.end(), output_str.begin(), [](unsigned char c) {
351  return std::toupper(c);
352  });
353  return output_str;
354 }
355 
356 NullableStrType InitCap::operator()(const std::string& str) const {
357  std::string output_str(str);
358  bool last_char_whitespace = true; // Beginning of string counts as whitespace
359  for (auto& c : output_str) {
360  if (isspace(c) || delimiter_bitmap_[reinterpret_cast<const uint8_t&>(c)]) {
361  last_char_whitespace = true;
362  continue;
363  }
364  if (last_char_whitespace) {
365  c = toupper(c);
366  last_char_whitespace = false;
367  } else {
368  c = tolower(c);
369  }
370  }
371  return output_str;
372 }
373 
374 NullableStrType Reverse::operator()(const std::string& str) const {
375  const std::string reversed_str = std::string(str.rbegin(), str.rend());
376  return reversed_str;
377 }
378 
379 NullableStrType Repeat::operator()(const std::string& str) const {
380  std::string repeated_str;
381  repeated_str.reserve(str.size() * n_);
382  for (size_t r = 0; r < n_; ++r) {
383  repeated_str += str;
384  }
385  return repeated_str;
386 }
387 
388 NullableStrType Concat::operator()(const std::string& str) const {
389  return reverse_order_ ? str_literal_ + str : str + str_literal_;
390 }
391 
392 NullableStrType Concat::operator()(const std::string& str1,
393  const std::string& str2) const {
394  return str1 + str2;
395 }
396 
397 NullableStrType Pad::operator()(const std::string& str) const {
398  return pad_mode_ == Pad::PadMode::LEFT ? lpad(str) : rpad(str);
399 }
400 
401 std::string Pad::lpad(const std::string& str) const {
402  const auto str_len = str.size();
403  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
404  if (chars_to_fill == 0UL) {
405  return str.substr(0, padded_length_);
406  }
407  // If here we need to add characters from the padding_string_
408  // to fill the difference between str_len and padded_length_
409  if (padding_string_length_ == 1UL) {
410  return std::string(chars_to_fill, padding_char_) + str;
411  }
412 
413  std::string fitted_padding_str;
414  fitted_padding_str.reserve(chars_to_fill);
415  for (size_t i = 0; i < chars_to_fill; ++i) {
416  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
417  }
418  return fitted_padding_str + str;
419 }
420 
421 std::string Pad::rpad(const std::string& str) const {
422  const auto str_len = str.size();
423  const size_t chars_to_fill = str_len < padded_length_ ? padded_length_ - str_len : 0UL;
424  if (chars_to_fill == 0UL) {
425  return str.substr(str_len - padded_length_, std::string::npos);
426  }
427  // If here we need to add characters from the padding_string_
428  // to fill the difference between str_len and padded_length_
429  if (padding_string_length_ == 1UL) {
430  return str + std::string(chars_to_fill, padding_char_);
431  }
432 
433  std::string fitted_padding_str;
434  fitted_padding_str.reserve(chars_to_fill);
435  for (size_t i = 0; i < chars_to_fill; ++i) {
436  fitted_padding_str.push_back(padding_string_[i % padding_string_length_]);
437  }
438  return str + fitted_padding_str;
439 }
440 
// Maps a string-operator kind to the side on which Pad adds characters.
// Any kind without its own case hits UNREACHABLE().
// NOTE(review): the two `case` labels that precede the LEFT and RIGHT returns
// (listing lines 443 and 445) are absent from this copy of the source, so the
// operator kinds that select each mode cannot be confirmed from here — restore
// them from the original file before building.
441 Pad::PadMode Pad::op_kind_to_pad_mode(const SqlStringOpKind op_kind) {
442  switch (op_kind) {
444  return PadMode::LEFT;
446  return PadMode::RIGHT;
447  default:
448  UNREACHABLE();
449  // Not reachable, but make compiler happy
450  return PadMode::LEFT;
451  };
452 }
453 
454 NullableStrType Trim::operator()(const std::string& str) const {
455  const auto str_len = str.size();
456  size_t trim_begin = 0;
457  if (trim_mode_ == TrimMode::LEFT || trim_mode_ == TrimMode::BOTH) {
458  while (trim_begin < str_len &&
459  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_begin])]) {
460  ++trim_begin;
461  }
462  }
463  size_t trim_end = str_len - 1;
464  if (trim_mode_ == TrimMode::RIGHT || trim_mode_ == TrimMode::BOTH) {
465  while (trim_end > trim_begin &&
466  trim_char_bitmap_[reinterpret_cast<const uint8_t&>(str[trim_end])]) {
467  --trim_end;
468  }
469  }
470  if (trim_begin == 0 && trim_end == str_len - 1) {
471  return str;
472  }
473  return str.substr(trim_begin, trim_end + 1 - trim_begin);
474 }
475 
// Maps a string-operator kind to the side(s) from which Trim strips characters.
// Any kind without its own case hits UNREACHABLE().
// NOTE(review): the three `case` labels that precede the BOTH, LEFT and RIGHT
// returns (listing lines 478, 480 and 482) are absent from this copy of the
// source, so the operator kinds that select each mode cannot be confirmed from
// here — restore them from the original file before building.
476 Trim::TrimMode Trim::op_kind_to_trim_mode(const SqlStringOpKind op_kind) {
477  switch (op_kind) {
479  return Trim::TrimMode::BOTH;
481  return Trim::TrimMode::LEFT;
483  return Trim::TrimMode::RIGHT;
484  default:
485  UNREACHABLE();
486  // Not reachable, but make compiler happy
487  return Trim::TrimMode::BOTH;
488  };
489 }
490 
491 NullableStrType Substring::operator()(const std::string& str) const {
492  // If start_ is negative then we start abs(start_) characters from the end
493  // of the string
494  const int64_t str_len = str.size();
495  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
496  const size_t capped_start =
497  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
498  return str.substr(capped_start, length_);
499 }
500 
501 NullableStrType Overlay::operator()(const std::string& base_str) const {
502  // If start_ is negative then we start abs(start_) characters from the end
503  // of the string
504  const int64_t str_len = base_str.size();
505  const int64_t wrapped_start = start_ >= 0 ? start_ : str_len + start_;
506  const size_t capped_start =
507  wrapped_start > str_len ? str_len : (wrapped_start < 0 ? 0 : wrapped_start);
508  std::string replaced_str = base_str.substr(0, capped_start);
509  replaced_str += insert_str_;
510  const size_t remainder_start =
511  std::min(wrapped_start + replacement_length_, size_t(str_len));
512  const size_t remainder_length = static_cast<size_t>(str_len) - remainder_start;
513  replaced_str += base_str.substr(remainder_start, remainder_length);
514  return replaced_str;
515 }
516 
517 NullableStrType Replace::operator()(const std::string& str) const {
518  std::string replaced_str(str);
519 
520  size_t search_start_index = 0;
521  while (true) {
522  search_start_index = replaced_str.find(pattern_str_, search_start_index);
523  if (search_start_index == std::string::npos) {
524  break;
525  }
526  replaced_str.replace(search_start_index, pattern_str_len_, replacement_str_);
527  search_start_index += replacement_str_len_;
528  }
529  return replaced_str;
530 }
531 
// Returns the split_part_-th piece of str when str is split on delimiter_.
// With reverse_ set, delimiters are searched for from the end of the string
// (rfind) instead of from the start.
// Returns str unchanged when delimiter_ is empty, or when no delimiter is found
// and part 1 is requested; returns a null NullableStrType when the requested
// part is out of range.
// NOTE(review): the forward search always starts at
// delimiter_pos + delimiter_length_, including on the first iteration where
// delimiter_pos is 0, so a delimiter located at index 0 is never found —
// confirm that is intended.
// NOTE(review): in reverse mode, once a delimiter is found at index 0 the next
// rfind start is 0 - 1UL, i.e. npos, which searches from the end of the string
// again — confirm behaviour for strings that begin with the delimiter.
532 NullableStrType SplitPart::operator()(const std::string& str) const {
533  // If split_part_ is negative then it is taken as the number
534  // of split parts from the end of the string
535 
536  if (delimiter_ == "") {
537  return str;
538  }
539 
540  const size_t str_len = str.size();
541  size_t delimiter_pos = reverse_ ? str_len : 0UL;
542  size_t last_delimiter_pos;
543  size_t delimiter_idx = 0UL;
544 
  // Advance from delimiter to delimiter until split_part_ of them have been
  // seen or none is left; last_delimiter_pos trails one step behind.
545  do {
546  last_delimiter_pos = delimiter_pos;
547  delimiter_pos = reverse_ ? str.rfind(delimiter_, delimiter_pos - 1UL)
548  : str.find(delimiter_, delimiter_pos + delimiter_length_);
549  } while (delimiter_pos != std::string::npos && ++delimiter_idx < split_part_);
550 
551  if (delimiter_idx == 0UL && split_part_ == 1UL) {
552  // No delimiter was found, but the first match is requested, which here is
553  // the whole string
554  return str;
555  }
556 
557  if (delimiter_pos == std::string::npos &&
558  (delimiter_idx < split_part_ - 1UL || delimiter_idx < 1UL)) {
559  // split_part_ was out of range
560  return NullableStrType(); // null string
561  }
562 
  // The requested piece lies between the last two delimiter positions visited
  // (or between one of them and the corresponding end of the string).
563  if (reverse_) {
564  const size_t substr_start =
565  delimiter_pos == std::string::npos ? 0UL : delimiter_pos + delimiter_length_;
566  return str.substr(substr_start, last_delimiter_pos - substr_start);
567  } else {
568  const size_t substr_start =
569  split_part_ == 1UL ? 0UL : last_delimiter_pos + delimiter_length_;
570  return str.substr(substr_start, delimiter_pos - substr_start);
571  }
572 }
573 
574 NullableStrType RegexpReplace::operator()(const std::string& str) const {
575  const int64_t str_len = str.size();
576  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
577  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
578  if (occurrence_ == 0L) {
579  std::string result;
580  std::string::const_iterator replace_start(str.cbegin() + wrapped_start);
581  boost::regex_replace(std::back_inserter(result),
582  replace_start,
583  str.cend(),
584  regex_pattern_,
585  replacement_);
586  return str.substr(0UL, wrapped_start) + result;
587  } else {
588  const auto occurrence_match_pos = RegexpReplace::get_nth_regex_match(
589  str,
590  wrapped_start,
591  regex_pattern_,
592  occurrence_ > 0 ? occurrence_ - 1 : occurrence_);
593  if (occurrence_match_pos.first == std::string::npos) {
594  // No match found, return original string
595  return str;
596  }
597  std::string result;
598  std::string::const_iterator replace_start(str.cbegin() + occurrence_match_pos.first);
599  std::string::const_iterator replace_end(str.cbegin() + occurrence_match_pos.second);
600  std::string replaced_match;
601  boost::regex_replace(std::back_inserter(replaced_match),
602  replace_start,
603  replace_end,
604  regex_pattern_,
605  replacement_);
606  return str.substr(0UL, occurrence_match_pos.first) + replaced_match +
607  str.substr(occurrence_match_pos.second, std::string::npos);
608  }
609 }
610 
611 std::pair<size_t, size_t> RegexpReplace::get_nth_regex_match(
612  const std::string& str,
613  const size_t start_pos,
614  const boost::regex& regex_pattern,
615  const int64_t occurrence) {
616  std::vector<std::pair<size_t, size_t>> regex_match_positions;
617  std::string::const_iterator search_start(str.cbegin() + start_pos);
618  boost::smatch match;
619  int64_t match_idx = 0;
620  size_t string_pos = start_pos;
621  while (boost::regex_search(search_start, str.cend(), match, regex_pattern)) {
622  string_pos += match.position(size_t(0)) + match.length(0);
623  regex_match_positions.emplace_back(
624  std::make_pair(string_pos - match.length(0), string_pos));
625  if (match_idx++ == occurrence) {
626  return regex_match_positions.back();
627  }
628  search_start =
629  match.suffix().first; // Move to position after last char of matched string
630  // Position is relative to last match/initial iterator, so need to increment our
631  // string_pos accordingly
632  }
633  // occurrence only could have a valid match if negative here,
634  // but don't want to check in inner loop for performance reasons
635  const int64_t wrapped_match = occurrence >= 0 ? occurrence : match_idx + occurrence;
636  if (wrapped_match < 0 || wrapped_match >= match_idx) {
637  // Represents a non-match
638  return std::make_pair(std::string::npos, std::string::npos);
639  }
640  return regex_match_positions[wrapped_match];
641 }
642 
643 NullableStrType RegexpSubstr::operator()(const std::string& str) const {
644  const int64_t str_len = str.size();
645  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
646  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
647  int64_t match_idx = 0;
648  // Apears std::regex_search does not support string_view?
649  std::vector<std::string> regex_matches;
650  std::string::const_iterator search_start(str.cbegin() + wrapped_start);
651  boost::smatch match;
652  while (boost::regex_search(search_start, str.cend(), match, regex_pattern_)) {
653  if (match_idx++ == occurrence_) {
654  if (sub_match_info_.first) {
655  return RegexpSubstr::get_sub_match(match, sub_match_info_);
656  }
657  return NullableStrType(match[0]);
658  }
659  regex_matches.emplace_back(match[0]);
660  search_start =
661  match.suffix().first; // Move to position after last char of matched string
662  }
663  const int64_t wrapped_match = occurrence_ >= 0 ? occurrence_ : match_idx + occurrence_;
664  if (wrapped_match < 0 || wrapped_match >= match_idx) {
665  return NullableStrType();
666  }
667  if (sub_match_info_.first) {
668  return RegexpSubstr::get_sub_match(match, sub_match_info_);
669  }
670  return regex_matches[wrapped_match];
671 }
672 
673 std::string RegexpSubstr::get_sub_match(const boost::smatch& match,
674  const std::pair<bool, int64_t> sub_match_info) {
675  const int64_t num_sub_matches = match.size() - 1;
676  const int64_t wrapped_sub_match = sub_match_info.second >= 0
677  ? sub_match_info.second
678  : num_sub_matches + sub_match_info.second;
679  if (wrapped_sub_match < 0 || wrapped_sub_match >= num_sub_matches) {
680  return "";
681  }
682  return match[wrapped_sub_match + 1];
683 }
684 
685 std::pair<bool, int64_t> RegexpSubstr::set_sub_match_info(
686  const std::string& regex_pattern,
687  const int64_t sub_match_group_idx) {
688  if (regex_pattern.find("e", 0UL) == std::string::npos) {
689  return std::make_pair(false, 0UL);
690  }
691  return std::make_pair(
692  true, sub_match_group_idx > 0L ? sub_match_group_idx - 1 : sub_match_group_idx);
693 }
694 
695 NullableStrType RegexpCount::operator()(const std::string& str) const {
696  UNREACHABLE() << "Invalid string output for RegexpCount";
697  return {};
698 }
699 
700 Datum RegexpCount::numericEval(const std::string_view str_view) const {
701  if (str_view.empty()) {
702  return NullDatum(return_ti_);
703  }
704 
705  Datum return_datum;
706  const int64_t str_len = str_view.size();
707  const int64_t pos = start_pos_ < 0 ? str_len + start_pos_ : start_pos_;
708  const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len);
709  auto search_start = str_view.data() + wrapped_start;
710  auto search_end = str_view.data() + str_len;
711  boost::cregex_iterator iter(search_start, search_end, regex_pattern_);
712  boost::cregex_iterator end;
713 
714  int64_t num_matches = std::distance(iter, end);
715  return_datum.bigintval = num_matches;
716 
717  return return_datum;
718 }
719 
720 // json_path must start with "lax $", "strict $" or "$" (case-insensitive).
721 JsonValue::JsonParseMode JsonValue::parse_json_parse_mode(std::string_view json_path) {
722  size_t const string_pos = json_path.find('$');
723  if (string_pos == 0) {
724  // Parsing mode was not explicitly specified, default to PARSE_MODE_LAX
725  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
726  } else if (string_pos == std::string::npos) {
727  throw std::runtime_error("JSON search path must include a '$' literal.");
728  }
729  std::string_view const prefix = json_path.substr(0, string_pos);
730  if (boost::iequals(prefix, std::string_view("lax "))) {
731  return JsonValue::JsonParseMode::PARSE_MODE_LAX;
732  } else if (boost::iequals(prefix, std::string_view("strict "))) {
733  if constexpr (JsonValue::allow_strict_json_parsing) {
734  return JsonValue::JsonParseMode::PARSE_MODE_STRICT;
735  } else {
736  throw std::runtime_error("Strict parsing not currently supported for JSON_VALUE.");
737  }
738  } else {
739  throw std::runtime_error("Issue parsing JSON_VALUE Parse Mode.");
740  }
741 }
742 
// Parses the key sequence that follows the '$' in a JSON search path into a
// list of JsonKey entries, in path order.
//
// Three key forms are recognised by key_regex:
//   .name      object key: a letter followed by letters, digits, '_' or '-'
//   ."a name"  quoted object key: as above, but spaces are also allowed
//   [123]      array index: decimal digits, converted with std::stoi
//
// Throws std::runtime_error when the path has no '$', when a key is not found
// directly at the current position, when no key is found at all, or when
// unparsed text remains after the last key.
// NOTE(review): std::stoi can throw std::out_of_range for an index too large
// for int; that exception is not caught here — confirm callers handle it.
743 std::vector<JsonValue::JsonKey> JsonValue::parse_json_path(const std::string& json_path) {
744  // Assume that parse_key_error_mode validated strict/lax mode
745  size_t string_pos = json_path.find("$");
746  if (string_pos == std::string::npos) {
747  throw std::runtime_error("JSON search path must begin with '$' literal.");
748  }
749  string_pos += 1; // Go to next character after $
750 
751  // Use tildes to enclose escaped regex string due to embedded ')"'
  // The regex is heap-allocated once and deliberately never destroyed; the
  // static itself is only a reference to it.
752  static const auto& key_regex = *new boost::regex(
753  R"~(^(\.(([[:alpha:]][[:alnum:]_-]*)|"([[:alpha:]][ [:alnum:]_-]*)"))|\[([[:digit:]]+)\])~",
754  boost::regex_constants::extended | boost::regex_constants::optimize);
755  static_assert(std::is_trivially_destructible_v<decltype(key_regex)>);
756 
757  std::string::const_iterator search_start(json_path.cbegin() + string_pos);
758  boost::smatch match;
759  std::vector<JsonKey> json_keys;
  // Consume one key per iteration; string_pos tracks how much of json_path has
  // been accounted for so leftover text can be detected after the loop.
760  while (boost::regex_search(search_start, json_path.cend(), match, key_regex)) {
761  CHECK_EQ(match.size(), 6UL);
762  if (match.position(size_t(0)) != 0L) {
763  // Match wasn't found at beginning of string
764  throw std::runtime_error("JSON search path parsing error: '" + json_path + "'");
765  }
  // Capture groups 3, 4 and 5 hold the key text for the three key forms.
766  size_t matching_expr = 0;
767  if (match[3].matched) {
768  // simple object key
769  matching_expr = 3;
770  } else if (match[4].matched) {
771  // complex object key
772  matching_expr = 4;
773  } else if (match[5].matched) {
774  // array key
775  matching_expr = 5;
776  }
777  CHECK_GT(matching_expr, 0UL);
778  string_pos += match.length(0);
779 
780  const std::string key_match(match[matching_expr].first, match[matching_expr].second);
781  CHECK_GE(key_match.length(), 1UL);
  // Object keys always begin with a letter and array keys with a digit, so the
  // first character tells the two apart.
782  if (isalpha(key_match[0])) {
783  // Object key
784  json_keys.emplace_back(JsonKey(key_match));
785  } else {
786  // Array key
787  json_keys.emplace_back(JsonKey(std::stoi(key_match)));
788  }
789  search_start =
790  match.suffix().first; // Move to position after last char of matched string
791  }
792  if (json_keys.empty()) {
793  throw std::runtime_error("No keys found in JSON search path.");
794  }
795  if (string_pos < json_path.size()) {
796  throw std::runtime_error("JSON path parsing error.");
797  }
798  return json_keys;
799 }
800 
// Parses str as JSON, follows json_keys_ from the document root and returns
// the value found there as a string.
//
// Strings are returned as-is, numbers via std::to_string, booleans as "true" /
// "false", and a JSON null as a null NullableStrType.
// On a parse error, a missing key / out-of-range index, or a value of any
// other type, the result depends on JsonValue::allow_strict_json_parsing: when
// it is set the handle_parse_error / handle_key_error result is returned,
// otherwise a null NullableStrType.
801 NullableStrType JsonValue::operator()(const std::string& str) const {
802  rapidjson::Document document;
803  if (document.Parse(str.c_str()).HasParseError()) {
804  if constexpr (JsonValue::allow_strict_json_parsing) {
805  return handle_parse_error(str);
806  } else {
807  return NullableStrType();
808  }
809  }
  // json_val is a reference: each `json_val = json_val[...]` below assigns the
  // child value into the object it refers to (the document) rather than
  // rebinding the reference.
810  rapidjson::Value& json_val = document;
811  for (const auto& json_key : json_keys_) {
812  switch (json_key.key_kind) {
813  case JsonKeyKind::JSON_OBJECT: {
814  if (!json_val.IsObject() || !json_val.HasMember(json_key.object_key)) {
815  if constexpr (JsonValue::allow_strict_json_parsing) {
816  return handle_key_error(str);
817  } else {
818  return NullableStrType();
819  }
820  }
821  json_val = json_val[json_key.object_key];
822  break;
823  }
824  case JsonKeyKind::JSON_ARRAY: {
825  if (!json_val.IsArray() || json_val.Size() <= json_key.array_key) {
826  if constexpr (JsonValue::allow_strict_json_parsing) {
827  return handle_key_error(str);
828  } else {
829  return NullableStrType();
830  }
831  }
832  json_val = json_val[json_key.array_key];
833  break;
834  }
835  }
836  }
837  // Now get value as string
838  if (json_val.IsString()) {
839  return NullableStrType(std::string(json_val.GetString()));
840  } else if (json_val.IsNumber()) {
  // The double check comes first, so a value rapidjson reports as a double is
  // formatted by std::to_string(double), i.e. fixed notation with six decimals.
841  if (json_val.IsDouble()) {
842  return NullableStrType(std::to_string(json_val.GetDouble()));
843  } else if (json_val.IsInt64()) {
844  return NullableStrType(std::to_string(json_val.GetInt64()));
845  } else if (json_val.IsUint64()) {
846  // Need to cover the range of uint64 that can't fit in int64
847  return NullableStrType(std::to_string(json_val.GetUint64()));
848  } else {
849  // A bit defensive, as I'm fairly sure json does not
850  // support numeric types with widths > 64 bits, so may drop
851  if constexpr (JsonValue::allow_strict_json_parsing) {
852  return handle_key_error(str);
853  } else {
854  return NullableStrType();
855  }
856  }
857  } else if (json_val.IsBool()) {
858  return NullableStrType(std::string(json_val.IsTrue() ? "true" : "false"));
859  } else if (json_val.IsNull()) {
860  return NullableStrType();
861  } else {
862  // For any unhandled type - we may move this to a CHECK after gaining
863  // more confidence in prod
864  if constexpr (JsonValue::allow_strict_json_parsing) {
865  return handle_key_error(str);
866  } else {
867  return NullableStrType();
868  }
869  }
870 }
871 
// Returns the result of shared::encode_base64() applied to str.
872 NullableStrType Base64Encode::operator()(const std::string& str) const {
873  return shared::encode_base64(str);
874 }
875 
// Returns the result of shared::decode_base64() applied to str.
// NOTE(review): behaviour on input that is not valid base64 is decided by
// shared::decode_base64 and is not visible here.
876 NullableStrType Base64Decode::operator()(const std::string& str) const {
877  return shared::decode_base64(str);
878 }
879 
namespace {
// True iff c is an "unreserved" URL character, i.e. one that is encoded as
// itself: alphanumerics plus '-', '.', '_' and '~'.
// https://www.rfc-editor.org/rfc/rfc3986#section-2.3
//
// c is converted to unsigned char before std::isalnum is called: passing a
// negative char (any byte >= 0x80 where char is signed) is undefined behaviour.
bool is_normal(char const c) {
  return std::isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '.' ||
         c == '_' || c == '~';
}

// True iff c will be encoded into a single character: the unreserved
// characters, and space (which becomes '+').
bool is_singular(char const c) {
  return is_normal(c) || c == ' ';
}

// Count % chars that are eligible to begin a url-encoded triplet, i.e. that
// are followed by at least two more characters. The two characters after a
// counted '%' are skipped, so "%%41" contains one triplet, not two.
size_t count_percents(std::string const& str) {
  size_t n_percents = 0u;
  if (2u < str.size()) {
    for (size_t i = 0u; i < str.size() - 2u; ++i) {
      if (str[i] == '%') {
        ++n_percents;
        i += 2u;
      }
    }
  }
  return n_percents;
}

// If hex is a hex digit, return int value from 0-15. Otherwise undefined.
// Letters (anything at or above 'A') are upper-cased first so that 'a'-'f' and
// 'A'-'F' map to the same values.
int nibble(char const hex) {
  return 'A' <= hex ? std::toupper(static_cast<unsigned char>(hex)) + (10 - 'A')
                    : hex - '0';
}
}  // namespace
910 
911 // Encode unreserved characters (RFC 3986 Sec. 2.3) into themselves.
912 // Encode space ' ' into plus '+'.
913 // Encode all other characters c into "%XX" where XX is the hex value of c.
914 NullableStrType UrlEncode::operator()(const std::string& str) const {
915  constexpr char const* tr = "0123456789ABCDEF";
916  // Number of characters in string that will be copied/translated into a single char.
917  size_t const n_singular = std::count_if(str.begin(), str.end(), is_singular);
918  std::string encoded;
919  encoded.reserve(str.size() + 2u * (str.size() - n_singular));
920  for (char const c : str) {
921  if (is_normal(c)) {
922  encoded.append(1u, c);
923  } else if (c == ' ') {
924  encoded.append(1u, '+');
925  } else {
926  encoded.append(1u, '%');
927  encoded.append(1u, tr[(c >> 4) & 0xf]);
928  encoded.append(1u, tr[c & 0xf]);
929  }
930  }
931  return encoded;
932 }
933 
934 // Inverse of UrlEncode::operator(). Garbage in, garbage out, but must never segfault.
935 NullableStrType UrlDecode::operator()(const std::string& str) const {
936  size_t const n_percents = count_percents(str);
937  std::string decoded;
938  decoded.reserve(str.size() - 2u * n_percents);
939  for (size_t i = 0u; i < str.size(); ++i) {
940  if (str[i] == '%' && i + 2u < str.size()) {
941  decoded.append(1u, nibble(str[i + 1u]) << 4 ^ nibble(str[i + 2u]));
942  i += 2u; // Skip the two hexadecimal digits
943  } else if (str[i] == '+') {
944  decoded.append(1u, ' ');
945  } else { // Append normal characters, or % if one of last two characters.
946  decoded.append(1u, str[i]);
947  }
948  }
949  return decoded;
950 }
951 
952 std::string StringOps::operator()(const std::string& str) const {
953  NullableStrType modified_str(str);
954  if (modified_str.is_null) {
955  return ""; // How we currently represent dictionary-encoded nulls
956  }
957  for (const auto& string_op : string_ops_) {
958  modified_str = string_op->operator()(modified_str.str);
959  if (modified_str.is_null) {
960  return ""; // How we currently represent dictionary-encoded nulls
961  }
962  }
963  return modified_str.str;
964 }
965 
966 std::string StringOps::multi_input_eval(const std::string_view str1,
967  const std::string_view str2) const {
968  NullableStrType modified_str1(str1);
969  NullableStrType modified_str2(str2);
970  if (modified_str1.is_null || modified_str2.is_null) {
971  return ""; // How we currently represent dictionary-encoded nulls
972  }
973  for (const auto& string_op : string_ops_) {
974  modified_str1 = string_op->operator()(modified_str1.str, modified_str2.str);
975  if (modified_str1.is_null) {
976  return ""; // How we currently represent dictionary-encoded nulls
977  }
978  }
979  return modified_str1.str;
980 }
981 
982 std::string_view StringOps::operator()(const std::string_view sv,
983  std::string& sv_storage) const {
984  sv_storage = sv;
985  NullableStrType nullable_str(sv);
986  for (const auto& string_op : string_ops_) {
987  nullable_str = string_op->operator()(nullable_str.str);
988  if (nullable_str.is_null) {
989  return "";
990  }
991  }
992  sv_storage = nullable_str.str;
993  return sv_storage;
994 }
995 
996 Datum StringOps::numericEval(const std::string_view str) const {
997  const auto num_string_producing_ops = string_ops_.size() - 1;
998  if (num_string_producing_ops == 0UL) {
999  // Short circuit and avoid transformation to string if
1000  // only have one string->numeric op
1001  return string_ops_.back()->numericEval(str);
1002  }
1003  NullableStrType modified_str(str);
1004  for (size_t string_op_idx = 0; string_op_idx < num_string_producing_ops;
1005  ++string_op_idx) {
1006  const auto& string_op = string_ops_[string_op_idx];
1007  modified_str = string_op->operator()(modified_str.str);
1008  if (modified_str.is_null) {
1009  break;
1010  }
1011  }
1012  return string_ops_.back()->numericEval(modified_str.str);
1013 }
1014 
1015 Datum StringOps::numericEval(const std::string_view str1,
1016  const std::string_view str2) const {
1017  const auto num_string_producing_ops = string_ops_.size() - 1;
1018  // All string ops should be evaluated before invoking
1019  // numericEval with two non-literal string inputs, so
1020  // num string producing ops should be 0 here
1021  CHECK_EQ(num_string_producing_ops, 0UL);
1022  return string_ops_.back()->numericEval(str1, str2);
1023 }
1024 
1025 std::vector<std::unique_ptr<const StringOp>> StringOps::genStringOpsFromOpInfos(
1026  const std::vector<StringOpInfo>& string_op_infos) const {
1027  // Should we handle pure literal expressions here as well
1028  // even though they are currently rewritten to string literals?
1029  std::vector<std::unique_ptr<const StringOp>> string_ops;
1030  string_ops.reserve(string_op_infos.size());
1031  for (const auto& string_op_info : string_op_infos) {
1032  string_ops.emplace_back(gen_string_op(string_op_info));
1033  }
1034  return string_ops;
1035 }
1036 
1037 // Free functions follow
1038 
1039 std::unique_ptr<const StringOp> gen_string_op(const StringOpInfo& string_op_info) {
1040  std::optional<std::string> var_string_optional_literal;
1041  const auto op_kind = string_op_info.getOpKind();
1042  const auto& return_ti = string_op_info.getReturnType();
1043 
1044  if (string_op_info.hasNullLiteralArg()) {
1045  return std::make_unique<const NullOp>(
1046  return_ti, var_string_optional_literal, op_kind);
1047  }
1048 
1049  const auto num_non_variable_literals = string_op_info.numNonVariableLiterals();
1050  if (string_op_info.hasVarStringLiteral()) {
1051  CHECK_EQ(num_non_variable_literals + 1UL, string_op_info.numLiterals());
1052  var_string_optional_literal = string_op_info.getStringLiteral(0);
1053  }
1054 
1055  switch (op_kind) {
1056  case SqlStringOpKind::LOWER: {
1057  CHECK_EQ(num_non_variable_literals, 0UL);
1058  return std::make_unique<const Lower>(var_string_optional_literal);
1059  }
1060  case SqlStringOpKind::UPPER: {
1061  CHECK_EQ(num_non_variable_literals, 0UL);
1062  return std::make_unique<const Upper>(var_string_optional_literal);
1063  }
1064  case SqlStringOpKind::INITCAP: {
1065  CHECK_EQ(num_non_variable_literals, 0UL);
1066  return std::make_unique<const InitCap>(var_string_optional_literal);
1067  }
1068  case SqlStringOpKind::REVERSE: {
1069  CHECK_EQ(num_non_variable_literals, 0UL);
1070  return std::make_unique<const Reverse>(var_string_optional_literal);
1071  }
1072  case SqlStringOpKind::REPEAT: {
1073  CHECK_EQ(num_non_variable_literals, 1UL);
1074  const auto num_repeats_literal = string_op_info.getIntLiteral(1);
1075  return std::make_unique<const Repeat>(var_string_optional_literal,
1076  num_repeats_literal);
1077  }
1079  case SqlStringOpKind::RCONCAT: {
1080  CHECK_GE(num_non_variable_literals, 0UL);
1081  CHECK_LE(num_non_variable_literals, 1UL);
1082  if (num_non_variable_literals == 1UL) {
1083  const auto str_literal = string_op_info.getStringLiteral(1);
1084  // Handle lhs literals by having RCONCAT operator set a flag
1085  return std::make_unique<const Concat>(var_string_optional_literal,
1086  str_literal,
1087  op_kind == SqlStringOpKind::RCONCAT);
1088  } else {
1089  return std::make_unique<const Concat>(var_string_optional_literal);
1090  }
1091  }
1092  case SqlStringOpKind::LPAD:
1093  case SqlStringOpKind::RPAD: {
1094  CHECK_EQ(num_non_variable_literals, 2UL);
1095  const auto padded_length_literal = string_op_info.getIntLiteral(1);
1096  const auto padding_string_literal = string_op_info.getStringLiteral(2);
1097  return std::make_unique<Pad>(var_string_optional_literal,
1098  op_kind,
1099  padded_length_literal,
1100  padding_string_literal);
1101  }
1102  case SqlStringOpKind::TRIM:
1104  case SqlStringOpKind::RTRIM: {
1105  CHECK_EQ(num_non_variable_literals, 1UL);
1106  const auto trim_chars_literal = string_op_info.getStringLiteral(1);
1107  return std::make_unique<Trim>(
1108  var_string_optional_literal, op_kind, trim_chars_literal);
1109  }
1111  CHECK_GE(num_non_variable_literals, 1UL);
1112  CHECK_LE(num_non_variable_literals, 2UL);
1113  const auto start_pos_literal = string_op_info.getIntLiteral(1);
1114  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(2);
1115  if (has_length_literal) {
1116  const auto length_literal = string_op_info.getIntLiteral(2);
1117  return std::make_unique<const Substring>(
1118  var_string_optional_literal, start_pos_literal, length_literal);
1119  } else {
1120  return std::make_unique<const Substring>(var_string_optional_literal,
1121  start_pos_literal);
1122  }
1123  }
1124  case SqlStringOpKind::OVERLAY: {
1125  CHECK_GE(num_non_variable_literals, 2UL);
1126  CHECK_LE(num_non_variable_literals, 3UL);
1127  const auto replace_string_literal = string_op_info.getStringLiteral(1);
1128  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1129  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(3);
1130  if (has_length_literal) {
1131  const auto length_literal = string_op_info.getIntLiteral(3);
1132  return std::make_unique<const Overlay>(var_string_optional_literal,
1133  replace_string_literal,
1134  start_pos_literal,
1135  length_literal);
1136  } else {
1137  return std::make_unique<const Overlay>(
1138  var_string_optional_literal, replace_string_literal, start_pos_literal);
1139  }
1140  }
1141  case SqlStringOpKind::REPLACE: {
1142  CHECK_GE(num_non_variable_literals, 2UL);
1143  CHECK_LE(num_non_variable_literals, 2UL);
1144  const auto pattern_string_literal = string_op_info.getStringLiteral(1);
1145  const auto replacement_string_literal = string_op_info.getStringLiteral(2);
1146  return std::make_unique<const Replace>(var_string_optional_literal,
1147  pattern_string_literal,
1148  replacement_string_literal);
1149  }
1151  CHECK_GE(num_non_variable_literals, 2UL);
1152  CHECK_LE(num_non_variable_literals, 2UL);
1153  const auto delimiter_literal = string_op_info.getStringLiteral(1);
1154  const auto split_part_literal = string_op_info.getIntLiteral(2);
1155  return std::make_unique<const SplitPart>(
1156  var_string_optional_literal, delimiter_literal, split_part_literal);
1157  }
1159  CHECK_GE(num_non_variable_literals, 5UL);
1160  CHECK_LE(num_non_variable_literals, 5UL);
1161  const auto pattern_literal = string_op_info.getStringLiteral(1);
1162  const auto replacement_literal = string_op_info.getStringLiteral(2);
1163  const auto start_pos_literal = string_op_info.getIntLiteral(3);
1164  const auto occurrence_literal = string_op_info.getIntLiteral(4);
1165  const auto regex_params_literal = string_op_info.getStringLiteral(5);
1166  return std::make_unique<const RegexpReplace>(var_string_optional_literal,
1167  pattern_literal,
1168  replacement_literal,
1169  start_pos_literal,
1170  occurrence_literal,
1171  regex_params_literal);
1172  }
1174  CHECK_GE(num_non_variable_literals, 5UL);
1175  CHECK_LE(num_non_variable_literals, 5UL);
1176  const auto pattern_literal = string_op_info.getStringLiteral(1);
1177  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1178  const auto occurrence_literal = string_op_info.getIntLiteral(3);
1179  const auto regex_params_literal = string_op_info.getStringLiteral(4);
1180  const auto sub_match_idx_literal = string_op_info.getIntLiteral(5);
1181  return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
1182  pattern_literal,
1183  start_pos_literal,
1184  occurrence_literal,
1185  regex_params_literal,
1186  sub_match_idx_literal);
1187  }
1189  CHECK_GE(num_non_variable_literals, 3UL);
1190  CHECK_LE(num_non_variable_literals, 3UL);
1191  const auto pattern_literal = string_op_info.getStringLiteral(1);
1192  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1193  const auto regex_params_literal = string_op_info.getStringLiteral(3);
1194  return std::make_unique<const RegexpCount>(var_string_optional_literal,
1195  pattern_literal,
1196  start_pos_literal,
1197  regex_params_literal);
1198  }
1200  CHECK_EQ(num_non_variable_literals, 1UL);
1201  const auto json_path_literal = string_op_info.getStringLiteral(1);
1202  return std::make_unique<const JsonValue>(var_string_optional_literal,
1203  json_path_literal);
1204  }
1206  CHECK_EQ(num_non_variable_literals, 0UL);
1207  return std::make_unique<const Base64Encode>(var_string_optional_literal);
1208  }
1210  CHECK_EQ(num_non_variable_literals, 0UL);
1211  return std::make_unique<const Base64Decode>(var_string_optional_literal);
1212  }
1214  CHECK_EQ(num_non_variable_literals, 0UL);
1215  return std::make_unique<const UrlEncode>(var_string_optional_literal);
1216  }
1218  CHECK_EQ(num_non_variable_literals, 0UL);
1219  return std::make_unique<const UrlDecode>(var_string_optional_literal);
1220  }
1222  CHECK_EQ(num_non_variable_literals, 0UL);
1223  return std::make_unique<const TryStringCast>(return_ti,
1224  var_string_optional_literal);
1225  }
1227  CHECK_GE(num_non_variable_literals, 1UL);
1228  CHECK_LE(num_non_variable_literals, 2UL);
1229  const auto search_literal = string_op_info.getStringLiteral(1);
1230  const bool has_start_pos_literal = string_op_info.intLiteralArgAtIdxExists(2);
1231  if (has_start_pos_literal) {
1232  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1233  return std::make_unique<const Position>(
1234  var_string_optional_literal, search_literal, start_pos_literal);
1235  } else {
1236  return std::make_unique<const Position>(var_string_optional_literal,
1237  search_literal);
1238  }
1239  }
1241  CHECK_GE(num_non_variable_literals, 0UL);
1242  CHECK_LE(num_non_variable_literals, 1UL);
1243  if (num_non_variable_literals == 1UL) {
1244  const auto str_literal = string_op_info.getStringLiteral(1);
1245  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal,
1246  str_literal);
1247  } else {
1248  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal);
1249  }
1250  }
1252  CHECK_GE(num_non_variable_literals, 0UL);
1253  CHECK_LE(num_non_variable_literals, 1UL);
1254  if (num_non_variable_literals == 1UL) {
1255  const auto str_literal = string_op_info.getStringLiteral(1);
1256  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal,
1257  str_literal);
1258  } else {
1259  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal);
1260  }
1261  }
1262  case SqlStringOpKind::HASH: {
1263  CHECK_EQ(num_non_variable_literals, 0UL);
1264  return std::make_unique<const Hash>(var_string_optional_literal);
1265  }
1266  default:
1267  UNREACHABLE();
1268  return {};
1269  }
1270 }
1271 
1272 std::pair<std::string, bool /* is null */> apply_string_op_to_literals(
1273  const StringOpInfo& string_op_info) {
1274  CHECK(string_op_info.hasVarStringLiteral());
1275  if (string_op_info.hasNullLiteralArg()) {
1276  const std::string null_str{""};
1277  return std::make_pair(null_str, true);
1278  }
1279  const auto string_op = gen_string_op(string_op_info);
1280  return string_op->operator()().toPair();
1281 }
1282 
1284  CHECK(string_op_info.hasVarStringLiteral());
1285  const auto string_op = gen_string_op(string_op_info);
1286  return string_op->numericEval();
1287 }
1288 
1289 } // namespace StringOps_Namespace
#define CHECK_EQ(x, y)
Definition: Logger.h:301
const SQLTypeInfo & getReturnType() const
Definition: StringOpInfo.h:58
Datum apply_numeric_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:1283
T compute_levenshtein_distance_template(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:255
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
SqlStringOpKind
Definition: sqldefs.h:92
bool intLiteralArgAtIdxExists(const size_t index) const
double compute_jaro_winkler_score(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:185
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::string to_string(char const *&&v)
int64_t getIntLiteral(const size_t index) const
size_t numNonVariableLiterals() const
Definition: StringOpInfo.h:54
int64_t bigintval
Definition: Datum.h:76
std::pair< std::string, bool > apply_string_op_to_literals(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:1272
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:339
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:329
Datum NullDatum(const SQLTypeInfo &ti)
Definition: Datum.cpp:288
constexpr int winkler_k_prefix_length
Definition: StringOps.cpp:122
#define CHECK_LE(x, y)
Definition: Logger.h:304
double compute_jaro_score(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:127
int64_t compute_levenshtein_distance(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:278
std::string getStringLiteral(const size_t index) const
#define CHECK(condition)
Definition: Logger.h:291
std::string decode_base64(const std::string &val, bool trim_nulls)
Definition: base64.h:27
static std::string encode_base64(const std::string &val)
Definition: base64.h:45
const SqlStringOpKind & getOpKind() const
Definition: StringOpInfo.h:42
constexpr double n
Definition: Utm.h:38
constexpr double winkler_k_scaling_factor
Definition: StringOps.cpp:125
Definition: Datum.h:71
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:1039