OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringOps_Namespace Namespace Reference

Namespaces

 anonymous_namespace{StringOps.cpp}
 

Classes

struct  StringOpInfo
 

Typedefs

using LiteralArgMap = std::map< size_t, std::pair< SQLTypes, Datum >>
 

Functions

std::ostream & operator<< (std::ostream &stream, const StringOpInfo &string_op_info)
 
std::ostream & operator<< (std::ostream &stream, const std::vector< StringOpInfo > &string_op_infos)
 
std::string toString (const std::vector< StringOpInfo > &string_op_infos)
 
double compute_jaro_score (std::string_view s1, std::string_view s2)
 
double compute_jaro_winkler_score (std::string_view s1, std::string_view s2)
 
template<typename T >
T compute_levenshtein_distance_template (std::string_view s1, std::string_view s2)
 
int64_t compute_levenshtein_distance (std::string_view s1, std::string_view s2)
 
std::unique_ptr< const StringOp > gen_string_op (const StringOpInfo &string_op_info)
 
std::pair< std::string, bool > apply_string_op_to_literals (const StringOpInfo &string_op_info)
 
Datum apply_numeric_op_to_literals (const StringOpInfo &string_op_info)
 

Variables

constexpr int winkler_k_prefix_length = 4
 
constexpr double winkler_k_scaling_factor = 0.1
 

Typedef Documentation

using StringOps_Namespace::LiteralArgMap = typedef std::map<size_t, std::pair<SQLTypes, Datum>>

Definition at line 30 of file StringOpInfo.h.

Function Documentation

Datum StringOps_Namespace::apply_numeric_op_to_literals ( const StringOpInfo &  string_op_info)

Definition at line 1283 of file StringOps.cpp.

References CHECK, gen_string_op(), and StringOps_Namespace::StringOpInfo::hasVarStringLiteral().

Referenced by anonymous_namespace{ExpressionRewrite.cpp}::ConstantFoldingVisitor::visitStringOper().

1283  {
1284  CHECK(string_op_info.hasVarStringLiteral());
1285  const auto string_op = gen_string_op(string_op_info);
1286  return string_op->numericEval();
1287 }
#define CHECK(condition)
Definition: Logger.h:291
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:1039

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::pair<std::string, bool > StringOps_Namespace::apply_string_op_to_literals ( const StringOpInfo &  string_op_info)

Definition at line 1272 of file StringOps.cpp.

References CHECK, gen_string_op(), StringOps_Namespace::StringOpInfo::hasNullLiteralArg(), and StringOps_Namespace::StringOpInfo::hasVarStringLiteral().

Referenced by TransientStringLiteralsVisitor::visitStringOper(), and anonymous_namespace{ExpressionRewrite.cpp}::ConstantFoldingVisitor::visitStringOper().

1273  {
1274  CHECK(string_op_info.hasVarStringLiteral());
1275  if (string_op_info.hasNullLiteralArg()) {
1276  const std::string null_str{""};
1277  return std::make_pair(null_str, true);
1278  }
1279  const auto string_op = gen_string_op(string_op_info);
1280  return string_op->operator()().toPair();
1281 }
#define CHECK(condition)
Definition: Logger.h:291
std::unique_ptr< const StringOp > gen_string_op(const StringOpInfo &string_op_info)
Definition: StringOps.cpp:1039

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

double StringOps_Namespace::compute_jaro_score ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 127 of file StringOps.cpp.

References score.

Referenced by compute_jaro_winkler_score().

/// Computes the Jaro similarity of two strings.
///
/// @param s1 first string
/// @param s2 second string
/// @return 0.0 when either string is empty or no characters match; otherwise
///         the mean of (matches / |s1|), (matches / |s2|) and
///         ((matches - transpositions / 2) / matches).
double compute_jaro_score(std::string_view s1, std::string_view s2) {
  const int s1_len = static_cast<int>(s1.size());
  const int s2_len = static_cast<int>(s2.size());

  if (s1_len == 0 || s2_len == 0) {
    return 0.0;
  }

  // Characters match only if they are equal and at most `match_distance`
  // positions apart. The raw formula yields -1 when both strings have length
  // 1, which would make the search window empty and score identical
  // single-character strings as 0, so the distance is clamped to zero.
  const int match_distance = std::max(0, std::max(s1_len, s2_len) / 2 - 1);
  std::vector<bool> s1_match(s1_len, false);
  std::vector<bool> s2_match(s2_len, false);

  int matches = 0;

  // Pass 1: greedily pair each character of s1 with the first unused equal
  // character of s2 inside the window.
  for (int i = 0; i < s1_len; ++i) {
    const int start = std::max(0, i - match_distance);
    const int end = std::min(i + match_distance + 1, s2_len);

    for (int j = start; j < end; ++j) {
      if (s2_match[j] || s1[i] != s2[j]) {
        continue;
      }
      s1_match[i] = true;
      s2_match[j] = true;
      ++matches;
      break;
    }
  }

  if (matches == 0) {
    return 0.0;
  }

  // Pass 2: walk the matched characters of both strings in order and count
  // the positions at which they differ.
  int transpositions = 0;
  int k = 0;
  for (int i = 0; i < s1_len; ++i) {
    if (!s1_match[i]) {
      continue;
    }
    // Both strings have exactly `matches` flagged characters, so a flagged
    // position in s2 always exists here.
    while (!s2_match[k]) {
      ++k;
    }
    if (s1[i] != s2[k]) {
      ++transpositions;
    }
    ++k;
  }

  const double score = ((matches / static_cast<double>(s1_len)) +
                        (matches / static_cast<double>(s2_len)) +
                        ((matches - transpositions / 2.0) / matches)) /
                       3.0;

  return score;
}

+ Here is the caller graph for this function:

double StringOps_Namespace::compute_jaro_winkler_score ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 185 of file StringOps.cpp.

References Datum::bigintval, compute_jaro_score(), anonymous_namespace{Utm.h}::n, NullDatum(), UNREACHABLE, winkler_k_prefix_length, and winkler_k_scaling_factor.

185  {
186  double jaro_score = compute_jaro_score(s1, s2);
187 
188  int l = 0;
189  int n = std::min({static_cast<int>(s1.size()),
190  static_cast<int>(s2.size()),
192 
193  for (; l < n; ++l) {
194  if (s1[l] != s2[l]) {
195  break;
196  }
197  }
198 
199  double winkler_adjustment = l * winkler_k_scaling_factor * (1 - jaro_score);
200  double jaro_winkler_score = jaro_score + winkler_adjustment;
201 
202  return jaro_winkler_score * 100;
203 }
constexpr int winkler_k_prefix_length
Definition: StringOps.cpp:122
double compute_jaro_score(std::string_view s1, std::string_view s2)
Definition: StringOps.cpp:127
constexpr double n
Definition: Utm.h:38
constexpr double winkler_k_scaling_factor
Definition: StringOps.cpp:125

+ Here is the call graph for this function:

int64_t StringOps_Namespace::compute_levenshtein_distance ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 278 of file StringOps.cpp.

278  {
279  const size_t max_len = std::max(s1.size(), s2.size());
280 
281  if (max_len < 256) {
282  return compute_levenshtein_distance_template<uint8_t>(s1, s2);
283  } else if (max_len < 65536) {
284  return compute_levenshtein_distance_template<uint16_t>(s1, s2);
285  } else if (max_len < std::numeric_limits<uint32_t>::max()) {
286  return compute_levenshtein_distance_template<uint32_t>(s1, s2);
287  } else {
288  return compute_levenshtein_distance_template<uint64_t>(s1, s2);
289  }
290 }
template<typename T >
T StringOps_Namespace::compute_levenshtein_distance_template ( std::string_view  s1,
std::string_view  s2 
)

Definition at line 255 of file StringOps.cpp.

/// Computes the Levenshtein (edit) distance between `s1` and `s2` using
/// cells of type `T`.
///
/// Only two rows of the dynamic-programming table are kept, because each cell
/// depends solely on the previous row and on the cell to its left. Memory is
/// therefore proportional to `s2.size()` rather than to
/// `s1.size() * s2.size()`.
///
/// @tparam T unsigned integer type used for the table cells; the caller must
///           pick a type able to hold max(s1.size(), s2.size())
/// @param s1 first string
/// @param s2 second string
/// @return number of single-character insertions, deletions and
///         substitutions needed to turn `s1` into `s2`
template <typename T>
T compute_levenshtein_distance_template(std::string_view s1, std::string_view s2) {
  const size_t len1 = s1.size();
  const size_t len2 = s2.size();

  // prev = row i-1, curr = row i. Row 0 is the distance from the empty prefix
  // of s1 to each prefix of s2, i.e. j insertions.
  std::vector<T> prev(len2 + 1);
  std::vector<T> curr(len2 + 1);
  for (size_t j = 0; j <= len2; ++j) {
    prev[j] = static_cast<T>(j);
  }

  for (size_t i = 1; i <= len1; ++i) {
    // Column 0: i deletions turn the first i characters of s1 into "".
    curr[0] = static_cast<T>(i);
    for (size_t j = 1; j <= len2; ++j) {
      const T substitution_cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
      curr[j] = static_cast<T>(std::min({prev[j] + 1,        // deletion
                                         curr[j - 1] + 1,    // insertion
                                         prev[j - 1] + substitution_cost}));
    }
    std::swap(prev, curr);
  }

  // After the final swap (or with no iterations at all) `prev` is the last
  // completed row.
  return prev[len2];
}
std::unique_ptr<const StringOp> StringOps_Namespace::gen_string_op ( const StringOpInfo &  string_op_info)

Definition at line 1039 of file StringOps.cpp.

References BASE64_DECODE, BASE64_ENCODE, CHECK_EQ, CHECK_GE, CHECK_LE, CONCAT, StringOps_Namespace::StringOpInfo::getIntLiteral(), StringOps_Namespace::StringOpInfo::getOpKind(), StringOps_Namespace::StringOpInfo::getReturnType(), StringOps_Namespace::StringOpInfo::getStringLiteral(), HASH, StringOps_Namespace::StringOpInfo::hasNullLiteralArg(), StringOps_Namespace::StringOpInfo::hasVarStringLiteral(), INITCAP, StringOps_Namespace::StringOpInfo::intLiteralArgAtIdxExists(), JAROWINKLER_SIMILARITY, JSON_VALUE, LEVENSHTEIN_DISTANCE, LOWER, LPAD, LTRIM, StringOps_Namespace::StringOpInfo::numLiterals(), StringOps_Namespace::StringOpInfo::numNonVariableLiterals(), OVERLAY, POSITION, RCONCAT, REGEXP_COUNT, REGEXP_REPLACE, REGEXP_SUBSTR, REPEAT, REPLACE, REVERSE, RPAD, RTRIM, SPLIT_PART, SUBSTRING, TRIM, TRY_STRING_CAST, UNREACHABLE, UPPER, URL_DECODE, and URL_ENCODE.

Referenced by apply_numeric_op_to_literals(), and apply_string_op_to_literals().

1039  {
1040  std::optional<std::string> var_string_optional_literal;
1041  const auto op_kind = string_op_info.getOpKind();
1042  const auto& return_ti = string_op_info.getReturnType();
1043 
1044  if (string_op_info.hasNullLiteralArg()) {
1045  return std::make_unique<const NullOp>(
1046  return_ti, var_string_optional_literal, op_kind);
1047  }
1048 
1049  const auto num_non_variable_literals = string_op_info.numNonVariableLiterals();
1050  if (string_op_info.hasVarStringLiteral()) {
1051  CHECK_EQ(num_non_variable_literals + 1UL, string_op_info.numLiterals());
1052  var_string_optional_literal = string_op_info.getStringLiteral(0);
1053  }
1054 
1055  switch (op_kind) {
1056  case SqlStringOpKind::LOWER: {
1057  CHECK_EQ(num_non_variable_literals, 0UL);
1058  return std::make_unique<const Lower>(var_string_optional_literal);
1059  }
1060  case SqlStringOpKind::UPPER: {
1061  CHECK_EQ(num_non_variable_literals, 0UL);
1062  return std::make_unique<const Upper>(var_string_optional_literal);
1063  }
1064  case SqlStringOpKind::INITCAP: {
1065  CHECK_EQ(num_non_variable_literals, 0UL);
1066  return std::make_unique<const InitCap>(var_string_optional_literal);
1067  }
1068  case SqlStringOpKind::REVERSE: {
1069  CHECK_EQ(num_non_variable_literals, 0UL);
1070  return std::make_unique<const Reverse>(var_string_optional_literal);
1071  }
1072  case SqlStringOpKind::REPEAT: {
1073  CHECK_EQ(num_non_variable_literals, 1UL);
1074  const auto num_repeats_literal = string_op_info.getIntLiteral(1);
1075  return std::make_unique<const Repeat>(var_string_optional_literal,
1076  num_repeats_literal);
1077  }
1079  case SqlStringOpKind::RCONCAT: {
1080  CHECK_GE(num_non_variable_literals, 0UL);
1081  CHECK_LE(num_non_variable_literals, 1UL);
1082  if (num_non_variable_literals == 1UL) {
1083  const auto str_literal = string_op_info.getStringLiteral(1);
1084  // Handle lhs literals by having RCONCAT operator set a flag
1085  return std::make_unique<const Concat>(var_string_optional_literal,
1086  str_literal,
1087  op_kind == SqlStringOpKind::RCONCAT);
1088  } else {
1089  return std::make_unique<const Concat>(var_string_optional_literal);
1090  }
1091  }
1092  case SqlStringOpKind::LPAD:
1093  case SqlStringOpKind::RPAD: {
1094  CHECK_EQ(num_non_variable_literals, 2UL);
1095  const auto padded_length_literal = string_op_info.getIntLiteral(1);
1096  const auto padding_string_literal = string_op_info.getStringLiteral(2);
1097  return std::make_unique<Pad>(var_string_optional_literal,
1098  op_kind,
1099  padded_length_literal,
1100  padding_string_literal);
1101  }
1102  case SqlStringOpKind::TRIM:
1104  case SqlStringOpKind::RTRIM: {
1105  CHECK_EQ(num_non_variable_literals, 1UL);
1106  const auto trim_chars_literal = string_op_info.getStringLiteral(1);
1107  return std::make_unique<Trim>(
1108  var_string_optional_literal, op_kind, trim_chars_literal);
1109  }
1111  CHECK_GE(num_non_variable_literals, 1UL);
1112  CHECK_LE(num_non_variable_literals, 2UL);
1113  const auto start_pos_literal = string_op_info.getIntLiteral(1);
1114  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(2);
1115  if (has_length_literal) {
1116  const auto length_literal = string_op_info.getIntLiteral(2);
1117  return std::make_unique<const Substring>(
1118  var_string_optional_literal, start_pos_literal, length_literal);
1119  } else {
1120  return std::make_unique<const Substring>(var_string_optional_literal,
1121  start_pos_literal);
1122  }
1123  }
1124  case SqlStringOpKind::OVERLAY: {
1125  CHECK_GE(num_non_variable_literals, 2UL);
1126  CHECK_LE(num_non_variable_literals, 3UL);
1127  const auto replace_string_literal = string_op_info.getStringLiteral(1);
1128  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1129  const bool has_length_literal = string_op_info.intLiteralArgAtIdxExists(3);
1130  if (has_length_literal) {
1131  const auto length_literal = string_op_info.getIntLiteral(3);
1132  return std::make_unique<const Overlay>(var_string_optional_literal,
1133  replace_string_literal,
1134  start_pos_literal,
1135  length_literal);
1136  } else {
1137  return std::make_unique<const Overlay>(
1138  var_string_optional_literal, replace_string_literal, start_pos_literal);
1139  }
1140  }
1141  case SqlStringOpKind::REPLACE: {
1142  CHECK_GE(num_non_variable_literals, 2UL);
1143  CHECK_LE(num_non_variable_literals, 2UL);
1144  const auto pattern_string_literal = string_op_info.getStringLiteral(1);
1145  const auto replacement_string_literal = string_op_info.getStringLiteral(2);
1146  return std::make_unique<const Replace>(var_string_optional_literal,
1147  pattern_string_literal,
1148  replacement_string_literal);
1149  }
1151  CHECK_GE(num_non_variable_literals, 2UL);
1152  CHECK_LE(num_non_variable_literals, 2UL);
1153  const auto delimiter_literal = string_op_info.getStringLiteral(1);
1154  const auto split_part_literal = string_op_info.getIntLiteral(2);
1155  return std::make_unique<const SplitPart>(
1156  var_string_optional_literal, delimiter_literal, split_part_literal);
1157  }
1159  CHECK_GE(num_non_variable_literals, 5UL);
1160  CHECK_LE(num_non_variable_literals, 5UL);
1161  const auto pattern_literal = string_op_info.getStringLiteral(1);
1162  const auto replacement_literal = string_op_info.getStringLiteral(2);
1163  const auto start_pos_literal = string_op_info.getIntLiteral(3);
1164  const auto occurrence_literal = string_op_info.getIntLiteral(4);
1165  const auto regex_params_literal = string_op_info.getStringLiteral(5);
1166  return std::make_unique<const RegexpReplace>(var_string_optional_literal,
1167  pattern_literal,
1168  replacement_literal,
1169  start_pos_literal,
1170  occurrence_literal,
1171  regex_params_literal);
1172  }
1174  CHECK_GE(num_non_variable_literals, 5UL);
1175  CHECK_LE(num_non_variable_literals, 5UL);
1176  const auto pattern_literal = string_op_info.getStringLiteral(1);
1177  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1178  const auto occurrence_literal = string_op_info.getIntLiteral(3);
1179  const auto regex_params_literal = string_op_info.getStringLiteral(4);
1180  const auto sub_match_idx_literal = string_op_info.getIntLiteral(5);
1181  return std::make_unique<const RegexpSubstr>(var_string_optional_literal,
1182  pattern_literal,
1183  start_pos_literal,
1184  occurrence_literal,
1185  regex_params_literal,
1186  sub_match_idx_literal);
1187  }
1189  CHECK_GE(num_non_variable_literals, 3UL);
1190  CHECK_LE(num_non_variable_literals, 3UL);
1191  const auto pattern_literal = string_op_info.getStringLiteral(1);
1192  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1193  const auto regex_params_literal = string_op_info.getStringLiteral(3);
1194  return std::make_unique<const RegexpCount>(var_string_optional_literal,
1195  pattern_literal,
1196  start_pos_literal,
1197  regex_params_literal);
1198  }
1200  CHECK_EQ(num_non_variable_literals, 1UL);
1201  const auto json_path_literal = string_op_info.getStringLiteral(1);
1202  return std::make_unique<const JsonValue>(var_string_optional_literal,
1203  json_path_literal);
1204  }
1206  CHECK_EQ(num_non_variable_literals, 0UL);
1207  return std::make_unique<const Base64Encode>(var_string_optional_literal);
1208  }
1210  CHECK_EQ(num_non_variable_literals, 0UL);
1211  return std::make_unique<const Base64Decode>(var_string_optional_literal);
1212  }
1214  CHECK_EQ(num_non_variable_literals, 0UL);
1215  return std::make_unique<const UrlEncode>(var_string_optional_literal);
1216  }
1218  CHECK_EQ(num_non_variable_literals, 0UL);
1219  return std::make_unique<const UrlDecode>(var_string_optional_literal);
1220  }
1222  CHECK_EQ(num_non_variable_literals, 0UL);
1223  return std::make_unique<const TryStringCast>(return_ti,
1224  var_string_optional_literal);
1225  }
1227  CHECK_GE(num_non_variable_literals, 1UL);
1228  CHECK_LE(num_non_variable_literals, 2UL);
1229  const auto search_literal = string_op_info.getStringLiteral(1);
1230  const bool has_start_pos_literal = string_op_info.intLiteralArgAtIdxExists(2);
1231  if (has_start_pos_literal) {
1232  const auto start_pos_literal = string_op_info.getIntLiteral(2);
1233  return std::make_unique<const Position>(
1234  var_string_optional_literal, search_literal, start_pos_literal);
1235  } else {
1236  return std::make_unique<const Position>(var_string_optional_literal,
1237  search_literal);
1238  }
1239  }
1241  CHECK_GE(num_non_variable_literals, 0UL);
1242  CHECK_LE(num_non_variable_literals, 1UL);
1243  if (num_non_variable_literals == 1UL) {
1244  const auto str_literal = string_op_info.getStringLiteral(1);
1245  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal,
1246  str_literal);
1247  } else {
1248  return std::make_unique<const JarowinklerSimilarity>(var_string_optional_literal);
1249  }
1250  }
1252  CHECK_GE(num_non_variable_literals, 0UL);
1253  CHECK_LE(num_non_variable_literals, 1UL);
1254  if (num_non_variable_literals == 1UL) {
1255  const auto str_literal = string_op_info.getStringLiteral(1);
1256  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal,
1257  str_literal);
1258  } else {
1259  return std::make_unique<const LevenshteinDistance>(var_string_optional_literal);
1260  }
1261  }
1262  case SqlStringOpKind::HASH: {
1263  CHECK_EQ(num_non_variable_literals, 0UL);
1264  return std::make_unique<const Hash>(var_string_optional_literal);
1265  }
1266  default:
1267  UNREACHABLE();
1268  return {};
1269  }
1270 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_LE(x, y)
Definition: Logger.h:304

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

std::ostream& StringOps_Namespace::operator<< ( std::ostream &  stream,
const StringOpInfo &  string_op_info 
)

Definition at line 24 of file StringOpInfo.cpp.

References CHECK, extract_int_type_from_datum(), SQLTypeInfo::get_dimension(), SQLTypeInfo::get_scale(), SQLTypeInfo::get_type(), StringOps_Namespace::StringOpInfo::getOpKind(), StringOps_Namespace::StringOpInfo::getReturnType(), IS_INTEGER, IS_STRING, StringOps_Namespace::StringOpInfo::isLiteralArgNull(), StringOps_Namespace::StringOpInfo::literal_arg_map_, and toString().

24  {
25  stream << "StringOp("
26  << "operator: " << string_op_info.getOpKind()
27  << "return_ti: " << toString(string_op_info.getReturnType().get_type())
28  << " dim: " << string_op_info.getReturnType().get_dimension()
29  << " scale: " << string_op_info.getReturnType().get_scale() << ", literals: [";
30  bool first_elem = true;
31  for (const auto& literal_arg : string_op_info.literal_arg_map_) {
32  if (!first_elem) {
33  stream << ", ";
34  }
35  first_elem = false;
36  const auto datum_type = literal_arg.second.first;
37  const auto& datum = literal_arg.second.second;
38  stream << "{slot: " << literal_arg.first /* slot/idx */ << ", type: "
39  << ::toString(datum_type) << ", value: ";
40  if (string_op_info.isLiteralArgNull(datum_type, literal_arg.second.second)) {
41  stream << "NULL";
42  } else if (IS_STRING(datum_type)) {
43  stream << *datum.stringval;
44  } else {
45  CHECK(IS_INTEGER(datum_type));
46  const SQLTypeInfo ti(datum_type, false);
47  stream << extract_int_type_from_datum(datum, ti);
48  }
49  stream << "}";
50  }
51  stream << "]";
52  return stream;
53 }
int64_t extract_int_type_from_datum(const Datum datum, const SQLTypeInfo &ti)
Definition: Datum.cpp:523
std::string toString(const Executor::ExtModuleKinds &kind)
Definition: Execute.h:1703
#define IS_INTEGER(T)
Definition: sqltypes.h:304
#define IS_STRING(T)
Definition: sqltypes.h:309
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

std::ostream & StringOps_Namespace::operator<< ( std::ostream &  stream,
const std::vector< StringOpInfo > &  string_op_infos 
)

Definition at line 55 of file StringOpInfo.cpp.

56  {
57  stream << "[";
58  bool first_elem = true;
59  for (const auto& string_op_info : string_op_infos) {
60  if (!first_elem) {
61  stream << ", ";
62  }
63  first_elem = false;
64  stream << string_op_info;
65  }
66  stream << "]";
67  return stream;
68 }
std::string StringOps_Namespace::toString ( const std::vector< StringOpInfo > &  string_op_infos)

Definition at line 70 of file StringOpInfo.cpp.

Referenced by operator<<().

70  {
71  std::ostringstream oss;
72  oss << string_op_infos;
73  return oss.str();
74 }

+ Here is the caller graph for this function:

Variable Documentation

constexpr int StringOps_Namespace::winkler_k_prefix_length = 4

Definition at line 122 of file StringOps.cpp.

Referenced by compute_jaro_winkler_score().

constexpr double StringOps_Namespace::winkler_k_scaling_factor = 0.1

Definition at line 125 of file StringOps.cpp.

Referenced by compute_jaro_winkler_score().