OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
StringTransform.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "StringTransform.h"
18 #include "Logger/Logger.h"
19 
20 #include <numeric>
21 #include <random>
22 #include <regex>
23 
24 #include <cmath> // format_bytes round call
25 
26 #ifndef __CUDACC__
27 #include <boost/filesystem.hpp>
28 #include <iomanip>
29 #endif
30 
31 void apply_shim(std::string& result,
32  const boost::regex& reg_expr,
33  const std::function<void(std::string&, const boost::smatch&)>& shim_fn) {
34  boost::smatch what;
35  auto lit_pos = find_string_literals(result);
36  auto start_it = result.cbegin();
37  auto end_it = result.cend();
38  while (true) {
39  if (!boost::regex_search(start_it, end_it, what, reg_expr)) {
40  break;
41  }
42  const auto next_start =
43  inside_string_literal(what.position(), what.length(), lit_pos);
44  if (next_start) {
45  start_it = result.cbegin() + *next_start;
46  } else {
47  shim_fn(result, what);
48  lit_pos = find_string_literals(result);
49  start_it = result.cbegin();
50  end_it = result.cend();
51  }
52  }
53 }
54 
55 // Scan query and save all single-quoted string literals as [begin,end) index pairs into
56 // lit_pos, including the surrounding quotes.
57 std::vector<std::pair<size_t, size_t>> find_string_literals(const std::string& query) {
58  boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')*'))", boost::regex::perl};
59  boost::smatch what;
60  auto it = query.begin();
61  auto prev_it = it;
62  std::vector<std::pair<size_t, size_t>> positions;
63  while (true) {
64  try {
65  if (!boost::regex_search(it, query.end(), what, literal_string_regex)) {
66  break;
67  }
68  } catch (const std::exception& e) {
69  // boost::regex throws an exception about the complexity of matching when
70  // the wrong type of quotes are used or they're mismatched. Let the query
71  // through unmodified, the parser will throw a much more informative error.
72  // This can also throw on very long queries
73  std::ostringstream oss;
74  oss << "Detecting an error while processing string literal regex search: "
75  << e.what();
76  throw std::runtime_error(oss.str());
77  }
78  CHECK_GT(what[1].length(), 0);
79  prev_it = it;
80  it += what.length();
81  positions.emplace_back(prev_it + what[1].length() - query.begin(),
82  it - query.begin());
83  }
84  return positions;
85 }
86 
// Returns a copy of query_str in which sensitive values are replaced by XXXXXXXX:
//  - the quoted value of password, s3_access_key, s3_secret_key, s3_session_token,
//    username and credential_string assignments (key = '...'), and
//  - the argument of a \set_license command.
// Keys are matched case-insensitively. Text without such patterns is returned unchanged.
std::string hide_sensitive_data_from_query(std::string const& query_str) {
  constexpr std::regex::flag_type flags =
      std::regex::ECMAScript | std::regex::icase | std::regex::optimize;
  // Compiled once; each entry is (pattern, replacement format).
  static const std::initializer_list<std::pair<std::regex, std::string>> rules{
      {std::regex(
           R"(\b((?:password|s3_access_key|s3_secret_key|s3_session_token|username|credential_string)\s*=\s*)'.+?')",
           flags),
       "$1'XXXXXXXX'"},
      {std::regex(R"((\\set_license\s+)\S+)", flags), "$1XXXXXXXX"}};
  // Apply the rules one after another. A plain loop avoids passing the accumulator
  // through std::accumulate, which hands it over as an rvalue since C++20 and would not
  // bind to a non-const lvalue reference parameter.
  std::string masked{query_str};
  for (const auto& rule : rules) {
    masked = std::regex_replace(masked, rule.first, rule.second);
  }
  return masked;
}
101 
// Formats a byte count for display using binary (1024-based) units.
//
// Values below 1024 are printed as "<n> bytes". Larger values are scaled to the largest
// unit of KB, MB, GB, TB or PB that keeps the integral part non-zero and printed as
// "<whole>.<fraction> <unit>", where the fraction is the remainder truncated to
// hundredths ("0" when the remainder is below one hundredth, otherwise two digits).
// Values of 1024 PB and above are still expressed in PB.
std::string format_num_bytes(const size_t bytes) {
  constexpr size_t units_per_k_unit{1024};
  static constexpr const char* scaled_units[] = {"KB", "MB", "GB", "TB", "PB"};
  constexpr size_t max_scale = sizeof(scaled_units) / sizeof(scaled_units[0]);
  if (bytes < units_per_k_unit) {
    return std::to_string(bytes) + " bytes";
  }
  // Determine the scale with integer arithmetic; a floating point log ratio can be off
  // by one at exact powers of 1024. The scale is capped at the largest named unit.
  size_t byte_scale{1};
  size_t bytes_per_scale_unit{units_per_k_unit};
  while (byte_scale < max_scale && bytes / bytes_per_scale_unit >= units_per_k_unit) {
    bytes_per_scale_unit *= units_per_k_unit;
    ++byte_scale;
  }
  const size_t scaled_bytes_left_of_decimal = bytes / bytes_per_scale_unit;
  const size_t scaled_bytes_right_of_decimal = bytes % bytes_per_scale_unit;
  // The remainder is below 2^50, so multiplying by 100 cannot overflow 64 bits.
  const unsigned long long hundredths =
      static_cast<unsigned long long>(scaled_bytes_right_of_decimal) * 100 /
      bytes_per_scale_unit;
  std::string fractional_digits = std::to_string(hundredths);
  if (hundredths > 0 && hundredths < 10) {
    // Keep the leading zero, otherwise 1.05 would be printed as 1.5.
    fractional_digits.insert(fractional_digits.begin(), '0');
  }
  return std::to_string(scaled_bytes_left_of_decimal) + "." + fractional_digits + " " +
         scaled_units[byte_scale - 1];
}
126 
127 template <>
128 std::string to_string(char const*&& v) {
129  return std::string(v);
130 }
131 
132 template <>
133 std::string to_string(std::string&& v) {
134  return std::move(v);
135 }
136 
// Returns a view of the first substr_length characters of str together with a postfix
// to append when displaying it: "" if str was short enough to be returned completely,
// "..." if it was cut off. The view refers to str's buffer and does not own the data.
std::pair<std::string_view, const char*> substring(const std::string& str,
                                                   size_t substr_length) {
  const std::string_view whole{str};
  const bool fits_completely = substr_length >= whole.size();
  if (fits_completely) {
    return {whole, ""};
  }
  // Truncation always starts at position 0 of str.
  return {whole.substr(0, substr_length), "..."};
}
149 
// Returns a string of len characters, each drawn uniformly from the digits 0-9 and the
// lower and upper case letters a-z / A-Z. The generator is a function-local static that
// is seeded once from std::random_device and shared by all calls.
std::string generate_random_string(const size_t len) {
  static char charset[] =
      "0123456789"
      "abcdefghijklmnopqrstuvwxyz"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  static std::mt19937 prng{std::random_device{}()};
  // Inclusive index range over charset, excluding the terminating NUL.
  static std::uniform_int_distribution<size_t> dist(0, strlen(charset) - 1);

  std::string random_chars(len, '\0');
  for (char& slot : random_chars) {
    slot = charset[dist(prng)];
  }
  return random_chars;
}
166 
167 #ifndef __CUDACC__
// This version of split works almost exactly like Python's split,
// which is very conveniently designed.
// See also: https://docs.python.org/3.8/library/stdtypes.html#str.split
//
// str:      text to split.
// delim:    delimiter; if empty, any run of whitespace separates the words and
//           leading/trailing whitespace produces no empty entries.
// maxsplit: optional upper bound on the number of splits.
//
// With a non-empty delim the result always has at least one element and the remainder
// after the last split is appended as is. In whitespace mode, once maxsplit words have
// been split off, the rest of str is appended verbatim, including its leading
// whitespace.
std::vector<std::string> split(std::string_view str,
                               std::string_view delim,
                               std::optional<size_t> maxsplit) {
  std::vector<std::string> result;

  // Use an explicit delimiter.
  if (!delim.empty()) {
    std::string::size_type i = 0, j = 0;
    while ((i = str.find(delim, i)) != std::string::npos &&
           (!maxsplit || result.size() < maxsplit.value())) {
      result.emplace_back(str, j, i - j);
      i += delim.size();
      j = i;
    }
    result.emplace_back(str, j, std::string::npos);
    return result;
  }

  // Treat any number of consecutive whitespace characters as a delimiter.
  // std::isspace requires a value representable as unsigned char; passing a plain char
  // holding a non-ASCII byte would be undefined behavior where char is signed.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  bool prev_ws = true;
  std::string::size_type i = 0, j = 0;
  for (; i < str.size(); ++i) {
    if (prev_ws) {
      if (!is_space(str[i])) {
        // start of word
        prev_ws = false;
        j = i;
      }
    } else if (is_space(str[i])) {
      // start of space
      result.emplace_back(str, j, i - j);
      prev_ws = true;
      j = i;
      if (maxsplit && result.size() == maxsplit.value()) {
        // stop early if maxsplit was reached
        result.emplace_back(str, j, std::string::npos);
        return result;
      }
    }
  }
  if (!prev_ws) {
    // str ended in the middle of a word
    result.emplace_back(str, j, std::string::npos);
  }
  return result;
}
219 
// Returns the part of str that remains after removing whitespace (as classified by
// std::isspace) from both ends. The result is a view into the same character data as
// str; an all-whitespace or empty input yields an empty view.
std::string_view sv_strip(std::string_view str) {
  // std::isspace requires a value representable as unsigned char; passing a plain char
  // holding a non-ASCII byte would be undefined behavior where char is signed.
  const auto is_space = [](const char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  std::string::size_type i = 0;
  while (i < str.size() && is_space(str[i])) {
    ++i;
  }
  std::string::size_type j = str.size();
  while (j > i && is_space(str[j - 1])) {
    --j;
  }
  return str.substr(i, j - i);
}
228 
229 std::string strip(std::string_view str) {
230  return std::string(sv_strip(str));
231 }
232 
// Checks whether the range [start, start + length) lies completely within one of the
// [begin, end) ranges in literal_positions. Returns the end offset of the first range
// that contains it, or std::nullopt if no range does.
std::optional<size_t> inside_string_literal(
    const size_t start,
    const size_t length,
    std::vector<std::pair<size_t, size_t>> const& literal_positions) {
  const size_t range_end = start + length;
  for (const auto& [literal_begin, literal_end] : literal_positions) {
    const bool starts_before = start < literal_begin;
    const bool ends_after = literal_end < range_end;
    if (starts_before || ends_after) {
      continue;
    }
    return literal_end;
  }
  return std::nullopt;
}
245 
246 #endif // __CUDACC__
247 
// Replaces every newline, carriage return and tab that is not inside a quoted region of
// str with a space, modifying str in place.
//
// A quoted region starts at a single or double quote and ends at the next quote of the
// same kind; a quote directly preceded by an unescaped backslash neither starts nor ends
// a region. Characters inside a quoted region are left untouched.
//
// Returns true if every quoted region was closed by the end of str, false otherwise.
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(
    std::string& str) noexcept {
  char inside_quote = 0;
  bool previous_c_was_backslash = false;
  for (auto& c : str) {
    // if this character is a quote of either type
    if (c == '\'' || c == '\"') {
      // ignore if previous character was a backslash
      if (!previous_c_was_backslash) {
        // start or end of a quoted region
        if (inside_quote == c) {
          // end region
          inside_quote = 0;
        } else if (inside_quote == 0) {
          // start region
          inside_quote = c;
        }
      }
    } else if (inside_quote == 0) {
      // outside quoted region
      if (c == '\n' || c == '\t' || c == '\r') {
        // replace these with space
        c = ' ';
      }
      // otherwise leave alone, including quotes of a different type
    }
    // handle backslashes, except for double backslashes
    if (c == '\\') {
      previous_c_was_backslash = !previous_c_was_backslash;
    } else {
      previous_c_was_backslash = false;
    }
  }
  // if we didn't end a region, there were unclosed or mixed-nested quotes
  // accounting for backslashes should mean that this should only be the
  // case with truly malformed strings which Calcite will barf on anyway
  return (inside_quote == 0);
}
286 
287 #ifndef __CUDACC__
// Returns filename wrapped in the quote character, with every occurrence of quote or
// escape inside it prefixed by escape (the formatting performed by std::quoted).
std::string get_quoted_string(const std::string& filename, char quote, char escape) {
  std::ostringstream quoted_stream;
  // TODO: prevents string_view Jun 2020
  quoted_stream << std::quoted(filename, quote, escape);
  return quoted_stream.str();
}
293 #endif // __CUDACC__
294 
295 #ifndef __CUDACC__
// Returns a copy of str in which every control character (byte value below 32) is
// replaced by a space. All other bytes, including non-ASCII ones, are kept.
std::string simple_sanitize(const std::string& str) {
  auto sanitized_str{str};
  for (auto& c : sanitized_str) {
    // Compare as unsigned: where char is signed, bytes >= 0x80 are negative and would
    // otherwise be treated as control characters, blanking out non-ASCII text.
    if (static_cast<unsigned char>(c) < 32) {
      c = ' ';
    }
  }
  return sanitized_str;
}
303 #endif // __CUDACC__
std::string hide_sensitive_data_from_query(std::string const &query_str)
std::optional< size_t > inside_string_literal(const size_t start, const size_t length, std::vector< std::pair< size_t, size_t >> const &literal_positions)
std::vector< std::pair< size_t, size_t > > find_string_literals(const std::string &query)
std::string_view sv_strip(std::string_view str)
return trimmed string_view
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
#define CHECK_GE(x, y)
Definition: Logger.h:306
std::string simple_sanitize(const std::string &str)
simple sanitize string (replace control characters with space)
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
std::string generate_random_string(const size_t len)
DEVICE auto accumulate(ARGS &&...args)
Definition: gpu_enabled.h:42
void apply_shim(std::string &result, const boost::regex &reg_expr, const std::function< void(std::string &, const boost::smatch &)> &shim_fn)
std::string format_num_bytes(const size_t bytes)
#define CHECK_LE(x, y)
Definition: Logger.h:304
bool remove_unquoted_newlines_linefeeds_and_tabs_from_sql_string(std::string &str) noexcept
sanitize an SQL string
std::pair< std::string_view, const char * > substring(const std::string &str, size_t substr_length)
return substring of str with postfix if str.size() > substr_length