OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DdlUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DdlUtils.h"
18 
19 #include <unordered_set>
20 
21 #include <boost/algorithm/string.hpp>
22 #include <boost/filesystem.hpp>
23 #include <boost/program_options.hpp>
24 
25 #include "rapidjson/document.h"
26 
28 #include "Geospatial/Types.h"
30 #include "Shared/SysDefinitions.h"
31 #include "Shared/file_path_util.h"
32 #include "Shared/misc.h"
33 #include "Shared/sqltypes.h"
34 
36 
37 namespace ddl_utils {
38 
39 namespace alter_column_utils {
40 bool compare_sql_type_infos(const SQLTypeInfo& lhs, const SQLTypeInfo& rhs) {
41  return lhs.get_type() == rhs.get_type() && lhs.get_subtype() == rhs.get_subtype() &&
42  lhs.get_dimension() == rhs.get_dimension() &&
43  lhs.get_scale() == rhs.get_scale() &&
44  lhs.get_compression() == rhs.get_compression() &&
45  (lhs.get_compression() == kENCODING_NONE ||
46  lhs.get_comp_param() == rhs.get_comp_param() ||
47  (lhs.get_compression() == kENCODING_DICT &&
48  lhs.get_size() == rhs.get_size())) &&
49  lhs.get_notnull() == rhs.get_notnull();
50 }
51 
53  const ColumnDescriptor* rhs) {
55  result.defaults_match =
56  (!lhs->default_value.has_value() && !rhs->default_value.has_value()) ||
57  (lhs->default_value.has_value() && rhs->default_value.has_value() &&
58  lhs->default_value.value() == rhs->default_value.value());
60  result.remainder_match =
61  lhs->tableId == rhs->tableId && lhs->columnId == rhs->columnId &&
62  lhs->columnName == rhs->columnName && lhs->sourceName == rhs->sourceName &&
63  lhs->chunks == rhs->chunks && lhs->isSystemCol == rhs->isSystemCol &&
64  lhs->isVirtualCol == rhs->isVirtualCol && lhs->virtualExpr == rhs->virtualExpr &&
65  lhs->isDeletedCol == rhs->isDeletedCol && lhs->isGeoPhyCol == rhs->isGeoPhyCol;
66  result.exact_match =
67  result.defaults_match && result.sql_types_match && result.remainder_match;
68  return result;
69 }
70 
71 } // namespace alter_column_utils
72 
73 SqlType::SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
74  : type(type)
75  , param1(param1)
76  , param2(param2)
77  , is_array(is_array)
78  , array_size(array_size) {}
79 
81  return type;
82 }
83 
84 int SqlType::get_param1() const {
85  return param1;
86 }
87 
88 void SqlType::set_param1(int param) {
89  param1 = param;
90 }
91 
92 int SqlType::get_param2() const {
93  return param2;
94 }
95 
96 bool SqlType::get_is_array() const {
97  return is_array;
98 }
99 
101  is_array = a;
102 }
103 
105  return array_size;
106 }
107 
109  array_size = s;
110 }
111 
112 std::string SqlType::to_string() const {
113  std::string str;
114  switch (type) {
115  case kBOOLEAN:
116  str = "BOOLEAN";
117  break;
118  case kCHAR:
119  str = "CHAR(" + boost::lexical_cast<std::string>(param1) + ")";
120  break;
121  case kVARCHAR:
122  str = "VARCHAR(" + boost::lexical_cast<std::string>(param1) + ")";
123  break;
124  case kTEXT:
125  str = "TEXT";
126  break;
127  case kNUMERIC:
128  str = "NUMERIC(" + boost::lexical_cast<std::string>(param1);
129  if (param2 > 0) {
130  str += ", " + boost::lexical_cast<std::string>(param2);
131  }
132  str += ")";
133  break;
134  case kDECIMAL:
135  str = "DECIMAL(" + boost::lexical_cast<std::string>(param1);
136  if (param2 > 0) {
137  str += ", " + boost::lexical_cast<std::string>(param2);
138  }
139  str += ")";
140  break;
141  case kBIGINT:
142  str = "BIGINT";
143  break;
144  case kINT:
145  str = "INT";
146  break;
147  case kTINYINT:
148  str = "TINYINT";
149  break;
150  case kSMALLINT:
151  str = "SMALLINT";
152  break;
153  case kFLOAT:
154  str = "FLOAT";
155  break;
156  case kDOUBLE:
157  str = "DOUBLE";
158  break;
159  case kTIME:
160  str = "TIME";
161  if (param1 < 6) {
162  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
163  }
164  break;
165  case kTIMESTAMP:
166  str = "TIMESTAMP";
167  if (param1 <= 9) {
168  str += "(" + boost::lexical_cast<std::string>(param1) + ")";
169  }
170  break;
171  case kDATE:
172  str = "DATE";
173  break;
174  default:
175  assert(false);
176  break;
177  }
178  if (is_array) {
179  str += "[";
180  if (array_size > 0) {
181  str += boost::lexical_cast<std::string>(array_size);
182  }
183  str += "]";
184  }
185  return str;
186 }
187 
189  switch (type) {
190  case kCHAR:
191  case kVARCHAR:
192  if (param1 <= 0) {
193  throw std::runtime_error("CHAR and VARCHAR must have a positive dimension.");
194  }
195  break;
196  case kDECIMAL:
197  case kNUMERIC:
198  if (param1 <= 0) {
199  throw std::runtime_error("DECIMAL and NUMERIC must have a positive precision.");
201  throw std::runtime_error("DECIMAL and NUMERIC precision cannot be larger than " +
203  ".");
204  } else if (param1 <= param2) {
205  throw std::runtime_error(
206  "DECIMAL and NUMERIC must have precision larger than scale.");
207  }
208  break;
209  case kTIMESTAMP:
210  if (param1 == -1) {
211  param1 = 0; // set default to 0
212  } else if (param1 != 0 && param1 != 3 && param1 != 6 &&
213  param1 != 9) { // support ms, us, ns
214  throw std::runtime_error(
215  "Only TIMESTAMP(n) where n = (0,3,6,9) are supported now.");
216  }
217  break;
218  case kTIME:
219  if (param1 == -1) {
220  param1 = 0; // default precision is 0
221  }
222  if (param1 > 0) { // @TODO(wei) support sub-second precision later.
223  throw std::runtime_error("Only TIME(0) is supported now.");
224  }
225  break;
226  case kPOINT:
227  case kMULTIPOINT:
228  case kLINESTRING:
229  case kMULTILINESTRING:
230  case kPOLYGON:
231  case kMULTIPOLYGON:
232  // Storing SRID in param1
233  break;
234  default:
235  param1 = 0;
236  break;
237  }
238 }
239 
240 Encoding::Encoding(std::string* encoding_name, int encoding_param)
241  : encoding_name(encoding_name), encoding_param(encoding_param) {}
242 
243 const std::string* Encoding::get_encoding_name() const {
244  return encoding_name.get();
245 }
246 
248  return encoding_param;
249 }
250 
252  // Change default TEXT column behaviour to be DICT encoded
253  if (cd.columnType.is_string() || cd.columnType.is_string_array()) {
254  // default to 32-bits
256  cd.columnType.set_comp_param(32);
257  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 4) {
259  cd.columnType.set_comp_param(16);
260  } else if (cd.columnType.is_decimal() && cd.columnType.get_precision() <= 9) {
262  cd.columnType.set_comp_param(32);
263  } else if (cd.columnType.is_decimal() &&
265  throw std::runtime_error(cd.columnName + ": Precision too high, max " +
267  } else if (cd.columnType.is_geometry() && cd.columnType.get_output_srid() == 4326) {
268  // default to GEOINT 32-bits
270  cd.columnType.set_comp_param(32);
272  // Days encoding for DATE
275  } else {
278  }
279 }
280 
282  int encoding_size,
283  const SqlType* column_type) {
284  auto type = cd.columnType.get_type();
285  // fixed-bits encoding
286  if (type == kARRAY) {
287  type = cd.columnType.get_subtype();
288  switch (type) {
289  case kTINYINT:
290  case kSMALLINT:
291  case kINT:
292  case kBIGINT:
293  case kDATE:
294  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
295  column_type->to_string() + " type array.");
296  break;
297  default:
298  break;
299  }
300  }
301 
302  if (!IS_INTEGER(type) && !is_datetime(type) &&
303  !(type == kDECIMAL || type == kNUMERIC)) {
304  throw std::runtime_error(
305  cd.columnName +
306  ": Fixed encoding is only supported for integer or time columns.");
307  }
308 
309  switch (type) {
310  case kSMALLINT:
311  if (encoding_size != 8) {
312  throw std::runtime_error(
313  cd.columnName +
314  ": Compression parameter for Fixed encoding on SMALLINT must be 8.");
315  }
316  break;
317  case kINT:
318  if (encoding_size != 8 && encoding_size != 16) {
319  throw std::runtime_error(
320  cd.columnName +
321  ": Compression parameter for Fixed encoding on INTEGER must be 8 or 16.");
322  }
323  break;
324  case kBIGINT:
325  if (encoding_size != 8 && encoding_size != 16 && encoding_size != 32) {
326  throw std::runtime_error(cd.columnName +
327  ": Compression parameter for Fixed encoding on "
328  "BIGINT must be 8 or 16 or 32.");
329  }
330  break;
331  case kTIMESTAMP:
332  case kTIME:
333  if (encoding_size != 32) {
334  throw std::runtime_error(cd.columnName +
335  ": Compression parameter for Fixed encoding on "
336  "TIME or TIMESTAMP must be 32.");
337  } else if (cd.columnType.is_high_precision_timestamp()) {
338  throw std::runtime_error("Fixed encoding is not supported for TIMESTAMP(3|6|9).");
339  }
340  break;
341  case kDECIMAL:
342  case kNUMERIC:
343  if (encoding_size != 32 && encoding_size != 16) {
344  throw std::runtime_error(cd.columnName +
345  ": Compression parameter for Fixed encoding on "
346  "DECIMAL must be 16 or 32.");
347  }
348 
349  if (encoding_size == 32 && cd.columnType.get_precision() > 9) {
350  throw std::runtime_error(cd.columnName +
351  ": Precision too high for Fixed(32) encoding, max 9.");
352  }
353 
354  if (encoding_size == 16 && cd.columnType.get_precision() > 4) {
355  throw std::runtime_error(cd.columnName +
356  ": Precision too high for Fixed(16) encoding, max 4.");
357  }
358  break;
359  case kDATE:
360  if (encoding_size != 32 && encoding_size != 16) {
361  throw std::runtime_error(cd.columnName +
362  ": Compression parameter for Fixed encoding on "
363  "DATE must be 16 or 32.");
364  }
365  break;
366  default:
367  throw std::runtime_error(cd.columnName + ": Cannot apply FIXED encoding to " +
368  column_type->to_string());
369  }
370  if (type == kDATE) {
372  cd.columnType.set_comp_param(16);
373  } else {
375  cd.columnType.set_comp_param(encoding_size);
376  }
377 }
378 
380  if (!cd.columnType.is_string() && !cd.columnType.is_string_array()) {
381  throw std::runtime_error(
382  cd.columnName +
383  ": Dictionary encoding is only supported on string or string array columns.");
384  }
385  int comp_param;
386  if (encoding_size == 0) {
387  comp_param = 32; // default to 32-bits
388  } else {
389  comp_param = encoding_size;
390  }
391  if (cd.columnType.is_string_array() && comp_param != 32) {
392  throw std::runtime_error(cd.columnName +
393  ": Compression parameter for string arrays must be 32");
394  }
395  if (comp_param != 8 && comp_param != 16 && comp_param != 32) {
396  throw std::runtime_error(
397  cd.columnName +
398  ": Compression parameter for Dictionary encoding must be 8 or 16 or 32.");
399  }
400  // dictionary encoding
402  cd.columnType.set_comp_param(comp_param);
403 }
404 
406  if (!cd.columnType.is_string() && !cd.columnType.is_string_array() &&
407  !cd.columnType.is_geometry()) {
408  throw std::runtime_error(
409  cd.columnName +
410  ": None encoding is only supported on string, string array, or geo columns.");
411  }
414 }
415 
417  // sparse column encoding with mostly NULL values
418  if (cd.columnType.get_notnull()) {
419  throw std::runtime_error(cd.columnName +
420  ": Cannot do sparse column encoding on a NOT NULL column.");
421  }
422  if (encoding_size == 0 || encoding_size % 8 != 0 || encoding_size > 48) {
423  throw std::runtime_error(
424  cd.columnName +
425  "Must specify number of bits as 8, 16, 24, 32 or 48 as the parameter to "
426  "sparse-column encoding.");
427  }
429  cd.columnType.set_comp_param(encoding_size);
430  // throw std::runtime_error("SPARSE encoding not supported yet.");
431 }
432 
434  if (!cd.columnType.is_geometry() || cd.columnType.get_output_srid() != 4326) {
435  throw std::runtime_error(
436  cd.columnName + ": COMPRESSED encoding is only supported on WGS84 geo columns.");
437  }
438  int comp_param;
439  if (encoding_size == 0) {
440  comp_param = 32; // default to 32-bits
441  } else {
442  comp_param = encoding_size;
443  }
444  if (comp_param != 32) {
445  throw std::runtime_error(cd.columnName +
446  ": only 32-bit COMPRESSED geo encoding is supported");
447  }
448  // encoding longitude/latitude as integers
450  cd.columnType.set_comp_param(comp_param);
451 }
452 
453 void validate_and_set_date_encoding(ColumnDescriptor& cd, int encoding_size) {
454  // days encoding for dates
455  if (cd.columnType.get_type() == kARRAY && cd.columnType.get_subtype() == kDATE) {
456  throw std::runtime_error(cd.columnName +
457  ": Cannot apply days encoding to date array.");
458  }
459  if (cd.columnType.get_type() != kDATE) {
460  throw std::runtime_error(cd.columnName +
461  ": Days encoding is only supported for DATE columns.");
462  }
463  if (encoding_size != 32 && encoding_size != 16) {
464  throw std::runtime_error(cd.columnName +
465  ": Compression parameter for Days encoding on "
466  "DATE must be 16 or 32.");
467  }
469  cd.columnType.set_comp_param((encoding_size == 16) ? 16 : 0);
470 }
471 
473  const Encoding* encoding,
474  const SqlType* column_type) {
475  if (encoding == nullptr) {
477  } else {
478  const std::string& comp = *encoding->get_encoding_name();
479  if (boost::iequals(comp, "fixed")) {
480  validate_and_set_fixed_encoding(cd, encoding->get_encoding_param(), column_type);
481  } else if (boost::iequals(comp, "rl")) {
482  // run length encoding
485  // throw std::runtime_error("RL(Run Length) encoding not supported yet.");
486  } else if (boost::iequals(comp, "diff")) {
487  // differential encoding
490  // throw std::runtime_error("DIFF(differential) encoding not supported yet.");
491  } else if (boost::iequals(comp, "dict")) {
493  } else if (boost::iequals(comp, "NONE")) {
495  } else if (boost::iequals(comp, "sparse")) {
497  } else if (boost::iequals(comp, "compressed")) {
499  } else if (boost::iequals(comp, "days")) {
501  } else {
502  throw std::runtime_error(cd.columnName + ": Invalid column compression scheme " +
503  comp);
504  }
505  }
506 }
507 
509  column_type->check_type();
510 
511  if (column_type->get_type() == kGEOMETRY) {
512  throw std::runtime_error("Unsupported type \"GEOMETRY\" specified.");
513  }
514 
515  if (column_type->get_is_array()) {
517  cd.columnType.set_subtype(column_type->get_type());
518  } else {
519  cd.columnType.set_type(column_type->get_type());
520  }
521  if (IS_GEO(column_type->get_type())) {
522  cd.columnType.set_subtype(static_cast<SQLTypes>(column_type->get_param1()));
523  cd.columnType.set_input_srid(column_type->get_param2());
524  cd.columnType.set_output_srid(column_type->get_param2());
525  } else {
526  cd.columnType.set_dimension(column_type->get_param1());
527  cd.columnType.set_scale(column_type->get_param2());
528  }
529 }
530 
531 void validate_and_set_array_size(ColumnDescriptor& cd, const SqlType* column_type) {
532  if (cd.columnType.is_string_array() &&
534  throw std::runtime_error(
535  cd.columnName +
536  ": Array of strings must be dictionary encoded. Specify ENCODING DICT");
537  }
538 
539  if (column_type->get_is_array()) {
540  int s = -1;
541  auto array_size = column_type->get_array_size();
542  if (array_size > 0) {
543  auto sti = cd.columnType.get_elem_type();
544  s = array_size * sti.get_size();
545  if (s <= 0) {
546  throw std::runtime_error(cd.columnName + ": Unexpected fixed length array size");
547  }
548  }
549  cd.columnType.set_size(s);
550 
551  } else {
553  }
554 }
555 
556 namespace {
557 
558 void validate_literal(const std::string& val,
559  SQLTypeInfo column_type,
560  const std::string& column_name) {
561  if (to_upper(val) == "NULL") {
562  return;
563  }
564  switch (column_type.get_type()) {
565  case kBOOLEAN:
566  case kTINYINT:
567  case kSMALLINT:
568  case kINT:
569  case kBIGINT:
570  case kFLOAT:
571  case kDOUBLE:
572  case kTIME:
573  case kTIMESTAMP:
574  StringToDatum(val, column_type);
575  break;
576  case kDATE: {
577  auto d = StringToDatum(val, column_type);
578  DateDaysOverflowValidator validator(column_type);
579  validator.validate(d.bigintval);
580  break;
581  }
582  case kDECIMAL:
583  case kNUMERIC: {
584  SQLTypeInfo ti(kNUMERIC, 0, 0, false);
585  auto d = StringToDatum(val, ti);
586  auto converted_val = convert_decimal_value_to_scale(d.bigintval, ti, column_type);
587  DecimalOverflowValidator validator(column_type);
588  validator.validate(converted_val);
589  break;
590  }
591  case kTEXT:
592  case kVARCHAR:
593  case kCHAR:
594  if (column_type.get_max_strlen() < val.length()) {
595  throw std::runtime_error("String too long for column " + column_name + " was " +
596  std::to_string(val.length()) + " max is " +
597  std::to_string(column_type.get_max_strlen()));
598  }
599  break;
600  case kARRAY: {
601  if (val.front() != '{' || val.back() != '}') {
602  throw std::runtime_error(column_name +
603  ": arrays should start and end with curly braces");
604  }
605  std::vector<std::string> elements = split(val.substr(1, val.length() - 2), ", ");
606  if (column_type.get_size() > 0) {
607  auto sti = column_type.get_elem_type();
608  size_t expected_size = column_type.get_size() / sti.get_size();
609  size_t actual_size = elements.size();
610  if (actual_size != expected_size) {
611  throw std::runtime_error("Fixed length array column " + column_name +
612  " expects " + std::to_string(expected_size) +
613  " values, received " + std::to_string(actual_size));
614  }
615  }
616  SQLTypeInfo element_ti = column_type.get_elem_type();
617  for (const auto& element : elements) {
618  if (to_upper(element) != "NULL") {
619  validate_literal(element, element_ti, column_name);
620  }
621  }
622  break;
623  }
624  case kPOINT:
625  case kMULTIPOINT:
626  case kLINESTRING:
627  case kMULTILINESTRING:
628  case kPOLYGON:
629  case kMULTIPOLYGON:
630  if (val.empty()) {
631  return;
632  }
633  try {
634  const bool validate_with_geos_if_available = false;
636  val, validate_with_geos_if_available);
637  if (!geo) {
638  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
639  column_name);
640  }
641  if (!geo->transform(column_type)) {
642  throw std::runtime_error("Cannot transform SRID for literal '" + val +
643  "' for column " + column_name);
644  } else {
645  auto sql_type = column_type.get_type();
646  auto geo_type = geo->getType();
647  if ((geo_type == Geospatial::GeoBase::GeoType::kPOINT && sql_type != kPOINT) ||
649  sql_type != kMULTIPOINT) ||
651  sql_type != kLINESTRING) ||
653  sql_type != kMULTILINESTRING) ||
655  sql_type != kPOLYGON) ||
657  sql_type != kMULTIPOLYGON)) {
658  throw std::runtime_error("Geo literal '" + val +
659  "' doesn't match the type "
660  "of column column " +
661  column_name);
662  }
663  }
664  } catch (Geospatial::GeoTypesError& e) {
665  throw std::runtime_error("Unexpected geo literal '" + val + "' for column " +
666  column_name + ": " + e.what());
667  }
668  break;
669  default:
670  CHECK(false) << "validate_literal() does not support type "
671  << column_type.get_type();
672  }
673 }
674 
675 } // namespace
676 
678  const std::string* default_value,
679  bool not_null) {
680  bool is_null_literal =
681  default_value && ((to_upper(*default_value) == "NULL") ||
682  (cd.columnType.is_geometry() && default_value->empty()));
683  if (not_null && (is_null_literal)) {
684  throw std::runtime_error(cd.columnName +
685  ": cannot set default value to NULL for "
686  "NOT NULL column");
687  }
688  if (!default_value || is_null_literal) {
689  cd.default_value = std::nullopt;
690  return;
691  }
692  const auto& column_type = cd.columnType;
693  const auto& val = *default_value;
694  validate_literal(val, column_type, cd.columnName);
695  cd.default_value = std::make_optional(*default_value);
696 }
697 
698 void set_column_descriptor(const std::string& column_name,
699  ColumnDescriptor& cd,
700  SqlType* column_type,
701  const bool not_null,
702  const Encoding* encoding,
703  const std::string* default_value) {
704  cd.columnName = column_name;
705  validate_and_set_type(cd, column_type);
706  cd.columnType.set_notnull(not_null);
707  validate_and_set_encoding(cd, encoding, column_type);
708  validate_and_set_array_size(cd, column_type);
709  cd.isSystemCol = false;
710  cd.isVirtualCol = false;
711  validate_and_set_default_value(cd, default_value, not_null);
712 }
713 
714 void set_default_table_attributes(const std::string& table_name,
715  TableDescriptor& td,
716  const int32_t column_count) {
717  td.tableName = table_name;
718  td.nColumns = column_count;
719  td.isView = false;
720  td.fragmenter = nullptr;
726 }
727 
728 void validate_non_duplicate_column(const std::string& column_name,
729  std::unordered_set<std::string>& upper_column_names) {
730  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
731  const auto insert_it = upper_column_names.insert(upper_column_name);
732  if (!insert_it.second) {
733  throw std::runtime_error("Column '" + column_name + "' defined more than once");
734  }
735 }
736 
737 void validate_non_reserved_keyword(const std::string& column_name) {
738  const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
739  if (reserved_keywords.find(upper_column_name) != reserved_keywords.end()) {
740  throw std::runtime_error("Cannot create column with reserved keyword '" +
741  column_name + "'");
742  }
743 }
744 
746  const TableType expected_table_type,
747  const std::string& command) {
748  if (td->isView) {
749  if (expected_table_type != TableType::VIEW) {
750  throw std::runtime_error(td->tableName + " is a view. Use " + command + " VIEW.");
751  }
752  } else if (td->storageType == StorageType::FOREIGN_TABLE) {
753  if (expected_table_type != TableType::FOREIGN_TABLE) {
754  throw std::runtime_error(td->tableName + " is a foreign table. Use " + command +
755  " FOREIGN TABLE.");
756  }
757  } else if (expected_table_type != TableType::TABLE) {
758  throw std::runtime_error(td->tableName + " is a table. Use " + command + " TABLE.");
759  }
760 }
761 
762 std::string table_type_enum_to_string(const TableType table_type) {
763  if (table_type == ddl_utils::TableType::TABLE) {
764  return "Table";
765  }
766  if (table_type == ddl_utils::TableType::FOREIGN_TABLE) {
767  return "ForeignTable";
768  }
769  if (table_type == ddl_utils::TableType::VIEW) {
770  return "View";
771  }
772  throw std::runtime_error{"Unexpected table type"};
773 }
774 
// Builds the error text reported when the configuration value stored under
// config_key is not a JSON list of path strings.
std::string get_malformed_config_error_message(const std::string& config_key) {
  std::string message{"Configuration value for \""};
  message += config_key;
  message +=
      "\" is malformed. Value should be a list of paths with format: [ "
      "\"root-path-1\", \"root-path-2\", ... ]";
  return message;
}
780 
781 void validate_expanded_file_path(const std::string& file_path,
782  const std::vector<std::string>& whitelisted_root_paths) {
783  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
784  for (const auto& root_path : whitelisted_root_paths) {
785  if (boost::istarts_with(canonical_file_path.string(), root_path)) {
786  return;
787  }
788  }
789  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
790  throw std::runtime_error{"File or directory path \"" + file_path +
791  "\" is not whitelisted."};
792  }
793  throw std::runtime_error{"File or directory path \"" + file_path +
794  "\" (resolved to \"" + canonical_file_path.string() +
795  "\") is not whitelisted."};
796 }
797 
798 std::vector<std::string> get_expanded_file_paths(
799  const std::string& file_path,
800  const DataTransferType data_transfer_type) {
801  std::vector<std::string> file_paths;
802  if (data_transfer_type == DataTransferType::IMPORT) {
803  file_paths = shared::local_glob_filter_sort_files(file_path, {});
804  } else {
805  std::string path;
806  if (!boost::filesystem::exists(file_path)) {
807  // For exports, it is possible to provide a path to a new (nonexistent) file. In
808  // this case, validate using the parent path.
809  path = boost::filesystem::path(file_path).parent_path().string();
810  if (!boost::filesystem::exists(path)) {
811  throw std::runtime_error{"File or directory \"" + file_path +
812  "\" does not exist."};
813  }
814  } else {
815  path = file_path;
816  }
817  file_paths = {path};
818  }
819  return file_paths;
820 }
821 
822 void validate_allowed_file_path(const std::string& file_path,
823  const DataTransferType data_transfer_type,
824  const bool allow_wildcards) {
825  // Reject any punctuation characters except for a few safe ones.
826  // Some punctuation characters present a security risk when passed
827  // to subprocesses. Don't change this without a security review.
828  static const std::string safe_punctuation{"./_+-=:~"};
829  for (const auto& ch : file_path) {
830  if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos &&
831  !(allow_wildcards && ch == '*')) {
832  throw std::runtime_error(std::string("Punctuation \"") + ch +
833  "\" is not allowed in file path: " + file_path);
834  }
835  }
836 
837  // Enforce our whitelist and blacklist for file paths.
838  const auto& expanded_file_paths =
839  get_expanded_file_paths(file_path, data_transfer_type);
840  for (const auto& path : expanded_file_paths) {
842  const auto& canonical_file_path = boost::filesystem::canonical(file_path);
843  if (canonical_file_path == boost::filesystem::absolute(file_path)) {
844  throw std::runtime_error{"Access to file or directory path \"" + file_path +
845  "\" is not allowed."};
846  }
847  throw std::runtime_error{"Access to file or directory path \"" + file_path +
848  "\" (resolved to \"" + canonical_file_path.string() +
849  "\") is not allowed."};
850  }
851  }
852  FilePathWhitelist::validateWhitelistedFilePath(expanded_file_paths, data_transfer_type);
853 }
854 
855 void set_whitelisted_paths(const std::string& config_key,
856  const std::string& config_value,
857  std::vector<std::string>& whitelisted_paths) {
858  rapidjson::Document whitelisted_root_paths;
859  whitelisted_root_paths.Parse(config_value);
860  if (!whitelisted_root_paths.IsArray()) {
861  throw std::runtime_error{get_malformed_config_error_message(config_key)};
862  }
863  for (const auto& root_path : whitelisted_root_paths.GetArray()) {
864  if (!root_path.IsString()) {
865  throw std::runtime_error{get_malformed_config_error_message(config_key)};
866  }
867  if (!boost::filesystem::exists(root_path.GetString())) {
868  throw std::runtime_error{"Whitelisted root path \"" +
869  std::string{root_path.GetString()} + "\" does not exist."};
870  }
871  whitelisted_paths.emplace_back(
872  boost::filesystem::canonical(root_path.GetString()).string());
873  }
874  LOG(INFO) << "Parsed " << config_key << ": "
875  << shared::printContainer(whitelisted_paths);
876 }
877 
878 void FilePathWhitelist::initialize(const std::string& data_dir,
879  const std::string& allowed_import_paths,
880  const std::string& allowed_export_paths) {
881  CHECK(!data_dir.empty());
882  CHECK(boost::filesystem::is_directory(data_dir));
883 
884  auto data_dir_path = boost::filesystem::canonical(data_dir);
886  whitelisted_import_paths_.emplace_back(
887  (data_dir_path / shared::kDefaultImportDirName).string());
888 
890  whitelisted_export_paths_.emplace_back(
891  (data_dir_path / shared::kDefaultExportDirName).string());
892 
893  if (!allowed_import_paths.empty()) {
895  "allowed-import-paths", allowed_import_paths, whitelisted_import_paths_);
896  }
897  if (!allowed_export_paths.empty()) {
899  "allowed-export-paths", allowed_export_paths, whitelisted_export_paths_);
900  }
901 }
902 
904  const std::vector<std::string>& expanded_file_paths,
905  const DataTransferType data_transfer_type) {
906  for (const auto& path : expanded_file_paths) {
907  if (data_transfer_type == DataTransferType::IMPORT) {
909  } else if (data_transfer_type == DataTransferType::EXPORT) {
911  } else {
912  UNREACHABLE();
913  }
914  }
915 }
916 
920 }
921 
922 std::vector<std::string> FilePathWhitelist::whitelisted_import_paths_{};
923 std::vector<std::string> FilePathWhitelist::whitelisted_export_paths_{};
924 
925 void FilePathBlacklist::addToBlacklist(const std::string& path) {
926  CHECK(!path.empty());
927  blacklisted_paths_.emplace_back(path);
928 }
929 
930 bool FilePathBlacklist::isBlacklistedPath(const std::string& path) {
931  const auto canonical_path = boost::filesystem::canonical(path).string();
932  for (const auto& blacklisted_path : blacklisted_paths_) {
933  std::string full_path;
934  try {
935  full_path = boost::filesystem::canonical(blacklisted_path).string();
936  } catch (...) {
943  full_path = boost::filesystem::absolute(blacklisted_path).string();
944  }
945  if (boost::istarts_with(canonical_path, full_path)) {
946  return true;
947  }
948  }
949  return false;
950 }
951 
953  blacklisted_paths_.clear();
954 }
955 
956 std::vector<std::string> FilePathBlacklist::blacklisted_paths_{};
957 } // namespace ddl_utils
static std::set< std::string > reserved_keywords
std::string virtualExpr
DataTransferType
Definition: DdlUtils.h:80
HOST DEVICE SQLTypes get_subtype() const
Definition: sqltypes.h:392
void set_compression(EncodingType c)
Definition: sqltypes.h:481
void set_size(int s)
Definition: sqltypes.h:478
void validate_and_set_sparse_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:416
std::vector< std::string > get_expanded_file_paths(const std::string &file_path, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:798
static std::vector< std::string > whitelisted_export_paths_
Definition: DdlUtils.h:94
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
void validate_literal(const std::string &val, SQLTypeInfo column_type, const std::string &column_name)
Definition: DdlUtils.cpp:558
shared utility for globbing files, paths can be specified as either a single file, directory or wildcards
static constexpr int32_t kMaxNumericPrecision
Definition: sqltypes.h:58
Definition: sqltypes.h:76
SQLTypes
Definition: sqltypes.h:65
std::string tableName
SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
Definition: DdlUtils.cpp:73
CompareResult compare_column_descriptors(const ColumnDescriptor *lhs, const ColumnDescriptor *rhs)
Definition: DdlUtils.cpp:52
void validate_and_set_array_size(ColumnDescriptor &cd, const SqlType *column_type)
Definition: DdlUtils.cpp:531
virtual void check_type()
Definition: DdlUtils.cpp:188
static void initialize(const std::string &data_dir, const std::string &allowed_import_paths, const std::string &allowed_export_paths)
Definition: DdlUtils.cpp:878
void validate_and_set_dictionary_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:379
#define LOG(tag)
Definition: Logger.h:285
HOST DEVICE int get_scale() const
Definition: sqltypes.h:396
std::string storageType
#define DEFAULT_MAX_CHUNK_SIZE
#define UNREACHABLE()
Definition: Logger.h:338
HOST DEVICE void set_subtype(SQLTypes st)
Definition: sqltypes.h:471
virtual int get_encoding_param() const
Definition: DdlUtils.cpp:247
Constants for Builtin SQL Types supported by HEAVY.AI.
const std::string kDefaultExportDirName
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
std::string sourceName
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
void validate_non_duplicate_column(const std::string &column_name, std::unordered_set< std::string > &upper_column_names)
Definition: DdlUtils.cpp:728
std::string chunks
constexpr double a
Definition: Utm.h:32
void set_column_descriptor(const std::string &column_name, ColumnDescriptor &cd, SqlType *column_type, const bool not_null, const Encoding *encoding, const std::string *default_value)
Definition: DdlUtils.cpp:698
void validate_and_set_none_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:405
void set_input_srid(int d)
Definition: sqltypes.h:474
void validate_and_set_encoding(ColumnDescriptor &cd, const Encoding *encoding, const SqlType *column_type)
Definition: DdlUtils.cpp:472
bool g_use_date_in_days_default_encoding
Definition: DdlUtils.cpp:35
#define DEFAULT_MAX_ROWS
const std::string kDefaultImportDirName
static std::vector< std::string > whitelisted_import_paths_
Definition: DdlUtils.h:93
static void validateWhitelistedFilePath(const std::vector< std::string > &expanded_file_paths, const DataTransferType data_transfer_type)
Definition: DdlUtils.cpp:903
void set_fixed_size()
Definition: sqltypes.h:479
void set_default_encoding(ColumnDescriptor &cd)
Definition: DdlUtils.cpp:251
void set_scale(int s)
Definition: sqltypes.h:475
SQLTypes type
Definition: DdlUtils.h:54
void validate(T value)
Definition: Encoder.h:122
virtual SQLTypes get_type() const
Definition: DdlUtils.cpp:80
void validate_expanded_file_path(const std::string &file_path, const std::vector< std::string > &whitelisted_root_paths)
Definition: DdlUtils.cpp:781
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
Definition: Datum.cpp:339
virtual std::string to_string() const
Definition: DdlUtils.cpp:112
void validate_non_reserved_keyword(const std::string &column_name)
Definition: DdlUtils.cpp:737
Specifies the in-memory content of a row in the column metadata table.
void set_default_table_attributes(const std::string &table_name, TableDescriptor &td, const int32_t column_count)
Definition: DdlUtils.cpp:714
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
int get_precision() const
Definition: sqltypes.h:394
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:822
void set_output_srid(int s)
Definition: sqltypes.h:476
std::string to_upper(const std::string &str)
#define DEFAULT_PAGE_SIZE
void set_comp_param(int p)
Definition: sqltypes.h:482
void validate_and_set_compressed_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:433
std::optional< std::string > default_value
Definition: sqltypes.h:79
Definition: sqltypes.h:80
bool compare_sql_type_infos(const SQLTypeInfo &lhs, const SQLTypeInfo &rhs)
Definition: DdlUtils.cpp:40
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:399
virtual const std::string * get_encoding_name() const
Definition: DdlUtils.cpp:243
static std::unique_ptr< GeoBase > createGeoType(const std::string &wkt_or_wkb_hex, const bool validate_with_geos_if_available)
Definition: Types.cpp:1085
int64_t convert_decimal_value_to_scale(const int64_t decimal_value, const SQLTypeInfo &type_info, const SQLTypeInfo &new_type_info)
Definition: Datum.cpp:624
std::string get_malformed_config_error_message(const std::string &config_key)
Definition: DdlUtils.cpp:775
void set_dimension(int d)
Definition: sqltypes.h:472
#define DEFAULT_FRAGMENT_ROWS
void validate_and_set_fixed_encoding(ColumnDescriptor &cd, int encoding_size, const SqlType *column_type)
Definition: DdlUtils.cpp:281
std::string table_type_enum_to_string(const TableType table_type)
Definition: DdlUtils.cpp:762
Fragmenter_Namespace::FragmenterType fragType
HOST DEVICE int get_dimension() const
Definition: sqltypes.h:393
Encoding(std::string *encoding_name, int encoding_param)
Definition: DdlUtils.cpp:240
#define IS_INTEGER(T)
Definition: sqltypes.h:304
void set_whitelisted_paths(const std::string &config_key, const std::string &config_value, std::vector< std::string > &whitelisted_paths)
Definition: DdlUtils.cpp:855
Definition: sqltypes.h:68
virtual void set_param1(int param)
Definition: DdlUtils.cpp:88
HOST DEVICE int get_comp_param() const
Definition: sqltypes.h:402
static bool isBlacklistedPath(const std::string &path)
Definition: DdlUtils.cpp:930
void validate_table_type(const TableDescriptor *td, const TableType expected_table_type, const std::string &command)
Definition: DdlUtils.cpp:745
void set_notnull(bool n)
Definition: sqltypes.h:477
#define CHECK(condition)
Definition: Logger.h:291
bool is_geometry() const
Definition: sqltypes.h:597
void validate_and_set_default_value(ColumnDescriptor &cd, const std::string *default_value, bool not_null)
Definition: DdlUtils.cpp:677
bool is_high_precision_timestamp() const
Definition: sqltypes.h:1036
void validate_and_set_date_encoding(ColumnDescriptor &cd, int encoding_size)
Definition: DdlUtils.cpp:453
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
static void addToBlacklist(const std::string &path)
Definition: DdlUtils.cpp:925
Definition: sqltypes.h:72
std::unique_ptr< std::string > encoding_name
Definition: DdlUtils.h:76
SQLTypeInfo columnType
virtual void set_is_array(bool a)
Definition: DdlUtils.cpp:100
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
Definition: misc.h:108
bool is_string() const
Definition: sqltypes.h:561
virtual int get_param1() const
Definition: DdlUtils.cpp:84
HOST DEVICE bool get_notnull() const
Definition: sqltypes.h:398
static constexpr char const * FOREIGN_TABLE
HOST DEVICE size_t get_max_strlen() const
Definition: sqltypes.h:405
bool is_string_array() const
Definition: sqltypes.h:564
void validate(T value) const
Definition: Encoder.h:54
SQLTypeInfo get_elem_type() const
Definition: sqltypes.h:977
bool is_decimal() const
Definition: sqltypes.h:570
virtual int get_param2() const
Definition: DdlUtils.cpp:92
std::string columnName
#define IS_GEO(T)
Definition: sqltypes.h:310
virtual bool get_is_array() const
Definition: DdlUtils.cpp:96
HOST DEVICE int get_output_srid() const
Definition: sqltypes.h:397
virtual void set_array_size(int s)
Definition: DdlUtils.cpp:108
constexpr auto is_datetime(SQLTypes type)
Definition: sqltypes.h:325
static std::vector< std::string > blacklisted_paths_
Definition: DdlUtils.h:104
virtual int get_array_size() const
Definition: DdlUtils.cpp:104
void validate_and_set_type(ColumnDescriptor &cd, SqlType *column_type)
Definition: DdlUtils.cpp:508
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470