19 #include <unordered_set>
21 #include <boost/algorithm/string.hpp>
22 #include <boost/filesystem.hpp>
23 #include <boost/program_options.hpp>
25 #include "rapidjson/document.h"
39 namespace alter_column_utils {
78 , array_size(array_size) {}
119 str =
"CHAR(" + boost::lexical_cast<std::string>(
param1) +
")";
122 str =
"VARCHAR(" + boost::lexical_cast<std::string>(
param1) +
")";
128 str =
"NUMERIC(" + boost::lexical_cast<std::string>(
param1);
130 str +=
", " + boost::lexical_cast<std::string>(
param2);
135 str =
"DECIMAL(" + boost::lexical_cast<std::string>(
param1);
137 str +=
", " + boost::lexical_cast<std::string>(
param2);
162 str +=
"(" + boost::lexical_cast<std::string>(
param1) +
")";
168 str +=
"(" + boost::lexical_cast<std::string>(
param1) +
")";
181 str += boost::lexical_cast<std::string>(
array_size);
193 throw std::runtime_error(
"CHAR and VARCHAR must have a positive dimension.");
199 throw std::runtime_error(
"DECIMAL and NUMERIC must have a positive precision.");
201 throw std::runtime_error(
"DECIMAL and NUMERIC precision cannot be larger than " +
205 throw std::runtime_error(
206 "DECIMAL and NUMERIC must have precision larger than scale.");
214 throw std::runtime_error(
215 "Only TIMESTAMP(n) where n = (0,3,6,9) are supported now.");
223 throw std::runtime_error(
"Only TIME(0) is supported now.");
241 : encoding_name(encoding_name), encoding_param(encoding_param) {}
265 throw std::runtime_error(cd.
columnName +
": Precision too high, max " +
294 throw std::runtime_error(cd.
columnName +
": Cannot apply FIXED encoding to " +
295 column_type->
to_string() +
" type array.");
304 throw std::runtime_error(
306 ": Fixed encoding is only supported for integer or time columns.");
311 if (encoding_size != 8) {
312 throw std::runtime_error(
314 ": Compression parameter for Fixed encoding on SMALLINT must be 8.");
318 if (encoding_size != 8 && encoding_size != 16) {
319 throw std::runtime_error(
321 ": Compression parameter for Fixed encoding on INTEGER must be 8 or 16.");
325 if (encoding_size != 8 && encoding_size != 16 && encoding_size != 32) {
327 ": Compression parameter for Fixed encoding on "
328 "BIGINT must be 8 or 16 or 32.");
333 if (encoding_size != 32) {
335 ": Compression parameter for Fixed encoding on "
336 "TIME or TIMESTAMP must be 32.");
338 throw std::runtime_error(
"Fixed encoding is not supported for TIMESTAMP(3|6|9).");
343 if (encoding_size != 32 && encoding_size != 16) {
345 ": Compression parameter for Fixed encoding on "
346 "DECIMAL must be 16 or 32.");
351 ": Precision too high for Fixed(32) encoding, max 9.");
356 ": Precision too high for Fixed(16) encoding, max 4.");
360 if (encoding_size != 32 && encoding_size != 16) {
362 ": Compression parameter for Fixed encoding on "
363 "DATE must be 16 or 32.");
367 throw std::runtime_error(cd.
columnName +
": Cannot apply FIXED encoding to " +
381 throw std::runtime_error(
383 ": Dictionary encoding is only supported on string or string array columns.");
386 if (encoding_size == 0) {
389 comp_param = encoding_size;
393 ": Compression parameter for string arrays must be 32");
395 if (comp_param != 8 && comp_param != 16 && comp_param != 32) {
396 throw std::runtime_error(
398 ": Compression parameter for Dictionary encoding must be 8 or 16 or 32.");
408 throw std::runtime_error(
410 ": None encoding is only supported on string, string array, or geo columns.");
420 ": Cannot do sparse column encoding on a NOT NULL column.");
422 if (encoding_size == 0 || encoding_size % 8 != 0 || encoding_size > 48) {
423 throw std::runtime_error(
425 "Must specify number of bits as 8, 16, 24, 32 or 48 as the parameter to "
426 "sparse-column encoding.");
435 throw std::runtime_error(
436 cd.
columnName +
": COMPRESSED encoding is only supported on WGS84 geo columns.");
439 if (encoding_size == 0) {
442 comp_param = encoding_size;
444 if (comp_param != 32) {
446 ": only 32-bit COMPRESSED geo encoding is supported");
457 ": Cannot apply days encoding to date array.");
461 ": Days encoding is only supported for DATE columns.");
463 if (encoding_size != 32 && encoding_size != 16) {
465 ": Compression parameter for Days encoding on "
466 "DATE must be 16 or 32.");
475 if (encoding ==
nullptr) {
479 if (boost::iequals(comp,
"fixed")) {
481 }
else if (boost::iequals(comp,
"rl")) {
486 }
else if (boost::iequals(comp,
"diff")) {
491 }
else if (boost::iequals(comp,
"dict")) {
493 }
else if (boost::iequals(comp,
"NONE")) {
495 }
else if (boost::iequals(comp,
"sparse")) {
497 }
else if (boost::iequals(comp,
"compressed")) {
499 }
else if (boost::iequals(comp,
"days")) {
502 throw std::runtime_error(cd.
columnName +
": Invalid column compression scheme " +
512 throw std::runtime_error(
"Unsupported type \"GEOMETRY\" specified.");
534 throw std::runtime_error(
536 ": Array of strings must be dictionary encoded. Specify ENCODING DICT");
542 if (array_size > 0) {
546 throw std::runtime_error(cd.
columnName +
": Unexpected fixed length array size");
560 const std::string& column_name) {
595 throw std::runtime_error(
"String too long for column " + column_name +
" was " +
601 if (val.front() !=
'{' || val.back() !=
'}') {
602 throw std::runtime_error(column_name +
603 ": arrays should start and end with curly braces");
605 std::vector<std::string> elements =
split(val.substr(1, val.length() - 2),
", ");
608 size_t expected_size = column_type.
get_size() / sti.get_size();
609 size_t actual_size = elements.size();
610 if (actual_size != expected_size) {
611 throw std::runtime_error(
"Fixed length array column " + column_name +
617 for (
const auto& element : elements) {
634 const bool validate_with_geos_if_available =
false;
636 val, validate_with_geos_if_available);
638 throw std::runtime_error(
"Unexpected geo literal '" + val +
"' for column " +
641 if (!geo->transform(column_type)) {
642 throw std::runtime_error(
"Cannot transform SRID for literal '" + val +
643 "' for column " + column_name);
645 auto sql_type = column_type.
get_type();
646 auto geo_type = geo->getType();
658 throw std::runtime_error(
"Geo literal '" + val +
659 "' doesn't match the type "
660 "of column column " +
665 throw std::runtime_error(
"Unexpected geo literal '" + val +
"' for column " +
666 column_name +
": " + e.what());
670 CHECK(
false) <<
"validate_literal() does not support type "
678 const std::string* default_value,
680 bool is_null_literal =
681 default_value && ((
to_upper(*default_value) ==
"NULL") ||
683 if (not_null && (is_null_literal)) {
685 ": cannot set default value to NULL for "
688 if (!default_value || is_null_literal) {
693 const auto& val = *default_value;
703 const std::string* default_value) {
716 const int32_t column_count) {
729 std::unordered_set<std::string>& upper_column_names) {
730 const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
731 const auto insert_it = upper_column_names.insert(upper_column_name);
732 if (!insert_it.second) {
733 throw std::runtime_error(
"Column '" + column_name +
"' defined more than once");
738 const auto upper_column_name = boost::to_upper_copy<std::string>(column_name);
740 throw std::runtime_error(
"Cannot create column with reserved keyword '" +
747 const std::string& command) {
750 throw std::runtime_error(td->
tableName +
" is a view. Use " + command +
" VIEW.");
754 throw std::runtime_error(td->
tableName +
" is a foreign table. Use " + command +
758 throw std::runtime_error(td->
tableName +
" is a table. Use " + command +
" TABLE.");
767 return "ForeignTable";
772 throw std::runtime_error{
"Unexpected table type"};
776 return "Configuration value for \"" + config_key +
777 "\" is malformed. Value should be a list of paths with format: [ "
778 "\"root-path-1\", \"root-path-2\", ... ]";
782 const std::vector<std::string>& whitelisted_root_paths) {
783 const auto& canonical_file_path = boost::filesystem::canonical(file_path);
784 for (
const auto& root_path : whitelisted_root_paths) {
785 if (boost::istarts_with(canonical_file_path.string(), root_path)) {
789 if (canonical_file_path == boost::filesystem::absolute(file_path)) {
790 throw std::runtime_error{
"File or directory path \"" + file_path +
791 "\" is not whitelisted."};
793 throw std::runtime_error{
"File or directory path \"" + file_path +
794 "\" (resolved to \"" + canonical_file_path.string() +
795 "\") is not whitelisted."};
799 const std::string& file_path,
801 std::vector<std::string> file_paths;
806 if (!boost::filesystem::exists(file_path)) {
809 path = boost::filesystem::path(file_path).parent_path().string();
810 if (!boost::filesystem::exists(path)) {
811 throw std::runtime_error{
"File or directory \"" + file_path +
812 "\" does not exist."};
824 const bool allow_wildcards) {
828 static const std::string safe_punctuation{
"./_+-=:~"};
829 for (
const auto& ch : file_path) {
830 if (std::ispunct(ch) && safe_punctuation.find(ch) == std::string::npos &&
831 !(allow_wildcards && ch ==
'*')) {
832 throw std::runtime_error(std::string(
"Punctuation \"") + ch +
833 "\" is not allowed in file path: " + file_path);
838 const auto& expanded_file_paths =
840 for (
const auto& path : expanded_file_paths) {
842 const auto& canonical_file_path = boost::filesystem::canonical(file_path);
843 if (canonical_file_path == boost::filesystem::absolute(file_path)) {
844 throw std::runtime_error{
"Access to file or directory path \"" + file_path +
845 "\" is not allowed."};
847 throw std::runtime_error{
"Access to file or directory path \"" + file_path +
848 "\" (resolved to \"" + canonical_file_path.string() +
849 "\") is not allowed."};
856 const std::string& config_value,
857 std::vector<std::string>& whitelisted_paths) {
858 rapidjson::Document whitelisted_root_paths;
859 whitelisted_root_paths.Parse(config_value);
860 if (!whitelisted_root_paths.IsArray()) {
863 for (
const auto& root_path : whitelisted_root_paths.GetArray()) {
864 if (!root_path.IsString()) {
867 if (!boost::filesystem::exists(root_path.GetString())) {
868 throw std::runtime_error{
"Whitelisted root path \"" +
869 std::string{root_path.GetString()} +
"\" does not exist."};
871 whitelisted_paths.emplace_back(
872 boost::filesystem::canonical(root_path.GetString()).
string());
874 LOG(
INFO) <<
"Parsed " << config_key <<
": "
879 const std::string& allowed_import_paths,
880 const std::string& allowed_export_paths) {
881 CHECK(!data_dir.empty());
882 CHECK(boost::filesystem::is_directory(data_dir));
884 auto data_dir_path = boost::filesystem::canonical(data_dir);
893 if (!allowed_import_paths.empty()) {
897 if (!allowed_export_paths.empty()) {
904 const std::vector<std::string>& expanded_file_paths,
906 for (
const auto& path : expanded_file_paths) {
926 CHECK(!path.empty());
931 const auto canonical_path = boost::filesystem::canonical(path).string();
933 std::string full_path;
935 full_path = boost::filesystem::canonical(blacklisted_path).string();
943 full_path = boost::filesystem::absolute(blacklisted_path).string();
945 if (boost::istarts_with(canonical_path, full_path)) {
static std::set< std::string > reserved_keywords
HOST DEVICE SQLTypes get_subtype() const
void set_compression(EncodingType c)
void validate_and_set_sparse_encoding(ColumnDescriptor &cd, int encoding_size)
std::vector< std::string > get_expanded_file_paths(const std::string &file_path, const DataTransferType data_transfer_type)
static std::vector< std::string > whitelisted_export_paths_
HOST DEVICE int get_size() const
void validate_literal(const std::string &val, SQLTypeInfo column_type, const std::string &column_name)
Shared utility for globbing files; paths can be specified as a single file, a directory, or a wildcard pattern.
static constexpr int32_t kMaxNumericPrecision
SqlType(SQLTypes type, int param1, int param2, bool is_array, int array_size)
CompareResult compare_column_descriptors(const ColumnDescriptor *lhs, const ColumnDescriptor *rhs)
void validate_and_set_array_size(ColumnDescriptor &cd, const SqlType *column_type)
virtual void check_type()
static void initialize(const std::string &data_dir, const std::string &allowed_import_paths, const std::string &allowed_export_paths)
void validate_and_set_dictionary_encoding(ColumnDescriptor &cd, int encoding_size)
HOST DEVICE int get_scale() const
#define DEFAULT_MAX_CHUNK_SIZE
HOST DEVICE void set_subtype(SQLTypes st)
virtual int get_encoding_param() const
Constants for Builtin SQL Types supported by HEAVY.AI.
const std::string kDefaultExportDirName
HOST DEVICE SQLTypes get_type() const
void validate_non_duplicate_column(const std::string &column_name, std::unordered_set< std::string > &upper_column_names)
void set_column_descriptor(const std::string &column_name, ColumnDescriptor &cd, SqlType *column_type, const bool not_null, const Encoding *encoding, const std::string *default_value)
void validate_and_set_none_encoding(ColumnDescriptor &cd)
void set_input_srid(int d)
void validate_and_set_encoding(ColumnDescriptor &cd, const Encoding *encoding, const SqlType *column_type)
bool g_use_date_in_days_default_encoding
const std::string kDefaultImportDirName
static std::vector< std::string > whitelisted_import_paths_
static void validateWhitelistedFilePath(const std::vector< std::string > &expanded_file_paths, const DataTransferType data_transfer_type)
void set_default_encoding(ColumnDescriptor &cd)
virtual SQLTypes get_type() const
void validate_expanded_file_path(const std::string &file_path, const std::vector< std::string > &whitelisted_root_paths)
Datum StringToDatum(const std::string_view s, SQLTypeInfo &ti)
virtual std::string to_string() const
void validate_non_reserved_keyword(const std::string &column_name)
Specifies the in-memory content of a row in the column metadata table.
void set_default_table_attributes(const std::string &table_name, TableDescriptor &td, const int32_t column_count)
std::shared_ptr< Fragmenter_Namespace::AbstractFragmenter > fragmenter
int get_precision() const
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
void set_output_srid(int s)
#define DEFAULT_PAGE_SIZE
void set_comp_param(int p)
void validate_and_set_compressed_encoding(ColumnDescriptor &cd, int encoding_size)
std::optional< std::string > default_value
bool compare_sql_type_infos(const SQLTypeInfo &lhs, const SQLTypeInfo &rhs)
HOST DEVICE EncodingType get_compression() const
virtual const std::string * get_encoding_name() const
static std::unique_ptr< GeoBase > createGeoType(const std::string &wkt_or_wkb_hex, const bool validate_with_geos_if_available)
int64_t convert_decimal_value_to_scale(const int64_t decimal_value, const SQLTypeInfo &type_info, const SQLTypeInfo &new_type_info)
std::string get_malformed_config_error_message(const std::string &config_key)
void set_dimension(int d)
#define DEFAULT_FRAGMENT_ROWS
void validate_and_set_fixed_encoding(ColumnDescriptor &cd, int encoding_size, const SqlType *column_type)
std::string table_type_enum_to_string(const TableType table_type)
Fragmenter_Namespace::FragmenterType fragType
HOST DEVICE int get_dimension() const
Encoding(std::string *encoding_name, int encoding_param)
void set_whitelisted_paths(const std::string &config_key, const std::string &config_value, std::vector< std::string > &whitelisted_paths)
virtual void set_param1(int param)
HOST DEVICE int get_comp_param() const
static bool isBlacklistedPath(const std::string &path)
void validate_table_type(const TableDescriptor *td, const TableType expected_table_type, const std::string &command)
void validate_and_set_default_value(ColumnDescriptor &cd, const std::string *default_value, bool not_null)
bool is_high_precision_timestamp() const
void validate_and_set_date_encoding(ColumnDescriptor &cd, int encoding_size)
std::vector< std::string > local_glob_filter_sort_files(const std::string &file_path, const FilePathOptions &options, const bool recurse)
static void addToBlacklist(const std::string &path)
std::unique_ptr< std::string > encoding_name
virtual void set_is_array(bool a)
PrintContainer< CONTAINER > printContainer(CONTAINER &container)
virtual int get_param1() const
HOST DEVICE bool get_notnull() const
static constexpr char const * FOREIGN_TABLE
HOST DEVICE size_t get_max_strlen() const
bool is_string_array() const
void validate(T value) const
SQLTypeInfo get_elem_type() const
virtual int get_param2() const
virtual bool get_is_array() const
HOST DEVICE int get_output_srid() const
virtual void set_array_size(int s)
constexpr auto is_datetime(SQLTypes type)
static std::vector< std::string > blacklisted_paths_
virtual int get_array_size() const
void validate_and_set_type(ColumnDescriptor &cd, SqlType *column_type)
HOST DEVICE void set_type(SQLTypes t)