OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MetadataColumn.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * @file MetadataColumn.cpp
19  * @brief Metadata Column info struct and parser
20  *
21  */
22 
24 
25 #include <boost/algorithm/string.hpp>
26 #include <boost/filesystem.hpp>
27 
29 #include "Shared/DateTimeParser.h"
30 #include "Shared/StringTransform.h"
32 
33 namespace import_export {
34 
35 MetadataColumnInfos parse_add_metadata_columns(const std::string& add_metadata_columns,
36  const std::string& file_path) {
37  //
38  // each string is "column_name,column_type,expression"
39  //
40  // column_type can be:
41  // tinyint
42  // smallint
43  // int
44  // bigint
45  // float
46  // double
47  // date
48  // time
49  // timestamp
50  // text
51  //
52  // expression can be in terms of:
53  // filename
54  // filedir
55  // filepath
56  // etc.
57  //
58 
59  // anything to do?
60  if (add_metadata_columns.length() == 0u) {
61  return {};
62  }
63 
64  // split by ";"
65  // @TODO(se) is this safe?
66  // probably won't appear in a file name/path or a date/time string
67  std::vector<std::string> add_metadata_column_strings;
68  boost::split(add_metadata_column_strings, add_metadata_columns, boost::is_any_of(";"));
69  if (add_metadata_column_strings.size() == 0u) {
70  return {};
71  }
72 
74 
75  // known string constants
76  auto const fn = boost::filesystem::path(file_path).filename().string();
77  auto const fd = boost::filesystem::path(file_path).parent_path().string();
78  auto const fp = file_path;
79  parser.setStringConstant("filename", fn);
80  parser.setStringConstant("filedir", fd);
81  parser.setStringConstant("filepath", fp);
82 
83  MetadataColumnInfos metadata_column_infos;
84 
85  // for each requested column...
86  for (auto const& add_metadata_column_string : add_metadata_column_strings) {
87  // strip
88  auto const add_metadata_column = strip(add_metadata_column_string);
89 
90  // tokenize and extract
91  std::vector<std::string> tokens;
92  boost::split(tokens, add_metadata_column, boost::is_any_of(","));
93  if (tokens.size() < 3u) {
94  throw std::runtime_error("Invalid metadata column info '" + add_metadata_column +
95  "' (must be of the form 'name,type,expression')");
96  }
97  auto token_itr = tokens.begin();
98  auto const column_name = strip(*token_itr++);
99  auto const data_type = strip(to_lower(*token_itr++));
100  tokens.erase(tokens.begin(), token_itr);
101  auto const expression = strip(boost::join(tokens, ","));
102 
103  // get column type
104  SQLTypes sql_type{kNULLT};
105  double range_min{0.0}, range_max{0.0};
106  if (data_type == "tinyint") {
107  sql_type = kTINYINT;
108  range_min = static_cast<double>(std::numeric_limits<int8_t>::min());
109  range_max = static_cast<double>(std::numeric_limits<int8_t>::max());
110  } else if (data_type == "smallint") {
111  sql_type = kSMALLINT;
112  range_min = static_cast<double>(std::numeric_limits<int16_t>::min());
113  range_max = static_cast<double>(std::numeric_limits<int16_t>::max());
114  } else if (data_type == "int") {
115  sql_type = kINT;
116  range_min = static_cast<double>(std::numeric_limits<int32_t>::min());
117  range_max = static_cast<double>(std::numeric_limits<int32_t>::max());
118  } else if (data_type == "bigint") {
119  sql_type = kBIGINT;
120  range_min = static_cast<double>(std::numeric_limits<int64_t>::min());
121  range_max = static_cast<double>(std::numeric_limits<int64_t>::max());
122  } else if (data_type == "float") {
123  sql_type = kFLOAT;
124  range_min = static_cast<double>(std::numeric_limits<float>::min());
125  range_max = static_cast<double>(std::numeric_limits<float>::max());
126  } else if (data_type == "double") {
127  sql_type = kDOUBLE;
128  range_min = static_cast<double>(std::numeric_limits<double>::min());
129  range_max = static_cast<double>(std::numeric_limits<double>::max());
130  } else if (data_type == "date") {
131  sql_type = kDATE;
132  } else if (data_type == "time") {
133  sql_type = kTIME;
134  } else if (data_type == "timestamp") {
135  sql_type = kTIMESTAMP;
136  } else if (data_type == "text") {
137  sql_type = kTEXT;
138  } else {
139  throw std::runtime_error("Invalid metadata column data type '" + data_type +
140  "' for column '" + column_name + "'");
141  }
142 
143  // set expression with force cast back to string
144  parser.setExpression("str(" + expression + ")");
145 
146  // evaluate
147  auto value = parser.evalAsString();
148 
149  // validate date/time/timestamp value now
150  // @TODO(se) do we need to provide for non-zero dimension?
151  try {
152  if (sql_type == kDATE) {
153  dateTimeParse<kDATE>(value, 0);
154  } else if (sql_type == kTIME) {
155  dateTimeParse<kTIME>(value, 0);
156  } else if (sql_type == kTIMESTAMP) {
157  dateTimeParse<kTIMESTAMP>(value, 0);
158  }
159  } catch (std::runtime_error& e) {
160  throw std::runtime_error("Invalid metadata column " + to_string(sql_type) +
161  " value '" + value + "' for column '" + column_name + "'");
162  }
163 
164  // validate int/float/double
165  try {
166  if (IS_INTEGER(sql_type) || sql_type == kFLOAT || sql_type == kDOUBLE) {
167  size_t num_chars{0u};
168  auto const v = static_cast<double>(std::stod(value, &num_chars));
169  if (v < range_min || v > range_max) {
170  throw std::out_of_range(to_string(sql_type));
171  }
172  if (num_chars == 0u) {
173  throw std::invalid_argument("empty value");
174  }
175  }
176  } catch (std::invalid_argument& e) {
177  throw std::runtime_error("Invalid metadata column " + to_string(sql_type) +
178  " value '" + value + "' for column '" + column_name +
179  "' (" + e.what() + ")");
180  } catch (std::out_of_range& e) {
181  throw std::runtime_error("Out-of-range metadata column " + to_string(sql_type) +
182  " value '" + value + "' for column '" + column_name +
183  "' (" + e.what() + ")");
184  }
185 
186  // build column descriptor
187  ColumnDescriptor cd;
188  cd.columnName = cd.sourceName = column_name;
189  cd.columnType.set_type(sql_type);
191  if (sql_type == kTEXT) {
194  }
195 
196  // add to result
197  metadata_column_infos.push_back({std::move(cd), std::move(value)});
198  }
199 
200  // done
201  return metadata_column_infos;
202 }
203 
204 } // namespace import_export
std::string to_lower(const std::string &str)
void set_compression(EncodingType c)
Definition: sqltypes.h:481
Definition: sqltypes.h:76
SQLTypes
Definition: sqltypes.h:65
std::string strip(std::string_view str)
trim any whitespace from the left and right ends of a string
std::string join(T const &container, std::string const &delim)
std::string sourceName
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
void set_fixed_size()
Definition: sqltypes.h:479
specifies the content in-memory of a row in the column metadata table
void set_comp_param(int p)
Definition: sqltypes.h:482
Definition: sqltypes.h:79
Definition: sqltypes.h:80
MetadataColumnInfos parse_add_metadata_columns(const std::string &add_metadata_columns, const std::string &file_path)
#define IS_INTEGER(T)
Definition: sqltypes.h:304
void setStringConstant(const std::string &name, const std::string &value)
Definition: sqltypes.h:72
SQLTypeInfo columnType
std::string columnName
void setExpression(const std::string &expression)
std::vector< MetadataColumnInfo > MetadataColumnInfos
HOST DEVICE void set_type(SQLTypes t)
Definition: sqltypes.h:470