OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetMetadataValidator.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetEncoder.h"
21 
22 namespace foreign_storage {
24  public:
25  virtual ~ParquetMetadataValidator() = default;
26 
27  virtual void validate(std::shared_ptr<parquet::Statistics> stats,
28  const SQLTypeInfo& column_type) const = 0;
29 };
30 
31 template <typename D, typename T>
32 inline bool check_bounds(const T& value) {
33  auto [min_value, max_value] = get_min_max_bounds<D>();
34  return value >= min_value && value <= max_value;
35 }
36 
37 template <typename D>
38 inline std::string datetime_to_string(const D& timestamp,
39  const SQLTypeInfo& column_type) {
40  CHECK(column_type.is_timestamp() || column_type.is_date());
41  Datum d;
42  d.bigintval = timestamp;
43  return DatumToString(d, column_type);
44 }
45 
/**
 * @brief Throws a std::runtime_error describing a Parquet value that does not
 * fit the destination column type.
 *
 * @param min_value textual form of the smallest allowed value
 * @param max_value textual form of the largest allowed value
 * @param encountered_value textual form of the offending value
 *
 * @throws std::runtime_error always; this function never returns normally
 */
inline void throw_parquet_metadata_out_of_bounds_error(
    const std::string& min_value,
    const std::string& max_value,
    const std::string& encountered_value) {
  std::stringstream error_message;
  error_message << "Parquet column contains values that are outside the range of the "
                   "HeavyDB column "
                   "type. Consider using a wider column type. Min allowed value: "
                << min_value << ". Max allowed value: " << max_value
                << ". Encountered value: " << encountered_value << ".";
  throw std::runtime_error(error_message.str());
}
58 
59 template <typename T>
61  static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
62  "TimestampBoundsValidator is only defined for signed integral types.");
63 
64  public:
65  template <typename D>
66  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
67  if (!valueWithinBounds(data_value, column_type)) {
68  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
70  min_allowed_value,
71  max_allowed_value,
72  datetime_to_string(data_value, column_type));
73  }
74  }
75 
76  private:
77  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
78  CHECK(column_type.is_timestamp());
79  switch (column_type.get_size()) {
80  case 4:
81  return check_bounds<int32_t>(value);
82  case 8:
83  return check_bounds<int64_t>(value);
84  default:
85  UNREACHABLE();
86  }
87  return {};
88  }
89 
90  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
91  const SQLTypeInfo& column_type) {
92  CHECK(column_type.is_timestamp());
93  switch (column_type.get_size()) {
94  case 4:
95  return getMinMaxBoundsAsStrings<int32_t>(column_type);
96  case 8:
97  return getMinMaxBoundsAsStrings<int64_t>(column_type);
98  default:
99  UNREACHABLE();
100  }
101  return {};
102  }
103 
104  template <typename D>
105  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
106  const SQLTypeInfo& column_type) {
107  auto [min_value, max_value] = get_min_max_bounds<D>();
108  return {datetime_to_string(min_value, column_type),
109  datetime_to_string(max_value, column_type)};
110  }
111 };
112 
113 template <typename T>
115  static_assert(std::is_integral<T>::value,
116  "IntegralFixedLengthBoundsValidator is only defined for integral types.");
117 
118  public:
119  template <typename D>
120  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
121  if (!valueWithinBounds(data_value, column_type)) {
122  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
123  if (std::is_signed<T>::value) {
125  min_allowed_value, max_allowed_value, std::to_string(data_value));
126  } else {
128  min_allowed_value,
129  max_allowed_value,
130  std::to_string(static_cast<T>(data_value)));
131  }
132  }
133  }
134 
135  private:
136  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
137  CHECK(column_type.is_integer());
138  switch (column_type.get_size()) {
139  case 1:
140  return checkBounds<int8_t>(value);
141  case 2:
142  return checkBounds<int16_t>(value);
143  case 4:
144  return checkBounds<int32_t>(value);
145  case 8:
146  return checkBounds<int64_t>(value);
147  default:
148  UNREACHABLE();
149  }
150  return {};
151  }
152 
153  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
154  const SQLTypeInfo& column_type) {
155  CHECK(column_type.is_integer());
156  switch (column_type.get_size()) {
157  case 1:
158  return getMinMaxBoundsAsStrings<int8_t>();
159  case 2:
160  return getMinMaxBoundsAsStrings<int16_t>();
161  case 4:
162  return getMinMaxBoundsAsStrings<int32_t>();
163  case 8:
164  return getMinMaxBoundsAsStrings<int64_t>();
165  default:
166  UNREACHABLE();
167  }
168  return {};
169  }
170 
178  template <typename D,
179  typename TT = T,
180  std::enable_if_t<std::is_signed<TT>::value, int> = 0>
181  static bool checkBounds(const T& value) {
182  return check_bounds<D>(value);
183  }
184 
192  template <typename D,
193  typename TT = T,
194  std::enable_if_t<!std::is_signed<TT>::value, int> = 0>
195  static bool checkBounds(const T& value) {
196  auto [min_value, max_value] = get_min_max_bounds<D>();
197  auto signed_value = static_cast<D>(value);
198  return signed_value >= 0 && signed_value <= max_value;
199  }
200 
201  template <typename D>
202  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
203  auto [min_value, max_value] = get_min_max_bounds<D>();
204  return {std::to_string(min_value), std::to_string(max_value)};
205  }
206 };
207 
208 template <typename T, bool is_in_seconds = true>
210  static_assert(
211  std::is_integral<T>::value && std::is_signed<T>::value,
212  "DateInSecondsBoundsValidator is only defined for signed integral types.");
213 
214  public:
215  template <typename D>
216  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
217  if (!valueWithinBounds(data_value, column_type)) {
218  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
220  min_allowed_value,
221  max_allowed_value,
222  datetime_to_string(data_value, column_type));
223  }
224  }
225 
226  private:
227  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
228  CHECK(column_type.is_date());
229  switch (column_type.get_size()) {
230  case 4:
231  return checkBounds<int32_t>(value);
232  case 2:
233  return checkBounds<int16_t>(value);
234  default:
235  UNREACHABLE();
236  }
237  return {};
238  }
239 
240  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
241  const SQLTypeInfo& column_type) {
242  CHECK(column_type.is_date());
243  switch (column_type.get_size()) {
244  case 4:
245  return getMinMaxBoundsAsStrings<int32_t>(column_type);
246  case 2:
247  return getMinMaxBoundsAsStrings<int16_t>(column_type);
248  default:
249  UNREACHABLE();
250  }
251  return {};
252  }
253 
254  template <typename D>
255  static bool checkBounds(const T& value) {
256  auto [min_value, max_value] = get_min_max_bounds<D>();
257  if (is_in_seconds) {
258  return value >= kSecsPerDay * min_value && value <= kSecsPerDay * max_value;
259  } else {
260  return value >= min_value && value <= max_value;
261  }
262  }
263 
264  template <typename D>
265  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
266  const SQLTypeInfo& column_type) {
267  auto [min_value, max_value] = get_min_max_bounds<D>();
268  return {datetime_to_string(kSecsPerDay * min_value, column_type),
269  datetime_to_string(kSecsPerDay * max_value, column_type)};
270  }
271 };
272 
273 template <typename T>
275 
276 template <typename T>
278 
279 template <typename T>
281  static_assert(std::is_floating_point<T>::value,
282  "FloatPointValidator is only defined for floating point types.");
283 
284  public:
285  template <typename D>
286  static void validateValue(const D& data_value, const SQLTypeInfo& column_type) {
287  if (!valueWithinBounds(data_value, column_type)) {
288  auto [min_allowed_value, max_allowed_value] = getMinMaxBoundsAsStrings(column_type);
290  min_allowed_value, max_allowed_value, std::to_string(data_value));
291  }
292  }
293 
294  private:
295  static bool valueWithinBounds(const T& value, const SQLTypeInfo& column_type) {
296  CHECK(column_type.is_fp());
297  switch (column_type.get_size()) {
298  case 4:
299  return checkBounds<float>(value);
300  case 8:
301  return checkBounds<double>(value);
302  default:
303  UNREACHABLE();
304  }
305  return {};
306  }
307 
308  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings(
309  const SQLTypeInfo& column_type) {
310  CHECK(column_type.is_fp());
311  switch (column_type.get_size()) {
312  case 4:
313  return getMinMaxBoundsAsStrings<float>();
314  case 8:
315  return getMinMaxBoundsAsStrings<double>();
316  default:
317  UNREACHABLE();
318  }
319  return {};
320  }
321 
322  template <typename D>
323  static bool checkBounds(const T& value) {
324  return check_bounds<D>(value);
325  }
326 
327  template <typename D>
328  static std::pair<std::string, std::string> getMinMaxBoundsAsStrings() {
329  auto [min_value, max_value] = get_min_max_bounds<D>();
330  return {std::to_string(min_value), std::to_string(max_value)};
331  }
332 };
333 
334 } // namespace foreign_storage
static constexpr int64_t kSecsPerDay
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
std::string DatumToString(Datum d, const SQLTypeInfo &ti)
Definition: Datum.cpp:460
bool is_timestamp() const
Definition: sqltypes.h:1046
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_fp() const
Definition: sqltypes.h:573
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
#define UNREACHABLE()
Definition: Logger.h:338
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
virtual void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const =0
dictionary stats
Definition: report.py:116
void throw_parquet_metadata_out_of_bounds_error(const std::string &min_value, const std::string &max_value, const std::string &encountered_value)
std::string to_string(char const *&&v)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings()
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
bool is_integer() const
Definition: sqltypes.h:567
bool check_bounds(const T &value)
int64_t bigintval
Definition: Datum.h:76
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
#define CHECK(condition)
Definition: Logger.h:291
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
std::string datetime_to_string(const D &timestamp, const SQLTypeInfo &column_type)
static bool valueWithinBounds(const T &value, const SQLTypeInfo &column_type)
Definition: Datum.h:71
static std::pair< std::string, std::string > getMinMaxBoundsAsStrings(const SQLTypeInfo &column_type)
bool is_date() const
Definition: sqltypes.h:1028
static bool checkBounds(const T &value)
Check bounds for value in signed case.
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)