OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetFixedLengthEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetInPlaceEncoder.h"
20 
21 namespace foreign_storage {
22 
23 // ParquetFixedLengthEncoder is used in two separate use cases: metadata
24 // scanning & chunk loading. During metadata scan the type of metadata (& in
25 // some cases data) must be known, while during chunk loading only the type of
26 // data needs to be known.
27 //
28 // The following semantics apply to the templated types below.
29 //
30 // At metadata scan:
31 // V - type of metadata (for loading metadata)
32 // T - physical type of parquet data
33 //
34 // At chunk load:
35 // V - type of data (to load data)
36 // T - physical type of parquet data
37 // NullType - the type to use for encoding nulls
38 template <typename V, typename T, typename NullType = V>
41  public:
43  const ColumnDescriptor* column_desciptor,
44  const parquet::ColumnDescriptor* parquet_column_descriptor)
45  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
46  column_desciptor,
47  parquet_column_descriptor) {}
48 
50  const size_t omnisci_data_type_byte_size,
51  const size_t parquet_data_type_byte_size)
52  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
53  omnisci_data_type_byte_size,
54  parquet_data_type_byte_size) {}
55 
// Copies a single value from the parquet buffer into the destination buffer.
//   parquet_data_bytes - raw source bytes; read here as one value of type T.
//   omnisci_data_bytes - raw destination bytes; written here as one value of type V.
56  void encodeAndCopy(const int8_t* parquet_data_bytes,
57  int8_t* omnisci_data_bytes) override {
58  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
59  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
// Plain assignment: any T -> V conversion is the implicit one, no explicit cast.
60  omnisci_data_value = parquet_data_value;
61  }
62 
// Validates parquet column statistics against the target column type.
// Forwards both arguments unchanged to validateIntegralOrFloatingPointMetadata.
63  void validate(std::shared_ptr<parquet::Statistics> stats,
64  const SQLTypeInfo& column_type) const override {
65  validateIntegralOrFloatingPointMetadata(stats, column_type);
66  }
67 
// Validates one parquet value against the target column type.
//   parquet_data - raw buffer, indexed here as an array of T.
//   j            - index (in units of T) of the element to validate.
//   column_type  - type passed through to the value validator.
68  void validate(const int8_t* parquet_data,
69  const int64_t j,
70  const SQLTypeInfo& column_type) const override {
71  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data)[j];
// Which validateIntegralOrFloatingPointValue overload runs is selected at
// compile time by the enable_if constraints on T.
72  validateIntegralOrFloatingPointValue(parquet_data_value, column_type);
73  }
74 
// Always returns true for this encoder.
// NOTE(review): presumably this tells the caller that when V and T are the same
// type the encode step is a plain copy; confirm against the base-class contract.
75  bool encodingIsIdentityForSameTypes() const override { return true; }
76 
77  private:
78  template <
79  typename TT = T,
80  std::enable_if_t<(!std::is_integral<TT>::value || std::is_same<TT, bool>::value) &&
81  !std::is_floating_point<TT>::value,
82  int> = 0>
84  const SQLTypeInfo& column_type) const {
85  // do nothing when type `T` is non-integral and non-floating-point (the
86  // case in which this can happen is when `T` is bool)
87  }
88 
89  template <typename TT = T, std::enable_if_t<std::is_floating_point<TT>::value, int> = 0>
91  const SQLTypeInfo& column_type) const {
92  if (column_type.is_fp()) {
93  FloatPointValidator<T>::validateValue(value, column_type);
94  } else {
95  UNREACHABLE();
96  }
97  }
98 
99  template <
100  typename TT = T,
101  std::enable_if_t<std::is_integral<TT>::value && !std::is_same<TT, bool>::value,
102  int> = 0>
104  const SQLTypeInfo& column_type) const {
105  if (column_type.is_integer()) {
107  } else if (column_type.is_timestamp()) {
108  TimestampBoundsValidator<T>::validateValue(value, column_type);
109  } else if (column_type.is_date()) {
111  }
112  }
113 
114  void validateIntegralOrFloatingPointMetadata(std::shared_ptr<parquet::Statistics> stats,
115  const SQLTypeInfo& column_type) const {
116  if (!column_type.is_integer() && !column_type.is_timestamp() &&
117  !column_type.is_fp()) {
118  return;
119  }
120  auto [unencoded_stats_min, unencoded_stats_max] =
122  validateIntegralOrFloatingPointValue(unencoded_stats_min, column_type);
123  validateIntegralOrFloatingPointValue(unencoded_stats_max, column_type);
124  }
125 };
126 
127 // ParquetUnsignedFixedLengthEncoder is used in two separate use cases:
128 // metadata scanning & chunk loading. During metadata scan the type of
129 // metadata (& in some cases data) must be known, while during chunk loading
130 // only the type of data needs to be known.
131 //
132 // The following semantics apply to the templated types below.
133 //
134 // At metadata scan:
135 // V - type of metadata (for loading metadata)
136 // T - physical type of parquet data
137 // U - unsigned type that the parquet data represents
138 //
139 // At chunk load:
140 // V - type of data (to load data)
141 // T - physical type of parquet data
142 // U - unsigned type that the parquet data represents
143 // NullType - the type to use for encoding nulls
144 template <typename V, typename T, typename U, typename NullType = V>
146  : public TypedParquetInPlaceEncoder<V, T, NullType>,
147  public ParquetMetadataValidator {
148  public:
151  const ColumnDescriptor* column_desciptor,
152  const parquet::ColumnDescriptor* parquet_column_descriptor)
153  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
154  column_desciptor,
155  parquet_column_descriptor) {}
156 
158  const size_t omnisci_data_type_byte_size,
159  const size_t parquet_data_type_byte_size)
160  : TypedParquetInPlaceEncoder<V, T, NullType>(buffer,
161  omnisci_data_type_byte_size,
162  parquet_data_type_byte_size) {}
163 
// Copies a single value from the parquet buffer into the destination buffer,
// converting it through the unsigned type U on the way.
//   parquet_data_bytes - raw source bytes; read here as one value of type T.
//   omnisci_data_bytes - raw destination bytes; written here as one value of type V.
164  void encodeAndCopy(const int8_t* parquet_data_bytes,
165  int8_t* omnisci_data_bytes) override {
166  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data_bytes)[0];
167  auto& omnisci_data_value = reinterpret_cast<V*>(omnisci_data_bytes)[0];
// Cast to U (the unsigned type the parquet data represents) first, so the
// implicit conversion to V starts from the unsigned value rather than from T.
168  omnisci_data_value = static_cast<U>(parquet_data_value);
169  }
170 
171  void validate(std::shared_ptr<parquet::Statistics> stats,
172  const SQLTypeInfo& column_type) const override {
173  if (!column_type.is_integer()) { // do not validate non-integral types
174  return;
175  }
176  auto [unencoded_stats_min, unencoded_stats_max] =
179  column_type);
181  column_type);
182  }
183 
// Validates one parquet value against the target column type.
//   parquet_data - raw buffer, indexed here as an array of T.
//   j            - index (in units of T) of the element to validate.
//   column_type  - type passed through to the bounds validator.
184  void validate(const int8_t* parquet_data,
185  const int64_t j,
186  const SQLTypeInfo& column_type) const override {
187  const auto& parquet_data_value = reinterpret_cast<const T*>(parquet_data)[j];
// The validator is instantiated on U, but the value is handed over as read
// (type T) with no explicit cast at this call site.
188  IntegralFixedLengthBoundsValidator<U>::validateValue(parquet_data_value, column_type);
189  }
190 };
191 
192 } // namespace foreign_storage
void validateIntegralOrFloatingPointValue(const T &value, const SQLTypeInfo &column_type) const
bool is_timestamp() const
Definition: sqltypes.h:1046
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
bool is_fp() const
Definition: sqltypes.h:573
#define UNREACHABLE()
Definition: Logger.h:338
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
void validateIntegralOrFloatingPointMetadata(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const
dictionary stats
Definition: report.py:116
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
bool is_integer() const
Definition: sqltypes.h:567
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
An AbstractBuffer is a unit of data management for a data manager.
Specifies the in-memory content of a row in the column metadata table.
std::pair< T, T > getUnencodedStats(std::shared_ptr< parquet::Statistics > stats) const
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
ParquetFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
void validate(std::shared_ptr< parquet::Statistics > stats, const SQLTypeInfo &column_type) const override
void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)
ParquetUnsignedFixedLengthEncoder(Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
bool is_date() const
Definition: sqltypes.h:1028
static void validateValue(const D &data_value, const SQLTypeInfo &column_type)