OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
foreign_storage::ParquetStringEncoder< V > Class Template Reference

#include <ParquetStringEncoder.h>

+ Inheritance diagram for foreign_storage::ParquetStringEncoder< V >:
+ Collaboration diagram for foreign_storage::ParquetStringEncoder< V >:

Public Member Functions

 ParquetStringEncoder (Data_Namespace::AbstractBuffer *buffer, StringDictionary *string_dictionary, ChunkMetadata *chunk_metadata)
 
void validateAndAppendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices) override
 
void appendDataTrackErrors (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void encodeAndCopy (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes) override
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::TypedParquetInPlaceEncoder< V, V >
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const ColumnDescriptor *column_desciptor, const parquet::ColumnDescriptor *parquet_column_descriptor)
 
 TypedParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
void validate (const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const override
 
std::string integralTypeToString (const V &element) const
 
bool isIntegralType (const SQLTypeInfo &type) const
 
std::string elementToString (const V &element) const
 
std::string encodedDataToString (const int8_t *bytes) const override
 
void setDetectBufferConverterType ()
 
void validateUsingEncodersColumnType (const int8_t *parquet_data, const int64_t j) const override
 
void reserve (const size_t num_append_elements) override
 
void appendDataTrackErrors (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void validateAndAppendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices) override
 
void eraseInvalidIndicesInBuffer (const InvalidRowGroupIndices &invalid_indices) override
 
void appendData (const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
 
void encodeAndCopyContiguous (const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override
 
void setNull (int8_t *omnisci_data_bytes) override
 
void copy (const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override
 
std::shared_ptr< ChunkMetadata > getRowGroupMetadata (const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type) override
 
- Public Member Functions inherited from foreign_storage::ParquetInPlaceEncoder
 ParquetInPlaceEncoder (Data_Namespace::AbstractBuffer *buffer, const size_t omnisci_data_type_byte_size, const size_t parquet_data_type_byte_size)
 
- Public Member Functions inherited from foreign_storage::ParquetScalarEncoder
 ParquetScalarEncoder (Data_Namespace::AbstractBuffer *buffer)
 
- Public Member Functions inherited from foreign_storage::ParquetEncoder
 ParquetEncoder (Data_Namespace::AbstractBuffer *buffer)
 
virtual ~ParquetEncoder ()=default
 
RejectedRowIndices getRejectedRowIndices () const
 
virtual void disableMetadataStatsValidation ()
 
virtual void initializeErrorTracking ()
 
virtual void initializeColumnType (const SQLTypeInfo &column_type)
 

Protected Member Functions

bool encodingIsIdentityForSameTypes () const override
 
- Protected Member Functions inherited from foreign_storage::TypedParquetInPlaceEncoder< V, V >
std::pair< V, V > getUnencodedStats (std::shared_ptr< parquet::Statistics > stats) const
 

Private Member Functions

void updateMetadataStats (int64_t values_read, int8_t *values)
 

Private Attributes

StringDictionary * string_dictionary_
 
ChunkMetadata * chunk_metadata_
 
std::vector< int8_t > encode_buffer_
 
V min_
 
V max_
 
int64_t current_batch_offset_
 
InvalidRowGroupIndices * invalid_indices_
 

Additional Inherited Members

- Static Protected Member Functions inherited from foreign_storage::ParquetEncoder
static std::shared_ptr< ChunkMetadata > createMetadata (const SQLTypeInfo &column_type)
 
static void throwNotNullViolation (const std::string &parquet_column_name)
 
static void validateNullCount (const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
 
- Protected Attributes inherited from foreign_storage::ParquetInPlaceEncoder
const size_t omnisci_data_type_byte_size_
 
const size_t parquet_data_type_byte_size_
 
- Protected Attributes inherited from foreign_storage::ParquetEncoder
Data_Namespace::AbstractBuffer * buffer_
 
bool is_error_tracking_enabled_
 
RejectedRowIndices invalid_indices_
 
size_t current_chunk_offset_
 
SQLTypeInfo column_type_
 
bool validate_metadata_stats_
 

Detailed Description

template<typename V>
class foreign_storage::ParquetStringEncoder< V >

Definition at line 29 of file ParquetStringEncoder.h.

Constructor & Destructor Documentation

template<typename V >
foreign_storage::ParquetStringEncoder< V >::ParquetStringEncoder ( Data_Namespace::AbstractBuffer * buffer,
StringDictionary * string_dictionary,
ChunkMetadata * chunk_metadata
)
inline

Definition at line 31 of file ParquetStringEncoder.h.

References foreign_storage::ParquetStringEncoder< V >::chunk_metadata_, ChunkMetadata::chunkStats, and ChunkStats::has_nulls.

34  : TypedParquetInPlaceEncoder<V, V>(buffer, sizeof(V), sizeof(V))
35  , string_dictionary_(string_dictionary)
36  , chunk_metadata_(chunk_metadata)
38  , min_(std::numeric_limits<V>::max())
39  , max_(std::numeric_limits<V>::lowest())
41  , invalid_indices_(nullptr) {
42  if (chunk_metadata_) {
44  }
45  }
bool has_nulls
Definition: ChunkMetadata.h:30
ChunkStats chunkStats
Definition: ChunkMetadata.h:37

Member Function Documentation

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::appendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Appends Parquet data to the buffer using an in-place algorithm. Any necessary transformation or validation of the data and decoding of nulls is part of appending the data. Each class inheriting from this abstract class must implement the functionality to copy, nullify and encode the data.

Parameters
def_levels- an array containing the Dremel encoding definition levels
rep_levels- an array containing the Dremel encoding repetition levels
values_read- the number of non-null values read
levels_read- the total number of values (non-null & null) that are read
values- values that are read

Note that the Parquet format encodes nulls using Dremel encoding.

Reimplemented from foreign_storage::ParquetInPlaceEncoder.

Definition at line 96 of file ParquetStringEncoder.h.

References foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::appendData(), foreign_storage::ParquetStringEncoder< V >::chunk_metadata_, ChunkMetadata::chunkStats, foreign_storage::ParquetStringEncoder< V >::encode_buffer_, foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous(), and ChunkStats::has_nulls.

Referenced by foreign_storage::ParquetStringEncoder< V >::appendDataTrackErrors(), and foreign_storage::ParquetStringEncoder< V >::validateAndAppendData().

100  {
101  encodeAndCopyContiguous(values, encode_buffer_.data(), values_read);
103  def_levels, rep_levels, values_read, levels_read, encode_buffer_.data());
105  chunk_metadata_->chunkStats.has_nulls || (values_read < levels_read);
106  }
bool has_nulls
Definition: ChunkMetadata.h:30
ChunkStats chunkStats
Definition: ChunkMetadata.h:37
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
void encodeAndCopyContiguous(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements) override

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::appendDataTrackErrors ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values 
)
inline override virtual

Implements foreign_storage::ParquetEncoder.

Definition at line 71 of file ParquetStringEncoder.h.

References foreign_storage::ParquetStringEncoder< V >::appendData(), CHECK, CHECK_LT, foreign_storage::ParquetEncoder::column_type_, foreign_storage::ParquetEncoder::current_chunk_offset_, foreign_storage::ParquetEncoder::invalid_indices_, foreign_storage::ParquetEncoder::is_error_tracking_enabled_, and StringDictionary::MAX_STRLEN.

75  {
77  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
78  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
79  if (def_levels[i]) {
80  CHECK_LT(j, values_read);
81  auto& byte_array = parquet_data_ptr[j++];
82  if (byte_array.len > StringDictionary::MAX_STRLEN) {
84  i);
85  }
87  .get_notnull()) { // item is null for NOT NULL column
89  i);
90  }
91  }
93  appendData(def_levels, rep_levels, values_read, levels_read, values);
94  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
RejectedRowIndices invalid_indices_
#define CHECK_LT(x, y)
Definition: Logger.h:303
#define CHECK(condition)
Definition: Logger.h:291
static constexpr size_t MAX_STRLEN

+ Here is the call graph for this function:

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::encodeAndCopy ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 130 of file ParquetStringEncoder.h.

References foreign_storage::TypedParquetInPlaceEncoder< V, T, NullType >::copy().

131  {
132  TypedParquetInPlaceEncoder<V, V>::copy(parquet_data_bytes, omnisci_data_bytes);
133  }
void copy(const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination) override

+ Here is the call graph for this function:

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous ( const int8_t *  parquet_data_bytes,
int8_t *  omnisci_data_bytes,
const size_t  num_elements 
)
inline override virtual

Implements foreign_storage::ParquetScalarEncoder.

Definition at line 108 of file ParquetStringEncoder.h.

References CHECK, StringDictionary::getOrAddBulk(), StringDictionary::MAX_STRLEN, foreign_storage::ParquetStringEncoder< V >::string_dictionary_, and foreign_storage::ParquetStringEncoder< V >::updateMetadataStats().

Referenced by foreign_storage::ParquetStringEncoder< V >::appendData().

110  {
112  auto parquet_data_ptr =
113  reinterpret_cast<const parquet::ByteArray*>(parquet_data_bytes);
114  auto omnisci_data_ptr = reinterpret_cast<V*>(omnisci_data_bytes);
115  std::vector<std::string_view> string_views;
116  string_views.reserve(num_elements);
117  for (size_t i = 0; i < num_elements; ++i) {
118  auto& byte_array = parquet_data_ptr[i];
119  if (byte_array.len <= StringDictionary::MAX_STRLEN) {
120  string_views.emplace_back(reinterpret_cast<const char*>(byte_array.ptr),
121  byte_array.len);
122  } else {
123  string_views.emplace_back(nullptr, 0);
124  }
125  }
126  string_dictionary_->getOrAddBulk(string_views, omnisci_data_ptr);
127  updateMetadataStats(num_elements, omnisci_data_bytes);
128  }
void updateMetadataStats(int64_t values_read, int8_t *values)
void getOrAddBulk(const std::vector< String > &string_vec, T *encoded_vec)
#define CHECK(condition)
Definition: Logger.h:291
static constexpr size_t MAX_STRLEN

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V >
bool foreign_storage::ParquetStringEncoder< V >::encodingIsIdentityForSameTypes ( ) const
inline override protected virtual

Reimplemented from foreign_storage::TypedParquetInPlaceEncoder< V, V >.

Definition at line 151 of file ParquetStringEncoder.h.

151 { return true; }
template<typename V >
std::shared_ptr<ChunkMetadata> foreign_storage::ParquetStringEncoder< V >::getRowGroupMetadata ( const parquet::RowGroupMetaData *  group_metadata,
const int  parquet_column_index,
const SQLTypeInfo & column_type
)
inline override virtual

Reimplemented from foreign_storage::ParquetEncoder.

Definition at line 135 of file ParquetStringEncoder.h.

References foreign_storage::ParquetEncoder::getRowGroupMetadata(), and foreign_storage::ParquetInPlaceEncoder::omnisci_data_type_byte_size_.

138  {
139  auto metadata = ParquetEncoder::getRowGroupMetadata(
140  group_metadata, parquet_column_index, column_type);
141  auto column_metadata = group_metadata->ColumnChunk(parquet_column_index);
143  column_metadata->num_values();
144 
145  // Placeholder metadata is defined with has_nulls = false.
146  metadata->chunkStats.has_nulls = false;
147  return metadata;
148  }
virtual std::shared_ptr< ChunkMetadata > getRowGroupMetadata(const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type)

+ Here is the call graph for this function:

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::updateMetadataStats ( int64_t  values_read,
int8_t *  values 
)
inline private

Definition at line 154 of file ParquetStringEncoder.h.

References foreign_storage::ParquetStringEncoder< V >::chunk_metadata_, ChunkMetadata::chunkStats, ChunkMetadata::fillChunkStats(), ChunkStats::has_nulls, foreign_storage::ParquetStringEncoder< V >::max_, and foreign_storage::ParquetStringEncoder< V >::min_.

Referenced by foreign_storage::ParquetStringEncoder< V >::encodeAndCopyContiguous().

154  {
155  if (!chunk_metadata_) {
156  return;
157  }
158  V* data_ptr = reinterpret_cast<V*>(values);
159  for (int64_t i = 0; i < values_read; ++i) {
160  min_ = std::min<V>(data_ptr[i], min_);
161  max_ = std::max<V>(data_ptr[i], max_);
162  }
164  }
void fillChunkStats(const T min, const T max, const bool has_nulls)
Definition: ChunkMetadata.h:51
bool has_nulls
Definition: ChunkMetadata.h:30
ChunkStats chunkStats
Definition: ChunkMetadata.h:37

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename V >
void foreign_storage::ParquetStringEncoder< V >::validateAndAppendData ( const int16_t *  def_levels,
const int16_t *  rep_levels,
const int64_t  values_read,
const int64_t  levels_read,
int8_t *  values,
const SQLTypeInfo & column_type,
InvalidRowGroupIndices & invalid_indices
)
inline override virtual

Implements foreign_storage::ParquetImportEncoder.

Definition at line 50 of file ParquetStringEncoder.h.

References foreign_storage::ParquetStringEncoder< V >::appendData(), CHECK_LT, foreign_storage::ParquetStringEncoder< V >::current_batch_offset_, and StringDictionary::MAX_STRLEN.

56  {
57  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
58  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
59  if (def_levels[i]) {
60  CHECK_LT(j, values_read);
61  auto& byte_array = parquet_data_ptr[j++];
62  if (byte_array.len > StringDictionary::MAX_STRLEN) {
63  invalid_indices.insert(current_batch_offset_ + i);
64  }
65  }
66  }
67  current_batch_offset_ += levels_read;
68  appendData(def_levels, rep_levels, values_read, levels_read, values);
69  }
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
#define CHECK_LT(x, y)
Definition: Logger.h:303
static constexpr size_t MAX_STRLEN

+ Here is the call graph for this function:

Member Data Documentation

template<typename V >
int64_t foreign_storage::ParquetStringEncoder< V >::current_batch_offset_
private
template<typename V >
std::vector<int8_t> foreign_storage::ParquetStringEncoder< V >::encode_buffer_
private
template<typename V >
InvalidRowGroupIndices* foreign_storage::ParquetStringEncoder< V >::invalid_indices_
private

Definition at line 173 of file ParquetStringEncoder.h.

template<typename V >
V foreign_storage::ParquetStringEncoder< V >::max_
private
template<typename V >
V foreign_storage::ParquetStringEncoder< V >::min_
private
template<typename V >
StringDictionary* foreign_storage::ParquetStringEncoder< V >::string_dictionary_
private

The documentation for this class was generated from the following file: