OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetStringImportEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "ParquetEncoder.h"
21 
22 #include <parquet/schema.h>
23 #include <parquet/types.h>
24 
25 namespace foreign_storage {
26 
28  public:
30  : ParquetEncoder(buffer)
31  , string_buffer_(dynamic_cast<TypedParquetStorageBuffer<std::string>*>(buffer)) {
32  CHECK(string_buffer_); // verify dynamic_cast succeeded
33  }
34 
35  void appendData(const int16_t* def_levels,
36  const int16_t* rep_levels,
37  const int64_t values_read,
38  const int64_t levels_read,
39  int8_t* values) override {
40  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
41  string_buffer_->reserveNumElements(levels_read);
42  for (int64_t i = 0, j = 0; i < levels_read; ++i) {
43  if (def_levels[i]) {
44  CHECK(j < values_read);
45  auto& byte_array = parquet_data_ptr[j++];
47  std::string{reinterpret_cast<const char*>(byte_array.ptr), byte_array.len});
48  } else {
49  string_buffer_->appendElement(""); // empty strings encode nulls
50  }
51  }
52  }
53 
54  void appendDataTrackErrors(const int16_t* def_levels,
55  const int16_t* rep_levels,
56  const int64_t values_read,
57  const int64_t levels_read,
58  int8_t* values) override {
59  UNREACHABLE() << "unexpected call to appendDataTrackErrors from unsupported encoder";
60  }
61 
62  void validateAndAppendData(const int16_t* def_levels,
63  const int16_t* rep_levels,
64  const int64_t values_read,
65  const int64_t levels_read,
66  int8_t* values,
67  const SQLTypeInfo& column_type, /* may not be used */
68  InvalidRowGroupIndices& invalid_indices) override {
69  appendData(def_levels, rep_levels, values_read, levels_read, values);
70  }
71 
73  const InvalidRowGroupIndices& invalid_indices) override {
74  if (invalid_indices.empty()) {
75  return;
76  }
77  string_buffer_->eraseInvalidData(invalid_indices);
78  }
79 
80  private:
82 };
83 
84 } // namespace foreign_storage
void eraseInvalidData(const FindContainer &invalid_indices)
#define UNREACHABLE()
Definition: Logger.h:338
std::set< int64_t > InvalidRowGroupIndices
TypedParquetStorageBuffer< std::string > * string_buffer_
An AbstractBuffer is a unit of data management for a data manager.
void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
void appendDataTrackErrors(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values) override
void validateAndAppendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices) override
#define CHECK(condition)
Definition: Logger.h:291
void reserveNumElements(size_t additional_num_elements)
void eraseInvalidIndicesInBuffer(const InvalidRowGroupIndices &invalid_indices) override
ParquetStringImportEncoder(Data_Namespace::AbstractBuffer *buffer)