OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ParquetArrayDetectEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <parquet/types.h>
20 
21 #include "ParquetArrayEncoder.h"
23 #include "Shared/StringTransform.h"
25 
26 namespace foreign_storage {
28  public:
30  std::shared_ptr<ParquetScalarEncoder> scalar_encoder,
31  const ColumnDescriptor* column_desciptor)
32  : ParquetArrayEncoder(data_buffer, scalar_encoder, column_desciptor)
33  , detect_buffer_(dynamic_cast<TypedParquetDetectBuffer*>(data_buffer))
35  dynamic_cast<ParquetDetectStringEncoder*>(scalar_encoder_.get())) {
37  }
38 
// Appends the string form of the array element at `encoded_index` to
// array_string_.
// NOTE(review): this listing jumps from source line 47 to 49, so one statement
// of the body is not visible here; presumably it is the call to
// updateMetadataForAppendedArrayItem(encoded_index) -- confirm against the
// real header before relying on this description.
39  void appendArrayItem(const int64_t encoded_index) override {
40  if (!is_string_array_) {
// Non-string element: have the scalar encoder render the encoded value as text.
41  auto string_value =
42  scalar_encoder_->encodedDataToString(encodedDataAtIndex(encoded_index));
43  array_string_.emplace_back(string_value);
44  } else {
// String element: the text was staged in string_buffer_ by encodeAllValues;
// CHECK_GT guards the index before the element is copied.
45  CHECK_GT(string_buffer_.size(), static_cast<size_t>(encoded_index));
46  array_string_.emplace_back(string_buffer_[encoded_index]);
47  }
49  }
50 
51  size_t getArraysCount() const { return detect_buffer_->getStrings().size(); }
52 
53  protected:
54  void encodeAllValues(const int8_t* values, const int64_t values_read) override {
55  if (!is_string_array_) {
56  ParquetArrayEncoder::encodeAllValues(values, values_read);
57  } else { // string arrays are a special case that require special handling
58  string_buffer_.clear();
59  auto parquet_data_ptr = reinterpret_cast<const parquet::ByteArray*>(values);
60  for (int64_t i = 0; i < values_read; ++i) {
61  auto& byte_array = parquet_data_ptr[i];
62  auto string_value =
63  std::string{reinterpret_cast<const char*>(byte_array.ptr), byte_array.len};
64  string_buffer_.push_back(string_value);
65  }
66  }
67  }
68 
69  void appendArraysToBuffer() override {
70  // no-op as data is already written to buffer in `processLastArray`
71  }
72 
73  void processLastArray() override {
76  }
77 
78  private:
80  if (isLastArrayNull()) {
81  detect_buffer_->appendValue("NULL");
82  } else if (isLastArrayEmpty()) {
84  } else {
85  detect_buffer_->appendValue("{" + join(array_string_, ",") + "}");
86  array_string_.clear();
87  }
88  }
89 
91  const bool is_string_array_;
92  std::vector<std::string> array_string_;
93  std::vector<std::string> string_buffer_;
94 };
95 } // namespace foreign_storage
std::string join(T const &container, std::string const &delim)
void appendArrayItem(const int64_t encoded_index) override
virtual void encodeAllValues(const int8_t *values, const int64_t values_read)
#define CHECK_GT(x, y)
Definition: Logger.h:305
void encodeAllValues(const int8_t *values, const int64_t values_read) override
int8_t * encodedDataAtIndex(const size_t index)
An AbstractBuffer is a unit of data management for a data manager.
specifies the content in-memory of a row in the column metadata table
const std::vector< std::string > & getStrings()
void updateMetadataForAppendedArrayItem(const int64_t encoded_index)
ParquetArrayDetectEncoder(Data_Namespace::AbstractBuffer *data_buffer, std::shared_ptr< ParquetScalarEncoder > scalar_encoder, const ColumnDescriptor *column_desciptor)
#define CHECK(condition)
Definition: Logger.h:291
std::shared_ptr< ParquetScalarEncoder > scalar_encoder_