OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TextFileBufferParser.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
21 
22 #include "ImportExport/Importer.h"
23 
24 namespace foreign_storage {
25 
// Copying is disabled and move construction is compiler-generated. This is consistent
// with the request holding std::unique_ptr members (buffer, foreign_table_schema,
// import_buffers), which can be moved but not copied.
27  ParseBufferRequest(const ParseBufferRequest& request) = delete;
28  ParseBufferRequest(ParseBufferRequest&& request) = default;
// Trailing parameters of a constructor declaration (presumably the main
// ParseBufferRequest constructor - the opening of the declaration, listing lines
// 29-30, is missing from this extract, so the leading parameters are not visible).
// track_rejected_rows defaults to false.
// NOTE(review): column_filter_set is taken by const value, so the std::set is copied
// on every call. Switching to a const reference would avoid the copy but must be
// mirrored in the definition, which is not visible here.
31  int db_id,
32  const ForeignTable* foreign_table,
33  const std::set<int> column_filter_set,
34  const std::string& full_path,
35  const bool track_rejected_rows = false);
36 
// Returns a shared pointer to a catalog, after passing it through CHECK.
// NOTE(review): the statement that defines `catalog` (listing line 38) is missing
// from this extract; presumably it looks the catalog up using db_id - confirm
// against the upstream header.
37  inline std::shared_ptr<Catalog_Namespace::Catalog> getCatalog() const {
39  CHECK(catalog);
40  return catalog;
41  }
42 
// Returns, by value, the column descriptor list obtained from
// foreign_table_schema->getLogicalAndPhysicalColumns().
// foreign_table_schema is dereferenced without a null check.
43  inline std::list<const ColumnDescriptor*> getColumns() const {
44  return foreign_table_schema->getLogicalAndPhysicalColumns();
45  }
46 
// Returns the tableId field of the foreign table reached through
// foreign_table_schema->getForeignTable(). Neither pointer is null-checked here.
47  inline int32_t getTableId() const {
48  return foreign_table_schema->getForeignTable()->tableId;
49  }
50 
// Returns a copy of the tableName field of the foreign table reached through
// foreign_table_schema->getForeignTable(). Neither pointer is null-checked here.
51  inline std::string getTableName() const {
52  return foreign_table_schema->getForeignTable()->tableName;
53  }
54 
// Returns the maxFragRows field of the foreign table reached through
// foreign_table_schema->getForeignTable(), converted to size_t by the return type.
// Neither pointer is null-checked here.
55  inline size_t getMaxFragRows() const {
56  return foreign_table_schema->getForeignTable()->maxFragRows;
57  }
58 
// Returns a copy of the full_path member.
59  inline std::string getFilePath() const { return full_path; }
60 
61  // These must be initialized at construction (before parsing).
// NOTE(review): the embedded listing numbers skip 64-65, 71, 74, 76 and 84, so some
// member declarations of this struct are not visible in this extract.
62  std::unique_ptr<char[]> buffer;  // owned char array; presumably the raw text to parse - confirm
63  size_t buffer_size;  // presumably the size of `buffer` - confirm units against the constructor
66  const int db_id;  // const, so fixed once the object is constructed
67  std::unique_ptr<ForeignTableSchema> foreign_table_schema;  // owned; read by the getters above
68  std::vector<std::unique_ptr<import_export::TypedImportBuffer>> import_buffers;  // owned buffers
69 
70  // These are set during parsing.
72  size_t begin_pos;  // NOTE(review): reference point (buffer vs. file) not visible here
73  size_t end_pos;  // NOTE(review): inclusive/exclusive semantics not visible here
75  size_t file_offset;
77  std::string full_path;  // returned by getFilePath()
78 
79  // This parameter controls the behaviour of error handling in the data wrapper
80  const bool track_rejected_rows;
81 
82  // This tracks the number of rows processed; it is necessary to identify requests that
83  // are not completed.
// NOTE(review): the member this comment describes (listing line 84) is not present in
// this extract.
85 };
86 
// Data members of an aggregate whose opening line (listing line 87) is missing from
// this extract; presumably struct ParseBufferResult, the type returned by
// parseBuffer() - confirm.
88  std::map<int, DataBlockPtr> column_id_to_data_blocks_map;  // int key -> data block; key is presumably a column id (per the name)
89  size_t row_count;  // presumably the number of rows covered by this result - confirm
90  std::vector<size_t> row_offsets;  // NOTE(review): what the offsets are relative to is not visible here
91  std::set<size_t> rejected_rows;  // ordered and de-duplicated by std::set; presumably row indices - confirm
92 };
93 
95  public:
// Trailing parameters of a pure virtual const member function. The first line of the
// declaration is missing from this extract; the parameter list matches the
// parseBuffer(ParseBufferRequest&, ...) entry in the index at the end of the listing.
// NOTE(review): default arguments on a virtual function are chosen from the static
// type at the call site, so overriders should declare the same defaults (false, false).
106  bool convert_data_blocks,
107  bool columns_are_pre_filtered = false,
108  bool skip_dict_encoding = false) const = 0;
// Final parameter of a pure virtual const member function whose first line is missing
// from this extract; it matches the validateAndGetCopyParams(const ForeignTable*)
// entry in the index at the end of the listing, which returns import_export::CopyParams.
114  const ForeignTable* foreign_table) const = 0;
115 
// Pure virtual hook that concrete parsers must implement; returns a size_t.
// Judging by the name, the return value is expected to be the position at which the
// last complete row in `buffer` ends - confirm against the implementations.
// alloc_size, buffer, buffer_size and num_rows_in_buffer are non-const references, so
// an implementation is able to modify them (e.g. to replace or resize the buffer);
// file_reader is a non-const pointer. Whether file_reader may be null is not visible
// here.
123  virtual size_t findRowEndPosition(size_t& alloc_size,
124  std::unique_ptr<char[]>& buffer,
125  size_t& buffer_size,
126  const import_export::CopyParams& copy_params,
127  const size_t buffer_first_row_index,
128  unsigned int& num_rows_in_buffer,
129  FileReader* file_reader) const = 0;
130 
// Pure virtual hook that concrete parsers must implement. Both arguments are pointers
// to const, so the reader and table cannot be modified through them. The function
// returns void; how a validation failure is reported is not visible in this header.
134  virtual void validateFiles(const FileReader* file_reader,
135  const ForeignTable* foreign_table) const = 0;
136 
// Static helper (declaration only; defined out of line) that builds a map from an int
// key to DataBlockPtr out of the given import buffers. The buffers are taken by const
// reference. skip_dict_encoding defaults to false.
// NOTE(review): the map key is presumably a column id - confirm against the definition.
137  static std::map<int, DataBlockPtr> convertImportBuffersToDataBlocks(
138  const std::vector<std::unique_ptr<import_export::TypedImportBuffer>>&
139  import_buffers,
140  const bool skip_dict_encoding = false);
141 
// Static predicate over a single datum passed as a std::string_view (no copy of the
// underlying characters). The exact criteria are in the out-of-line definition.
142  static bool isCoordinateScalar(const std::string_view datum);
143 
// Static helper (declaration only; defined out of line) for handling a geo column
// value of one row.
// import_buffers, col_idx, cd_it, row and import_idx are non-const references, so the
// definition is able to modify them; presumably the indices and the column iterator
// are advanced past the columns consumed - confirm against the definition.
// catalog is a std::shared_ptr passed by value.
144  static void processGeoColumn(
145  std::vector<std::unique_ptr<import_export::TypedImportBuffer>>& import_buffers,
146  size_t& col_idx,
147  const import_export::CopyParams& copy_params,
148  std::list<const ColumnDescriptor*>::iterator& cd_it,
149  std::vector<std::string_view>& row,
150  size_t& import_idx,
151  bool is_null,
152  size_t first_row_index,
153  size_t row_index_plus_one,
154  std::shared_ptr<Catalog_Namespace::Catalog> catalog);
155 
// Static helper (declaration only; defined out of line). Judging by the name, it fills
// the remaining columns of a rejected row with placeholder data - confirm against the
// definition. cd_it and request are non-const references, so the definition is able to
// modify the iterator and the request; columns is read-only and col_idx is passed by
// value.
160  static void fillRejectedRowWithInvalidData(
161  const std::list<const ColumnDescriptor*>& columns,
162  std::list<const ColumnDescriptor*>::iterator& cd_it,
163  const size_t col_idx,
164  ParseBufferRequest& request);
165 
// Static predicate (declaration only; defined out of line) taking a datum, the
// descriptor of its column and a null indicator string. Presumably reports whether the
// datum should be treated as NULL for that column - confirm the exact rules against
// the definition.
166  static bool isNullDatum(const std::string_view datum,
167  const ColumnDescriptor* column,
168  const std::string& null_indicator);
169 
// Class-level string constant with the value "BUFFER_SIZE". Declared `inline static`
// (C++17), so it is defined here in the header and needs no out-of-line definition.
// NOTE(review): presumably the name of a configuration option - confirm where it is read.
170  inline static const std::string BUFFER_SIZE_KEY = "BUFFER_SIZE";
171 
172  private:
// Private static helper (declaration only; defined out of line). Judging by the name,
// it handles a geo column whose value could not be used - confirm against the
// definition. import_buffers and col_idx are non-const references, so the definition is
// able to modify them; catalog is a std::shared_ptr passed by value.
173  static void processInvalidGeoColumn(
174  std::vector<std::unique_ptr<import_export::TypedImportBuffer>>& import_buffers,
175  size_t& col_idx,
176  const import_export::CopyParams& copy_params,
177  const ColumnDescriptor* cd,
178  std::shared_ptr<Catalog_Namespace::Catalog> catalog);
179 };
180 } // namespace foreign_storage
std::vector< std::unique_ptr< import_export::TypedImportBuffer > > import_buffers
virtual void validateFiles(const FileReader *file_reader, const ForeignTable *foreign_table) const =0
ParseBufferRequest(const ParseBufferRequest &request)=delete
static std::map< int, DataBlockPtr > convertImportBuffersToDataBlocks(const std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, const bool skip_dict_encoding=false)
std::map< int, DataBlockPtr > column_id_to_data_blocks_map
virtual ParseBufferResult parseBuffer(ParseBufferRequest &request, bool convert_data_blocks, bool columns_are_pre_filtered=false, bool skip_dict_encoding=false) const =0
static void processGeoColumn(std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, std::list< const ColumnDescriptor * >::iterator &cd_it, std::vector< std::string_view > &row, size_t &import_idx, bool is_null, size_t first_row_index, size_t row_index_plus_one, std::shared_ptr< Catalog_Namespace::Catalog > catalog)
const import_export::CopyParams copy_params
static void fillRejectedRowWithInvalidData(const std::list< const ColumnDescriptor * > &columns, std::list< const ColumnDescriptor * >::iterator &cd_it, const size_t col_idx, ParseBufferRequest &request)
std::unique_ptr< ForeignTableSchema > foreign_table_schema
static SysCatalog & instance()
Definition: SysCatalog.h:343
CONSTEXPR DEVICE bool is_null(const T &value)
specifies the content in-memory of a row in the column metadata table
std::list< const ColumnDescriptor * > getColumns() const
std::shared_ptr< Catalog > getCatalog(const std::string &dbName)
std::shared_ptr< Catalog_Namespace::Catalog > getCatalog() const
virtual size_t findRowEndPosition(size_t &alloc_size, std::unique_ptr< char[]> &buffer, size_t &buffer_size, const import_export::CopyParams &copy_params, const size_t buffer_first_row_index, unsigned int &num_rows_in_buffer, FileReader *file_reader) const =0
static void processInvalidGeoColumn(std::vector< std::unique_ptr< import_export::TypedImportBuffer >> &import_buffers, size_t &col_idx, const import_export::CopyParams &copy_params, const ColumnDescriptor *cd, std::shared_ptr< Catalog_Namespace::Catalog > catalog)
#define CHECK(condition)
Definition: Logger.h:291
virtual import_export::CopyParams validateAndGetCopyParams(const ForeignTable *foreign_table) const =0
static bool isNullDatum(const std::string_view datum, const ColumnDescriptor *column, const std::string &null_indicator)
static bool isCoordinateScalar(const std::string_view datum)