OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GeospatialEncoder.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "DataMgr/Chunk/Chunk.h"
20 #include "Geospatial/Compression.h"
21 #include "Geospatial/Types.h"
22 
27 #include "ImportExport/Importer.h"
28 
29 namespace foreign_storage {
30 
31 template <typename T>
32 inline ArrayDatum encode_as_array_datum(const std::vector<T>& data) {
33  const size_t num_bytes = data.size() * sizeof(T);
34  std::shared_ptr<int8_t> buffer(new int8_t[num_bytes], std::default_delete<int8_t[]>());
35  memcpy(buffer.get(), data.data(), num_bytes);
36  return ArrayDatum(num_bytes, buffer, false);
37 }
38 
40  public:
// Virtual, compiler-generated destructor: objects of subclasses can be destroyed
// through a pointer or reference to GeospatialEncoder.
41  virtual ~GeospatialEncoder() = default;
42 
43  GeospatialEncoder(const bool geo_validate_geometry)
44  : geo_validate_geometry_{geo_validate_geometry} {}
45 
46  GeospatialEncoder(std::list<Chunk_NS::Chunk>& chunks, const bool geo_validate_geometry)
47  : geo_column_descriptor_(chunks.begin()->getColumnDesc())
48  , base_column_encoder_(nullptr)
49  , coords_column_encoder_(nullptr)
50  , bounds_column_encoder_(nullptr)
53  , base_column_metadata_(nullptr)
54  , coords_column_metadata_(nullptr)
55  , bounds_column_metadata_(nullptr)
58  , geo_validate_geometry_{geo_validate_geometry} {
60  validateChunksSizing(chunks);
61  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
62 
63  // initialize coords column
64  coords_column_descriptor_ = getColumnDescriptor(chunks, geo_column_type, COORDS);
65 
66  // initialize bounds column
67  if (hasBoundsColumn()) {
68  bounds_column_descriptor_ = getColumnDescriptor(chunks, geo_column_type, BOUNDS);
69  }
70 
71  // initialize ring sizes column & render group column
74  getColumnDescriptor(chunks, geo_column_type, RING_OR_LINE_SIZES);
75  }
76 
77  // initialize poly rings column
78  if (hasPolyRingsColumn()) {
80  getColumnDescriptor(chunks, geo_column_type, POLY_RINGS);
81  }
82  }
83 
84  GeospatialEncoder(std::list<Chunk_NS::Chunk>& chunks,
85  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata,
86  const bool geo_validate_geometry)
87  : geo_column_descriptor_(chunks.begin()->getColumnDesc())
88  , base_column_encoder_(nullptr)
89  , coords_column_encoder_(nullptr)
90  , bounds_column_encoder_(nullptr)
93  , base_column_metadata_(nullptr)
94  , coords_column_metadata_(nullptr)
95  , bounds_column_metadata_(nullptr)
98  , geo_validate_geometry_{geo_validate_geometry} {
100 
101  validateChunksSizing(chunks);
102  validateMetadataSizing(chunk_metadata);
103 
104  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
105 
106  // initialize base column encoder
107  auto base_chunk = chunks.begin();
108  base_chunk->initEncoder();
110  dynamic_cast<StringNoneEncoder*>(base_chunk->getBuffer()->getEncoder());
111  base_column_metadata_ = chunk_metadata.begin()->get();
113 
114  // initialize coords column
117  chunks, chunk_metadata, geo_column_type, COORDS);
118 
119  // initialize bounds column
120  if (hasBoundsColumn()) {
121  std::tie(
124  chunks, chunk_metadata, geo_column_type, BOUNDS);
125  }
126 
127  // initialize ring sizes column & render group column
128  if (hasRingOrLineSizesColumn()) {
133  chunks, chunk_metadata, geo_column_type, RING_OR_LINE_SIZES);
134  }
135 
136  // initialize poly rings column
137  if (hasPolyRingsColumn()) {
142  chunks, chunk_metadata, geo_column_type, POLY_RINGS);
143  }
144  }
145 
146  protected:
147  void appendBaseDataAndUpdateMetadata(const int64_t row_count) {
148  base_values_.resize(row_count);
150  *base_column_encoder_->appendData(&base_values_, 0, row_count);
151  }
152 
153  void validateChunksSizing(std::list<Chunk_NS::Chunk>& chunks) const {
154  size_t expected_size = geo_column_descriptor_->columnType.get_physical_cols() + 1;
155  CHECK_EQ(chunks.size(), expected_size);
156  }
157 
159  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata) const {
160  size_t expected_size = geo_column_descriptor_->columnType.get_physical_cols() + 1;
161  CHECK_EQ(chunk_metadata.size(), expected_size);
162  }
163 
175  }
176 
178  const std::vector<ArrayDatum>& datum_parse_buffer,
179  Encoder* encoder,
180  ChunkMetadata* chunk_metadata) const {
181  if (!encoder) {
182  CHECK(!chunk_metadata);
183  return;
184  }
185  if (auto fixed_len_array_encoder =
186  dynamic_cast<FixedLengthArrayNoneEncoder*>(encoder)) {
187  auto new_chunk_metadata = fixed_len_array_encoder->appendData(
188  &datum_parse_buffer, 0, datum_parse_buffer.size());
189  *chunk_metadata = *new_chunk_metadata;
190  } else if (auto array_encoder = dynamic_cast<ArrayNoneEncoder*>(encoder)) {
191  auto new_chunk_metadata = array_encoder->appendData(
192  &datum_parse_buffer, 0, datum_parse_buffer.size(), false);
193  *chunk_metadata = *new_chunk_metadata;
194  } else {
195  UNREACHABLE();
196  }
197  }
198 
199  void processGeoElement(std::string_view geo_string_view) {
201  if (!Geospatial::GeoTypesFactory::getGeoColumns(std::string(geo_string_view),
202  import_ti,
209  }
210 
211  // validate types
212 
213  if (!geo_promoted_type_match(import_ti.get_type(),
216  }
217 
218  // append coords
219  std::vector<uint8_t> compressed_coords = Geospatial::compress_coords(
221  coords_datum_buffer_.emplace_back(encode_as_array_datum(compressed_coords));
222 
223  // append bounds
224  if (hasBoundsColumn()) {
226  }
227 
228  // append ring sizes
229  if (hasRingOrLineSizesColumn()) {
232  }
233 
234  // append poly rings
235  if (hasPolyRingsColumn()) {
236  poly_rings_datum_buffer_.emplace_back(
238  }
239  }
240 
248  // POINT columns are represented using fixed length arrays and need
249  // special treatment of nulls
251  std::vector<uint8_t> compressed_coords = Geospatial::compress_coords(
253  coords_datum_buffer_.emplace_back(encode_as_array_datum(compressed_coords));
254  } else {
257  }
258  if (hasBoundsColumn()) {
261  }
262  if (hasRingOrLineSizesColumn()) {
266  }
267  if (hasPolyRingsColumn()) {
268  poly_rings_datum_buffer_.emplace_back(
271  }
272  }
273 
275  coords_parse_buffer_.clear();
276  bounds_parse_buffer_.clear();
278  poly_rings_parse_buffer_.clear();
279  }
280 
282  coords_datum_buffer_.clear();
283  bounds_datum_buffer_.clear();
285  poly_rings_datum_buffer_.clear();
286  }
287 
289 
290  template <typename T>
291  typename std::list<T>::iterator getIteratorForGeoColumnType(
292  std::list<T>& list,
293  const SQLTypes column_type,
294  const GeoColumnType geo_column) {
295  auto list_iter = list.begin();
296  list_iter++; // skip base column
297  switch (column_type) {
298  case kPOINT: {
299  if (geo_column == COORDS) {
300  return list_iter;
301  }
302  UNREACHABLE();
303  }
304  case kMULTIPOINT:
305  case kLINESTRING: {
306  if (geo_column == COORDS) {
307  return list_iter;
308  }
309  list_iter++;
310  if (geo_column == BOUNDS) {
311  return list_iter;
312  }
313  UNREACHABLE();
314  }
315  case kMULTILINESTRING: {
316  if (geo_column == COORDS) {
317  return list_iter;
318  }
319  list_iter++;
320  if (geo_column == RING_OR_LINE_SIZES) {
321  return list_iter;
322  }
323  list_iter++;
324  if (geo_column == BOUNDS) {
325  return list_iter;
326  }
327  UNREACHABLE();
328  }
329  case kPOLYGON: {
330  if (geo_column == COORDS) {
331  return list_iter;
332  }
333  list_iter++;
334  if (geo_column == RING_OR_LINE_SIZES) {
335  return list_iter;
336  }
337  list_iter++;
338  if (geo_column == BOUNDS) {
339  return list_iter;
340  }
341  UNREACHABLE();
342  }
343  case kMULTIPOLYGON: {
344  if (geo_column == COORDS) {
345  return list_iter;
346  }
347  list_iter++;
348  if (geo_column == RING_OR_LINE_SIZES) {
349  return list_iter;
350  }
351  list_iter++;
352  if (geo_column == POLY_RINGS) {
353  return list_iter;
354  }
355  list_iter++;
356  if (geo_column == BOUNDS) {
357  return list_iter;
358  }
359  UNREACHABLE();
360  }
361  default:
362  UNREACHABLE();
363  }
364  return {};
365  }
366 
367  std::tuple<Encoder*, ChunkMetadata*, const ColumnDescriptor*>
369  std::list<Chunk_NS::Chunk>& chunks,
370  std::list<std::unique_ptr<ChunkMetadata>>& chunk_metadata,
371  const SQLTypes sql_type,
372  GeoColumnType geo_column_type) {
373  auto chunk = getIteratorForGeoColumnType(chunks, sql_type, geo_column_type);
374  chunk->initEncoder();
375  auto encoder = chunk->getBuffer()->getEncoder();
376  auto metadata =
377  getIteratorForGeoColumnType(chunk_metadata, sql_type, geo_column_type)->get();
378  auto column_descriptor = chunk->getColumnDesc();
379  return {encoder, metadata, column_descriptor};
380  }
381 
382  const ColumnDescriptor* getColumnDescriptor(std::list<Chunk_NS::Chunk>& chunks,
383  const SQLTypes sql_type,
384  GeoColumnType geo_column_type) {
385  auto chunk = getIteratorForGeoColumnType(chunks, sql_type, geo_column_type);
386  auto column_descriptor = chunk->getColumnDesc();
387  return column_descriptor;
388  }
389 
390  static void throwMalformedGeoElement(const std::string& omnisci_column_name) {
391  std::string error_message = "Failed to extract valid geometry in HeavyDB column '" +
392  omnisci_column_name + "'.";
393  throw foreign_storage::ForeignStorageException(error_message);
394  }
395 
396  static void throwMismatchedGeoElement(const std::string& omnisci_column_name) {
398  "Imported geometry"
399  " doesn't match the geospatial type of HeavyDB column '" +
400  omnisci_column_name + "'.");
401  }
402 
403  bool hasBoundsColumn() const {
404  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
405  return geo_column_type == kMULTIPOINT || geo_column_type == kLINESTRING ||
406  geo_column_type == kMULTILINESTRING || geo_column_type == kPOLYGON ||
407  geo_column_type == kMULTIPOLYGON;
408  }
409 
411  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
412  return geo_column_type == kPOLYGON || geo_column_type == kMULTIPOLYGON ||
413  geo_column_type == kMULTILINESTRING;
414  }
415 
416  bool hasPolyRingsColumn() const {
417  const auto geo_column_type = geo_column_descriptor_->columnType.get_type();
418  return geo_column_type == kMULTIPOLYGON;
419  }
420 
422 
428 
434 
439 
440  std::vector<std::string> base_values_;
441 
442  // Used repeatedly in parsing geo types, declared as members to prevent
443  // deallocation/reallocation costs
444  std::vector<double> coords_parse_buffer_;
445  std::vector<double> bounds_parse_buffer_;
447  std::vector<int> poly_rings_parse_buffer_;
448 
449  // Used to buffer array appends in memory for a batch
450  std::vector<ArrayDatum> coords_datum_buffer_;
451  std::vector<ArrayDatum> bounds_datum_buffer_;
452  std::vector<ArrayDatum> ring_or_line_sizes_datum_buffer_;
453  std::vector<ArrayDatum> poly_rings_datum_buffer_;
454 
455  // CopyParams
457 };
458 
459 } // namespace foreign_storage
static void throwMismatchedGeoElement(const std::string &omnisci_column_name)
bool geo_promoted_type_match(const SQLTypes a, const SQLTypes b)
Definition: sqltypes.h:2031
#define CHECK_EQ(x, y)
Definition: Logger.h:301
SQLTypes
Definition: sqltypes.h:65
const ColumnDescriptor * poly_rings_column_descriptor_
std::vector< ArrayDatum > coords_datum_buffer_
std::vector< std::string > base_values_
GeospatialEncoder(std::list< Chunk_NS::Chunk > &chunks, const bool geo_validate_geometry)
static ArrayDatum composeNullArray(const SQLTypeInfo &ti)
Definition: Importer.cpp:395
std::vector< ArrayDatum > ring_or_line_sizes_datum_buffer_
std::vector< int > ring_or_line_sizes_parse_buffer_
std::shared_ptr< ChunkMetadata > appendData(int8_t *&src_data, const size_t num_elems_to_append, const SQLTypeInfo &ti, const bool replicating=false, const int64_t offset=-1) override
#define UNREACHABLE()
Definition: Logger.h:338
void validateMetadataSizing(std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata) const
static void getNullGeoColumns(SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings)
Definition: Types.cpp:1342
std::vector< ArrayDatum > bounds_datum_buffer_
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
void appendToArrayEncoderAndUpdateMetadata(const std::vector< ArrayDatum > &datum_parse_buffer, Encoder *encoder, ChunkMetadata *chunk_metadata) const
static void throwMalformedGeoElement(const std::string &omnisci_column_name)
std::conditional_t< is_cuda_compiler(), DeviceArrayDatum, HostArrayDatum > ArrayDatum
Definition: sqltypes.h:229
std::vector< uint8_t > compress_coords(const std::vector< double > &coords, const SQLTypeInfo &ti)
Definition: Compression.cpp:52
int get_physical_cols() const
Definition: sqltypes.h:432
static bool getGeoColumns(const std::string &wkt_or_wkb_hex, SQLTypeInfo &ti, std::vector< double > &coords, std::vector< double > &bounds, std::vector< int > &ring_sizes, std::vector< int > &poly_rings, const bool validate_with_geos_if_available)
Definition: Types.cpp:1121
void validateChunksSizing(std::list< Chunk_NS::Chunk > &chunks) const
specifies the content in-memory of a row in the column metadata table
void processGeoElement(std::string_view geo_string_view)
const ColumnDescriptor * getColumnDescriptor(std::list< Chunk_NS::Chunk > &chunks, const SQLTypes sql_type, GeoColumnType geo_column_type)
ChunkMetadata * ring_or_line_sizes_column_metadata_
const ColumnDescriptor * coords_column_descriptor_
std::vector< double > bounds_parse_buffer_
std::vector< double > coords_parse_buffer_
unencoded fixed length array encoder
ArrayDatum encode_as_array_datum(const std::vector< T > &data)
GeospatialEncoder(std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const bool geo_validate_geometry)
#define CHECK(condition)
Definition: Logger.h:291
bool is_geometry() const
Definition: sqltypes.h:597
GeospatialEncoder(const bool geo_validate_geometry)
For unencoded strings.
const ColumnDescriptor * geo_column_descriptor_
void appendBaseDataAndUpdateMetadata(const int64_t row_count)
const ColumnDescriptor * ring_or_line_sizes_column_descriptor_
SQLTypeInfo columnType
std::tuple< Encoder *, ChunkMetadata *, const ColumnDescriptor * > initEncoderAndGetEncoderAndMetadata(std::list< Chunk_NS::Chunk > &chunks, std::list< std::unique_ptr< ChunkMetadata >> &chunk_metadata, const SQLTypes sql_type, GeoColumnType geo_column_type)
unencoded array encoder
std::vector< ArrayDatum > poly_rings_datum_buffer_
std::string columnName
std::list< T >::iterator getIteratorForGeoColumnType(std::list< T > &list, const SQLTypes column_type, const GeoColumnType geo_column)
virtual ~GeospatialEncoder()=default
const ColumnDescriptor * bounds_column_descriptor_