OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ArrowForeignStorage.cpp File Reference
#include "ArrowForeignStorage.h"
#include <arrow/api.h>
#include <arrow/csv/reader.h>
#include <arrow/io/file.h>
#include <arrow/util/decimal.h>
#include <tbb/parallel_for.h>
#include <tbb/task_group.h>
#include <array>
#include <future>
#include <vector>
#include "Catalog/DataframeTableDescriptor.h"
#include "DataMgr/ForeignStorage/ForeignStorageInterface.h"
#include "DataMgr/StringNoneEncoder.h"
#include "Logger/Logger.h"
#include "QueryEngine/ArrowResultSet.h"
#include "Shared/ArrowUtil.h"
#include "Shared/measure.h"
+ Include dependency graph for ArrowForeignStorage.cpp:

Go to the source code of this file.

Classes

struct  Frag
 
struct  ArrowFragment
 
class  ArrowForeignStorageBase
 
class  ArrowForeignStorage
 
class  ArrowCsvForeignStorage
 

Functions

void convertBoolBitmapBufferWithNulls (int8_t *dst, const uint8_t *src, const uint8_t *bitmap, int64_t length, int8_t null_value)
 
void convertBoolBitmapBufferWithoutNulls (int8_t *dst, const uint8_t *src, int64_t length)
 
template<typename V , std::enable_if_t<!std::is_same_v< V, bool > &&std::is_integral< V >::value, int > = 0>
V get_null_value ()
 
template<typename V , std::enable_if_t< std::is_same_v< V, bool >, int > = 0>
int8_t get_null_value ()
 
std::vector< Frag > calculateFragmentsOffsets (const arrow::ChunkedArray &array, size_t maxFragRows)
 
static SQLTypeInfo getOmnisciType (const arrow::DataType &type)
 
void setArrowTable (std::string name, std::shared_ptr< arrow::Table > table)
 
void releaseArrowTable (std::string name)
 
void registerArrowForeignStorage (std::shared_ptr< ForeignStorageInterface > fsi)
 
static std::shared_ptr
< arrow::DataType > 
getArrowImportType (const SQLTypeInfo type)
 
void registerArrowCsvForeignStorage (std::shared_ptr< ForeignStorageInterface > fsi)
 

Function Documentation

std::vector<Frag> calculateFragmentsOffsets ( const arrow::ChunkedArray &  array,
size_t  maxFragRows 
)

Definition at line 320 of file ArrowForeignStorage.cpp.

Referenced by ArrowForeignStorageBase::parseArrowTable().

// Splits the chunks of `array` into consecutive fragments of at most `maxFragRows`
// rows each and returns one Frag per fragment. A fragment is described by the chunk
// it starts in (plus the row offset inside that chunk), the chunk it ends in, and
// the number of rows it takes from that last chunk.
// Returns an empty vector when the array contributes no rows at all.
// NOTE(review): maxFragRows == 0 never advances `i` (the else-branch is taken
// forever) — callers are presumably expected to pass a positive value; confirm.
321  {
322  std::vector<Frag> fragments;
// sz: rows already assigned to the fragment currently being filled.
323  size_t sz = 0;
// offset: row inside chunk `i` at which the current fragment starts consuming.
324  size_t offset = 0;
325  fragments.push_back({0, 0, 0, 0});
326  size_t num_chunks = (size_t)array.num_chunks();
// `i` is advanced only when a chunk has been fully consumed, so one chunk may be
// visited several times while it is cut into multiple fragments.
327  for (size_t i = 0; i < num_chunks;) {
328  auto& chunk = *array.chunk(i);
329  auto& frag = *fragments.rbegin();
// The rest of this chunk fits into the current fragment with room to spare.
330  if (maxFragRows - sz > chunk.length() - offset) {
331  sz += chunk.length() - offset;
// Last chunk of the array: close the current fragment here.
332  if (i == num_chunks - 1) {
333  fragments.rbegin()->last_chunk = num_chunks - 1;
334  fragments.rbegin()->last_chunk_size =
335  array.chunk((int)num_chunks - 1)->length() - offset;
336  }
337  offset = 0;
338  i++;
339  } else {
// The current fragment becomes full inside this chunk: close it and open a new
// fragment that starts at the first unconsumed row of the same chunk.
340  frag.last_chunk = i;
341  frag.last_chunk_size = maxFragRows - sz;
342  offset += maxFragRows - sz;
343  sz = 0;
// `frag` must not be used after this push_back: it may reallocate the vector.
344  fragments.push_back({i, offset, 0, 0});
345  }
346  }
// `sz` is the row count of the last fragment, so zero means that fragment is empty.
// The previous condition compared first_chunk with itself (always true) and thus
// also discarded a non-empty last fragment whenever its final chunk contributed
// zero rows (e.g. a trailing zero-length chunk).
// NOTE(review): a kept fragment may now have last_chunk_size == 0 — confirm that
// consumers of Frag tolerate this.
347  if (sz == 0) {
349  // remove empty fragment at the end if any
350  fragments.pop_back();
351  }
352  return fragments;
353 }

+ Here is the caller graph for this function:

void convertBoolBitmapBufferWithNulls ( int8_t *  dst,
const uint8_t *  src,
const uint8_t *  bitmap,
int64_t  length,
int8_t  null_value 
)

Definition at line 142 of file ArrowForeignStorage.cpp.

References run_benchmark_import::dest, and is_null().

Referenced by ArrowForeignStorageBase::replaceNullValuesImpl().

// Expands `length` bit-packed boolean values from `src` into one int8_t per value
// in `dst`. A value whose bit in `bitmap` is 0 is written as `null_value`;
// otherwise the source bit (0 or 1) is written. Within each byte, bits are read
// starting from the least significant one.
146  {
// Whole bytes: every source byte produces eight destination entries.
147  for (int64_t bitmap_idx = 0; bitmap_idx < length / 8; ++bitmap_idx) {
148  auto source = src[bitmap_idx];
149  auto dest = dst + bitmap_idx * 8;
// Invert the bitmap byte so that a set bit means "this value is null".
150  auto inversed_bitmap = ~bitmap[bitmap_idx];
151  for (int8_t bitmap_offset = 0; bitmap_offset < 8; ++bitmap_offset) {
152  auto is_null = (inversed_bitmap >> bitmap_offset) & 1;
153  auto val = (source >> bitmap_offset) & 1;
154  dest[bitmap_offset] = is_null ? null_value : val;
155  }
156  }
157 
// Tail: the remaining (length % 8) values all come from the byte at index length / 8.
158  for (int64_t j = (length / 8) * 8; j < length; ++j) {
159  auto is_null = (~bitmap[length / 8] >> (j % 8)) & 1;
160  auto val = (src[length / 8] >> (j % 8)) & 1;
161  dst[j] = is_null ? null_value : val;
162  }
163 }
CONSTEXPR DEVICE bool is_null(const T &value)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void convertBoolBitmapBufferWithoutNulls ( int8_t *  dst,
const uint8_t *  src,
int64_t  length 
)

Definition at line 165 of file ArrowForeignStorage.cpp.

References run_benchmark_import::dest.

Referenced by ArrowForeignStorageBase::replaceNullValuesImpl().

// Expands `length` bit-packed boolean values from `src` into one int8_t (0 or 1)
// per value in `dst`. No validity bitmap is consulted. Within each byte, bits are
// read starting from the least significant one.
167  {
// Whole bytes: every source byte produces eight destination entries.
168  for (int64_t bitmap_idx = 0; bitmap_idx < length / 8; ++bitmap_idx) {
169  auto source = src[bitmap_idx];
170  auto dest = dst + bitmap_idx * 8;
171  for (int8_t bitmap_offset = 0; bitmap_offset < 8; ++bitmap_offset) {
172  dest[bitmap_offset] = (source >> bitmap_offset) & 1;
173  }
174  }
175 
// Tail: the remaining (length % 8) values all come from the byte at index length / 8.
176  for (int64_t j = (length / 8) * 8; j < length; ++j) {
177  dst[j] = (src[length / 8] >> (j % 8)) & 1;
178  }
179 }

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t<!std::is_same_v< V, bool > &&std::is_integral< V >::value, int > = 0>
V get_null_value ( )
inline

Definition at line 184 of file ArrowForeignStorage.cpp.

Referenced by initializeFlatBuffer().

// Overload for integral, non-bool V: returns the result of inline_int_null_value<V>().
184  {
185  return inline_int_null_value<V>();
186 }

+ Here is the caller graph for this function:

template<typename V , std::enable_if_t< std::is_same_v< V, bool >, int > = 0>
int8_t get_null_value ( )
inline

Definition at line 189 of file ArrowForeignStorage.cpp.

// Overload for V == bool: the return type is int8_t, so the int8_t null value from
// inline_int_null_value<int8_t>() is returned.
189  {
190  return inline_int_null_value<int8_t>();
191 }
static std::shared_ptr<arrow::DataType> getArrowImportType ( const SQLTypeInfo  type)
static

Definition at line 924 of file ArrowForeignStorage.cpp.

References CHECK, SQLTypeInfo::get_precision(), SQLTypeInfo::get_scale(), SQLTypeInfo::get_size(), SQLTypeInfo::get_type(), SQLTypeInfo::get_type_name(), IS_INTEGER, kARRAY, kBOOLEAN, kCHAR, kDATE, kDECIMAL, kDOUBLE, kFLOAT, kINTERVAL_DAY_TIME, kINTERVAL_YEAR_MONTH, kNUMERIC, kTEXT, kTIME, kTIMESTAMP, kVARCHAR, and to_string().

Referenced by ArrowCsvForeignStorage::registerTable().

// Returns the Arrow data type that corresponds to the given SQLTypeInfo.
// Integer types are chosen by their byte size; the remaining types are chosen by
// their SQL type id. Throws std::runtime_error for types that have no mapping
// here (arrays, intervals, anything else) and for timestamp precisions other
// than 0, 3, 6 or 9.
924  {
925  using namespace arrow;
926  auto ktype = type.get_type();
// Integers: pick the Arrow integer type by the size in bytes.
927  if (IS_INTEGER(ktype)) {
928  switch (type.get_size()) {
929  case 1:
930  return int8();
931  case 2:
932  return int16();
933  case 4:
934  return int32();
935  case 8:
936  return int64();
937  default:
// Any other integer size is treated as a programming error.
938  CHECK(false);
939  }
940  }
941  switch (ktype) {
942  case kBOOLEAN:
943  return arrow::boolean();
944  case kFLOAT:
945  return float32();
946  case kDOUBLE:
947  return float64();
// All character types map to Arrow UTF-8 strings.
948  case kCHAR:
949  case kVARCHAR:
950  case kTEXT:
951  return utf8();
952  case kDECIMAL:
953  case kNUMERIC:
954  return decimal(type.get_precision(), type.get_scale());
955  case kTIME:
956  return time32(TimeUnit::SECOND);
// Dates map to date64 when built with HAVE_CUDA and to date32 otherwise.
957  case kDATE:
958 #ifdef HAVE_CUDA
959  return arrow::date64();
960 #else
961  return arrow::date32();
962 #endif
// Timestamps: the precision selects the Arrow time unit.
963  case kTIMESTAMP:
964  switch (type.get_precision()) {
965  case 0:
966  return timestamp(TimeUnit::SECOND);
967  case 3:
968  return timestamp(TimeUnit::MILLI);
969  case 6:
970  return timestamp(TimeUnit::MICRO);
971  case 9:
972  return timestamp(TimeUnit::NANO);
973  default:
974  throw std::runtime_error("Unsupported timestamp precision for Arrow: " +
975  std::to_string(type.get_precision()));
976  }
// Explicitly listed unsupported types; they share the error raised by `default`.
977  case kARRAY:
978  case kINTERVAL_DAY_TIME:
979  case kINTERVAL_YEAR_MONTH:
980  default:
981  throw std::runtime_error(type.get_type_name() + " is not supported in Arrow.");
982  }
// Not reached by the switch above: every branch returns or throws.
983  return nullptr;
984 }
HOST DEVICE int get_size() const
Definition: sqltypes.h:403
Definition: sqltypes.h:76
HOST DEVICE int get_scale() const
Definition: sqltypes.h:396
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:391
std::string to_string(char const *&&v)
int get_precision() const
Definition: sqltypes.h:394
Definition: sqltypes.h:79
Definition: sqltypes.h:80
std::string get_type_name() const
Definition: sqltypes.h:484
#define IS_INTEGER(T)
Definition: sqltypes.h:304
Definition: sqltypes.h:68
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static SQLTypeInfo getOmnisciType ( const arrow::DataType &  type)
static

Definition at line 793 of file ArrowForeignStorage.cpp.

References kBIGINT, kBOOLEAN, kDATE, kDECIMAL, kDOUBLE, kENCODING_DICT, kFLOAT, kINT, kSMALLINT, kTEXT, kTIME, kTIMESTAMP, kTINYINT, heavydb.dtypes::STRING, and run_benchmark_import::type.

Referenced by ArrowForeignStorage::prepareTable().

// Returns the SQLTypeInfo that corresponds to the given Arrow data type.
// Throws std::runtime_error ("<type> is not yet supported.") for every Arrow type
// id that is not handled by the switch below.
793  {
794  using namespace arrow;
795  switch (type.id()) {
796  case Type::INT8:
797  return SQLTypeInfo(kTINYINT, false);
798  case Type::INT16:
799  return SQLTypeInfo(kSMALLINT, false);
800  case Type::INT32:
801  return SQLTypeInfo(kINT, false);
802  case Type::INT64:
803  return SQLTypeInfo(kBIGINT, false);
804  case Type::BOOL:
805  return SQLTypeInfo(kBOOLEAN, false);
806  case Type::FLOAT:
807  return SQLTypeInfo(kFLOAT, false);
// Both Arrow date types map to the same SQL date type.
808  case Type::DATE32:
809  case Type::DATE64:
810  return SQLTypeInfo(kDATE, false);
811  case Type::DOUBLE:
812  return SQLTypeInfo(kDOUBLE, false);
813  // uncomment when arrow 2.0 will be released and modin support for dictionary types
814  // in read_csv would be implemented
815 
816  // case Type::DICTIONARY: {
817  // auto type = SQLTypeInfo(kTEXT, false, kENCODING_DICT);
818  // // this is needed because createTable forces type.size to be equal to
819  // // comp_param / 8, no matter what type.size you set here
820  // type.set_comp_param(sizeof(uint32_t) * 8);
821  // return type;
822  // }
823  // case Type::STRING:
824  // return SQLTypeInfo(kTEXT, false, kENCODING_NONE);
825 
// Arrow strings are currently exposed as dictionary-encoded text.
826  case Type::STRING: {
827  auto type = SQLTypeInfo(kTEXT, false, kENCODING_DICT);
828  // this is needed because createTable forces type.size to be equal to
829  // comp_param / 8, no matter what type.size you set here
830  type.set_comp_param(sizeof(uint32_t) * 8);
831  return type;
832  }
// Decimals carry over the precision and scale of the Arrow decimal type.
833  case Type::DECIMAL: {
834  const auto& decimal_type = static_cast<const arrow::DecimalType&>(type);
835  return SQLTypeInfo(kDECIMAL, decimal_type.precision(), decimal_type.scale(), false);
836  }
837  case Type::TIME32:
838  return SQLTypeInfo(kTIME, false);
// Timestamps: the Arrow time unit selects the value passed as the second
// constructor argument (0, 3, 6 or 9).
839  case Type::TIMESTAMP:
840  switch (static_cast<const arrow::TimestampType&>(type).unit()) {
841  case TimeUnit::SECOND:
842  return SQLTypeInfo(kTIMESTAMP, 0, 0);
843  case TimeUnit::MILLI:
844  return SQLTypeInfo(kTIMESTAMP, 3, 0);
845  case TimeUnit::MICRO:
846  return SQLTypeInfo(kTIMESTAMP, 6, 0);
847  case TimeUnit::NANO:
848  return SQLTypeInfo(kTIMESTAMP, 9, 0);
849  }
// No break after the inner switch: a unit matching none of the cases above falls
// through into `default` and raises the same "not yet supported" error.
850  default:
851  throw std::runtime_error(type.ToString() + " is not yet supported.");
852  }
853 }
Definition: sqltypes.h:76
tuple STRING
Definition: dtypes.py:31
Definition: sqltypes.h:79
Definition: sqltypes.h:80
Definition: sqltypes.h:72

+ Here is the caller graph for this function:

void registerArrowCsvForeignStorage ( std::shared_ptr< ForeignStorageInterface >  fsi )

Definition at line 1063 of file ArrowForeignStorage.cpp.

Referenced by PersistentStorageMgr::PersistentStorageMgr().

// Creates a new ArrowCsvForeignStorage and hands it to `fsi` through
// registerPersistentStorageInterface().
1063  {
1064  fsi->registerPersistentStorageInterface(std::make_unique<ArrowCsvForeignStorage>());
1065 }

+ Here is the caller graph for this function:

void registerArrowForeignStorage ( std::shared_ptr< ForeignStorageInterface >  fsi )

Definition at line 893 of file ArrowForeignStorage.cpp.

Referenced by PersistentStorageMgr::PersistentStorageMgr().

// Creates a new ArrowForeignStorage and hands it to `fsi` through
// registerPersistentStorageInterface().
893  {
894  fsi->registerPersistentStorageInterface(std::make_unique<ArrowForeignStorage>());
895 }

+ Here is the caller graph for this function:

void releaseArrowTable ( std::string  name)

Definition at line 889 of file ArrowForeignStorage.cpp.

References ArrowForeignStorage::tables.

Referenced by EmbeddedDatabase::DBEngineImpl::importArrowTable().

889  {
891 }
static std::map< std::string, std::shared_ptr< arrow::Table > > tables
string name
Definition: setup.in.py:72

+ Here is the caller graph for this function:

void setArrowTable ( std::string  name,
std::shared_ptr< arrow::Table >  table 
)

Definition at line 885 of file ArrowForeignStorage.cpp.

References setup::name, and ArrowForeignStorage::tables.

Referenced by EmbeddedDatabase::DBEngineImpl::importArrowTable().

885  {
887 }
static std::map< std::string, std::shared_ptr< arrow::Table > > tables
string name
Definition: setup.in.py:72

+ Here is the caller graph for this function: