OmniSciDB
a5dc49c757
|
#include <LazyParquetChunkLoader.h>
Public Member Functions | |
LazyParquetChunkLoader (std::shared_ptr< arrow::fs::FileSystem > file_system, FileReaderMap *file_reader_cache, const ForeignTable *foreign_table) | |
std::list< std::unique_ptr < ChunkMetadata > > | loadChunk (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary=nullptr, RejectedRowIndices *rejected_row_indices=nullptr) |
std::list< RowGroupMetadata > | metadataScan (const std::vector< std::string > &file_paths, const ForeignTableSchema &schema, const bool do_metadata_stats_validation=true) |
Perform a metadata scan for the paths specified. More... | |
std::pair< size_t, size_t > | loadRowGroups (const RowGroupInterval &row_group_interval, const std::map< int, Chunk_NS::Chunk > &chunks, const ForeignTableSchema &schema, const std::map< int, StringDictionary * > &column_dictionaries, const int num_threads=1) |
Load row groups of data into given chunks. More... | |
DataPreview | previewFiles (const std::vector< std::string > &files, const size_t max_num_rows, const ForeignTable &table) |
Preview rows of data and column types in a set of files. More... | |
Static Public Member Functions | |
static bool | isColumnMappingSupported (const ColumnDescriptor *omnisci_column, const parquet::ColumnDescriptor *parquet_column) |
Static Public Attributes | |
static const int | batch_reader_num_elements = 4096 |
Private Member Functions | |
std::list< std::unique_ptr < ChunkMetadata > > | appendRowGroups (const std::vector< RowGroupInterval > &row_group_intervals, const int parquet_column_index, const ColumnDescriptor *column_descriptor, std::list< Chunk_NS::Chunk > &chunks, StringDictionary *string_dictionary, RejectedRowIndices *rejected_row_indices, const bool is_for_detect=false, const std::optional< int64_t > max_levels_read=std::nullopt) |
Static Private Member Functions | |
static SQLTypeInfo | suggestColumnMapping (const parquet::ColumnDescriptor *parquet_column) |
Private Attributes | |
std::shared_ptr < arrow::fs::FileSystem > | file_system_ |
FileReaderMap * | file_reader_cache_ |
const ForeignTable * | foreign_table_ |
A lazy parquet to chunk loader
Definition at line 37 of file LazyParquetChunkLoader.h.
foreign_storage::LazyParquetChunkLoader::LazyParquetChunkLoader | ( | std::shared_ptr< arrow::fs::FileSystem > | file_system, |
FileReaderMap * | file_reader_cache, | ||
const ForeignTable * | foreign_table | ||
) |
Definition at line 2083 of file LazyParquetChunkLoader.cpp.
References CHECK, and foreign_table_.
|
private |
Definition at line 1828 of file LazyParquetChunkLoader.cpp.
References batch_reader_num_elements, CHECK, ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::create_parquet_encoder(), DEBUG_TIMER, Timer< TimeT >::elapsed(), file_reader_cache_, file_system_, foreign_table_, foreign_storage::ForeignTable::GEO_VALIDATE_GEOMETRY_KEY, foreign_storage::get_column_descriptor(), foreign_storage::get_parquet_table_size(), foreign_storage::OptionsContainer::getOptionAsBool(), foreign_storage::FileReaderMap::getOrInsert(), SQLTypeInfo::is_array(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::set_definition_levels_for_zero_max_definition_level_case(), Timer< TimeT >::start(), Timer< TimeT >::stop(), TableDescriptor::tableName, to_string(), foreign_storage::validate_equal_column_descriptor(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_list_column_metadata_statistics(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level(), and VLOG.
Referenced by loadChunk(), and previewFiles().
|
static |
Determine if a Parquet to OmniSci column mapping is supported.
omnisci_column | - the column descriptor of the OmniSci column |
parquet_column | - the column descriptor of the Parquet column |
Definition at line 2048 of file LazyParquetChunkLoader.cpp.
References CHECK, ColumnDescriptor::columnType, SQLTypeInfo::is_array(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_date_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_decimal_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_floating_point_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_geospatial_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_integral_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_none_type_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_string_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_time_mapping(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_timestamp_mapping().
Referenced by foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping().
std::list< std::unique_ptr< ChunkMetadata > > foreign_storage::LazyParquetChunkLoader::loadChunk | ( | const std::vector< RowGroupInterval > & | row_group_intervals, |
const int | parquet_column_index, | ||
std::list< Chunk_NS::Chunk > & | chunks, | ||
StringDictionary * | string_dictionary = nullptr , |
||
RejectedRowIndices * | rejected_row_indices = nullptr |
||
) |
Load a number of row groups of a column in a parquet file into a chunk
row_group_intervals | - a list of inclusive intervals [start,end], each specifying row groups to load |
parquet_column_index | - the logical column index in the parquet file (and omnisci db) of the column to load |
chunks | - a list containing the chunks to load |
string_dictionary | - a string dictionary for the column being loaded, if applicable |
rejected_row_indices | - optional, if specified errors will be tracked in this data structure while loading |
NOTE: if more than one chunk is supplied, the first chunk is required to be the chunk corresponding to the logical column, while the remaining chunks correspond to physical columns (in ascending order of column id). Similarly, if a metadata update is expected, the returned list of ChunkMetadata pointers will correspond directly to the list of chunks.
Definition at line 2093 of file LazyParquetChunkLoader.cpp.
References appendRowGroups(), and CHECK.
Referenced by foreign_storage::ParquetDataWrapper::loadBuffersUsingLazyParquetChunkLoader().
std::pair< size_t, size_t > foreign_storage::LazyParquetChunkLoader::loadRowGroups | ( | const RowGroupInterval & | row_group_interval, |
const std::map< int, Chunk_NS::Chunk > & | chunks, | ||
const ForeignTableSchema & | schema, | ||
const std::map< int, StringDictionary * > & | column_dictionaries, | ||
const int | num_threads = 1 |
||
) |
Load row groups of data into given chunks.
row_group_interval | - specifies which row groups to load |
chunks | - map of column index to chunk which data will be loaded into |
schema | - schema of the foreign table to perform metadata scan for |
column_dictionaries | - a map of string dictionaries for columns that require it |
num_threads | - number of threads to utilize while reading (if applicable) |
Note that only logical chunks are expected because the data is read into an intermediate form into the underlying buffers. This member is intended to be used for import.
NOTE: Currently, loading one row group at a time is required.
Definition at line 2203 of file LazyParquetChunkLoader.cpp.
References threading_serial::async(), CHECK, DEBUG_TIMER, foreign_storage::RowGroupInterval::end_index, foreign_storage::RowGroupInterval::file_path, file_system_, foreign_table_, foreign_storage::ForeignTable::GEO_VALIDATE_GEOMETRY_KEY, shared::get_from_map(), foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getColumnDescriptor(), foreign_storage::ForeignTableSchema::getLogicalColumns(), foreign_storage::OptionsContainer::getOptionAsBool(), foreign_storage::ForeignTableSchema::getParquetColumnIndex(), foreign_storage::open_parquet_table(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_import(), foreign_storage::RowGroupInterval::start_index, logger::thread_id(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_allowed_mapping(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_max_repetition_and_definition_level(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_number_of_columns().
std::list< RowGroupMetadata > foreign_storage::LazyParquetChunkLoader::metadataScan | ( | const std::vector< std::string > & | file_paths, |
const ForeignTableSchema & | schema, | ||
const bool | do_metadata_stats_validation = true |
||
) |
Perform a metadata scan for the paths specified.
file_paths | - (ordered) files of the metadata scan |
schema | - schema of the foreign table to perform metadata scan for |
do_metadata_stats_validation | - validate stats in metadata of parquet files if true |
Returns a list of row group metadata extracted from file_paths.
Definition at line 2514 of file LazyParquetChunkLoader.cpp.
References threading_serial::async(), CHECK, DEBUG_TIMER, Timer< TimeT >::elapsed(), file_reader_cache_, file_system_, foreign_table_, foreign_storage::ForeignTable::GEO_VALIDATE_GEOMETRY_KEY, foreign_storage::get_num_threads(), foreign_storage::get_parquet_table_size(), foreign_storage::ForeignTableSchema::getLogicalAndPhysicalColumns(), foreign_storage::OptionsContainer::getOptionAsBool(), foreign_storage::FileReaderMap::initializeIfEmpty(), foreign_storage::FileReaderMap::insert(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::metadata_scan_rowgroup_interval(), foreign_storage::partition_for_threads(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::populate_encoder_map_for_metadata_scan(), Timer< TimeT >::start(), Timer< TimeT >::stop(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::throw_row_group_larger_than_fragment_size_error(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_parquet_metadata(), and VLOG.
Referenced by foreign_storage::ParquetDataWrapper::getRowGroupMetadataForFilePaths().
DataPreview foreign_storage::LazyParquetChunkLoader::previewFiles | ( | const std::vector< std::string > & | files, |
const size_t | max_num_rows, | ||
const ForeignTable & | table | ||
) |
Preview rows of data and column types in a set of files.
files | - files to preview |
max_num_rows | - maximum number of rows to preview |
table | - foreign table for preview |
Returns a DataPreview instance that contains relevant preview information.
Definition at line 2355 of file LazyParquetChunkLoader.cpp.
References appendRowGroups(), CHECK, CHECK_EQ, CHECK_GE, foreign_storage::PreviewContext::column_chunks, foreign_storage::PreviewContext::column_descriptors, foreign_storage::DataPreview::column_names, foreign_storage::DataPreview::column_types, ColumnDescriptor::columnId, ColumnDescriptor::columnName, ColumnDescriptor::columnType, foreign_storage::create_futures_for_workers(), foreign_storage::PreviewContext::detect_buffers, foreign_storage::detect_geo_type(), file_reader_cache_, file_system_, foreign_storage::get_num_threads(), foreign_storage::FileReaderMap::getOrInsert(), gpu_enabled::iota(), ColumnDescriptor::isSystemCol, ColumnDescriptor::isVirtualCol, kENCODING_NONE, foreign_storage::DataPreview::num_rejected_rows, foreign_storage::PreviewContext::rejected_row_indices_per_column, foreign_storage::DataPreview::sample_rows, suggestColumnMapping(), ColumnDescriptor::tableId, and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::validate_equal_schema().
|
static private |
Suggest a possible Parquet to OmniSci column mapping based on heuristics.
parquet_column | - the column descriptor of the Parquet column |
Returns a suggested SQLTypeInfo given the Parquet column type.
NOTE: the suggested type may be entirely inappropriate given a specific use-case; however, it is guaranteed to be an allowed mapping. For example, geo-types are never attempted to be detected and instead strings are always suggested in their place.
Definition at line 2036 of file LazyParquetChunkLoader.cpp.
References foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::is_valid_parquet_list_column(), foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::suggest_column_scalar_type(), and run_benchmark_import::type.
Referenced by previewFiles().
|
static |
Definition at line 42 of file LazyParquetChunkLoader.h.
Referenced by appendRowGroups(), foreign_storage::ParquetRowGroupReader::readAndValidateRowGroup(), and foreign_storage::anonymous_namespace{LazyParquetChunkLoader.cpp}::resize_values_buffer().
|
private |
Definition at line 171 of file LazyParquetChunkLoader.h.
Referenced by appendRowGroups(), metadataScan(), and previewFiles().
|
private |
Definition at line 170 of file LazyParquetChunkLoader.h.
Referenced by appendRowGroups(), loadRowGroups(), metadataScan(), and previewFiles().
|
private |
Definition at line 173 of file LazyParquetChunkLoader.h.
Referenced by appendRowGroups(), LazyParquetChunkLoader(), loadRowGroups(), and metadataScan().