OmniSciDB
a5dc49c757
|
#include <ParquetDataWrapper.h>
Public Member Functions | |
ParquetDataWrapper () | |
ParquetDataWrapper (const int db_id, const ForeignTable *foreign_table, const bool do_metadata_stats_validation=true) | |
ParquetDataWrapper (const ForeignTable *foreign_table, std::shared_ptr< arrow::fs::FileSystem > file_system) | |
Constructor intended for detect use-case only. More... | |
void | populateChunkMetadata (ChunkMetadataVector &chunk_metadata_vector) override |
void | populateChunkBuffers (const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer) override |
std::string | getSerializedDataWrapper () const override |
void | restoreDataWrapperInternals (const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override |
bool | isRestored () const override |
ParallelismLevel | getCachedParallelismLevel () const override |
ParallelismLevel | getNonCachedParallelismLevel () const override |
DataPreview | getDataPreview (const size_t num_rows) |
Public Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
AbstractFileStorageDataWrapper () | |
void | validateServerOptions (const ForeignServer *foreign_server) const override |
void | validateTableOptions (const ForeignTable *foreign_table) const override |
const std::set < std::string_view > & | getSupportedTableOptions () const override |
void | validateUserMappingOptions (const UserMapping *user_mapping, const ForeignServer *foreign_server) const override |
const std::set < std::string_view > & | getSupportedUserMappingOptions () const override |
const std::set< std::string > | getAlterableTableOptions () const override |
Public Member Functions inherited from foreign_storage::ForeignDataWrapper | |
ForeignDataWrapper ()=default | |
virtual | ~ForeignDataWrapper ()=default |
virtual void | validateSchema (const std::list< ColumnDescriptor > &columns) const |
virtual bool | isLazyFragmentFetchingEnabled () const |
Private Member Functions | |
std::list< const ColumnDescriptor * > | getColumnsToInitialize (const Interval< ColumnType > &column_interval) |
void | initializeChunkBuffers (const int fragment_index, const Interval< ColumnType > &column_interval, const ChunkToBufferMap &required_buffers, const bool reserve_buffers_and_set_stats=false) |
void | fetchChunkMetadata () |
void | loadBuffersUsingLazyParquetChunkLoader (const int logical_column_id, const int fragment_id, const ChunkToBufferMap &required_buffers, AbstractBuffer *delete_buffer) |
std::vector< std::string > | getOrderedProcessedFilePaths () |
std::vector< std::string > | getAllFilePaths () |
bool | moveToNextFragment (size_t new_rows_count) const |
void | finalizeFragmentMap () |
void | addNewFragment (int row_group, const std::string &file_path) |
bool | isNewFile (const std::string &file_path) const |
void | addNewFile (const std::string &file_path) |
void | setLastFileRowCount (const std::string &file_path) |
void | resetParquetMetadata () |
void | metadataScanFiles (const std::vector< std::string > &file_paths) |
void | metadataScanRowGroupMetadata (const std::list< RowGroupMetadata > &row_group_metadata) |
std::list< RowGroupMetadata > | getRowGroupMetadataForFilePaths (const std::vector< std::string > &file_paths) const |
std::map< FilePathAndRowGroup, RowGroupMetadata > | getRowGroupMetadataMap (const std::vector< std::string > &file_paths) const |
void | updateChunkMetadataForFragment (const Interval< ColumnType > &column_interval, const std::list< std::shared_ptr< ChunkMetadata >> &column_chunk_metadata, int32_t fragment_id) |
void | metadataScanRowGroupIntervals (const std::vector< RowGroupInterval > &row_group_intervals) |
void | updateMetadataForRolledOffFiles (const std::set< std::string > &rolled_off_files) |
void | removeMetadataForLastFile (const std::string &last_file_path) |
Private Attributes | |
const bool | do_metadata_stats_validation_ |
std::map< int, std::vector < RowGroupInterval > > | fragment_to_row_group_interval_map_ |
std::map< ChunkKey, std::shared_ptr< ChunkMetadata > > | chunk_metadata_map_ |
const int | db_id_ |
const ForeignTable * | foreign_table_ |
int | last_fragment_index_ |
size_t | last_fragment_row_count_ |
size_t | total_row_count_ |
size_t | last_file_row_count_ |
int | last_row_group_ |
bool | is_restored_ |
std::unique_ptr < ForeignTableSchema > | schema_ |
std::shared_ptr < arrow::fs::FileSystem > | file_system_ |
std::unique_ptr< FileReaderMap > | file_reader_cache_ |
std::mutex | delete_buffer_mutex_ |
Additional Inherited Members | |
Public Types inherited from foreign_storage::ForeignDataWrapper | |
enum | ParallelismLevel { NONE, INTRA_FRAGMENT, INTER_FRAGMENT } |
Static Public Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static shared::FilePathOptions | getFilePathOptions (const ForeignTable *foreign_table) |
Static Public Attributes inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static const std::string | STORAGE_TYPE_KEY = "STORAGE_TYPE" |
static const std::string | BASE_PATH_KEY = "BASE_PATH" |
static const std::string | FILE_PATH_KEY = "FILE_PATH" |
static const std::string | REGEX_PATH_FILTER_KEY = "REGEX_PATH_FILTER" |
static const std::string | LOCAL_FILE_STORAGE_TYPE = "LOCAL_FILE" |
static const std::string | S3_STORAGE_TYPE = "AWS_S3" |
static const std::string | FILE_SORT_ORDER_BY_KEY = shared::FILE_SORT_ORDER_BY_KEY |
static const std::string | FILE_SORT_REGEX_KEY = shared::FILE_SORT_REGEX_KEY |
static const std::string | ALLOW_FILE_ROLL_OFF_KEY = "ALLOW_FILE_ROLL_OFF" |
static const std::string | THREADS_KEY = "THREADS" |
static const std::array < std::string, 1 > | supported_storage_types |
Static Protected Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static std::string | getFullFilePath (const ForeignTable *foreign_table) |
Returns the path to the source file/dir of the table. Depending on options this may result from a concatenation of server and table path options. More... | |
static bool | allowFileRollOff (const ForeignTable *foreign_table) |
Definition at line 42 of file ParquetDataWrapper.h.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | ) |
Definition at line 75 of file ParquetDataWrapper.cpp.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | const int | db_id, |
const ForeignTable * | foreign_table, | ||
const bool | do_metadata_stats_validation = true |
||
) |
Definition at line 92 of file ParquetDataWrapper.cpp.
References file_system_, foreign_storage::ForeignTable::foreign_server, foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
foreign_storage::ParquetDataWrapper::ParquetDataWrapper | ( | const ForeignTable * | foreign_table, |
std::shared_ptr< arrow::fs::FileSystem > | file_system | ||
) |
Constructor intended for detect use-case only.
Definition at line 78 of file ParquetDataWrapper.cpp.
|
private |
Definition at line 232 of file ParquetDataWrapper.cpp.
References foreign_storage::AbstractFileStorageDataWrapper::allowFileRollOff(), CHECK, CHECK_EQ, foreign_table_, fragment_to_row_group_interval_map_, last_fragment_index_, last_row_group_, and setLastFileRowCount().
Referenced by metadataScanRowGroupMetadata().
|
private |
Definition at line 202 of file ParquetDataWrapper.cpp.
References CHECK, fragment_to_row_group_interval_map_, last_fragment_index_, last_fragment_row_count_, last_row_group_, and setLastFileRowCount().
Referenced by metadataScanRowGroupMetadata().
|
private |
Definition at line 255 of file ParquetDataWrapper.cpp.
References foreign_storage::AbstractFileStorageDataWrapper::allowFileRollOff(), CHECK, CHECK_EQ, shared::check_for_rolled_off_file_paths(), chunk_metadata_map_, shared::contains(), db_id_, file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), Catalog_Namespace::SysCatalog::getCatalog(), getOrderedProcessedFilePaths(), Catalog_Namespace::SysCatalog::instance(), foreign_storage::ForeignTable::isAppendMode(), last_file_row_count_, metadataScanFiles(), removeMetadataForLastFile(), resetParquetMetadata(), foreign_storage::throw_removed_file_error(), foreign_storage::throw_removed_row_in_file_error(), and updateMetadataForRolledOffFiles().
Referenced by populateChunkMetadata().
|
private |
Definition at line 197 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_, last_fragment_index_, and last_row_group_.
Referenced by metadataScanRowGroupMetadata().
|
private |
Definition at line 395 of file ParquetDataWrapper.cpp.
References DEBUG_TIMER, foreign_storage::ForeignTable::foreign_server, foreign_table_, foreign_storage::AbstractFileStorageDataWrapper::getFilePathOptions(), foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath(), foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, shared::local_glob_filter_sort_files(), foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
Referenced by fetchChunkMetadata(), and getDataPreview().
|
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 70 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTER_FRAGMENT.
|
private |
Definition at line 126 of file ParquetDataWrapper.cpp.
References CHECK, db_id_, foreign_storage::Interval< T >::end, Catalog_Namespace::SysCatalog::getCatalog(), Catalog_Namespace::SysCatalog::instance(), schema_, and foreign_storage::Interval< T >::start.
Referenced by initializeChunkBuffers().
DataPreview foreign_storage::ParquetDataWrapper::getDataPreview | ( | const size_t | num_rows | ) |
Definition at line 735 of file ParquetDataWrapper.cpp.
References file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), and foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath().
|
inlineoverridevirtual |
Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 72 of file ParquetDataWrapper.h.
References foreign_storage::ForeignDataWrapper::INTRA_FRAGMENT.
|
private |
Definition at line 383 of file ParquetDataWrapper.cpp.
References fragment_to_row_group_interval_map_.
Referenced by fetchChunkMetadata().
|
private |
Definition at line 441 of file ParquetDataWrapper.cpp.
References do_metadata_stats_validation_, file_reader_cache_, file_system_, foreign_table_, foreign_storage::LazyParquetChunkLoader::metadataScan(), and schema_.
Referenced by getRowGroupMetadataMap(), and metadataScanFiles().
|
private |
Definition at line 844 of file ParquetDataWrapper.cpp.
References getRowGroupMetadataForFilePaths().
Referenced by metadataScanRowGroupIntervals(), and updateMetadataForRolledOffFiles().
|
overridevirtual |
Serializes the internal state of the wrapper and returns it as a string, if implemented.
Implements foreign_storage::ForeignDataWrapper.
Definition at line 687 of file ParquetDataWrapper.cpp.
References json_utils::add_value_to_object(), fragment_to_row_group_interval_map_, last_file_row_count_, last_fragment_index_, last_fragment_row_count_, last_row_group_, total_row_count_, and json_utils::write_to_string().
|
private |
Definition at line 143 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, db_id_, foreign_table_, shared::get_from_map(), getColumnsToInitialize(), kENCODING_NONE, and TableDescriptor::tableId.
Referenced by loadBuffersUsingLazyParquetChunkLoader().
|
private |
Definition at line 215 of file ParquetDataWrapper.cpp.
References foreign_storage::AbstractFileStorageDataWrapper::allowFileRollOff(), CHECK, CHECK_EQ, foreign_table_, fragment_to_row_group_interval_map_, and last_fragment_index_.
Referenced by metadataScanRowGroupMetadata().
|
overridevirtual |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 731 of file ParquetDataWrapper.cpp.
References is_restored_.
|
private |
Definition at line 487 of file ParquetDataWrapper.cpp.
References Data_Namespace::AbstractBuffer::append(), CHECK, CHECK_GT, chunk_metadata_map_, ColumnDescriptor::columnType, db_id_, delete_buffer_mutex_, foreign_storage::Interval< T >::end, file_reader_cache_, file_system_, foreign_table_, fragment_to_row_group_interval_map_, SQLTypeInfo::get_comp_param(), SQLTypeInfo::get_elem_type(), shared::get_from_map(), SQLTypeInfo::get_physical_cols(), Catalog_Namespace::SysCatalog::getCatalog(), Data_Namespace::AbstractBuffer::getMemoryPtr(), initializeChunkBuffers(), Catalog_Namespace::SysCatalog::instance(), SQLTypeInfo::is_array(), SQLTypeInfo::is_dict_encoded_string(), SQLTypeInfo::is_geometry(), foreign_storage::LazyParquetChunkLoader::loadChunk(), schema_, Data_Namespace::AbstractBuffer::size(), foreign_storage::Interval< T >::start, and TableDescriptor::tableId.
Referenced by populateChunkBuffers().
|
private |
Definition at line 409 of file ParquetDataWrapper.cpp.
References getRowGroupMetadataForFilePaths(), and metadataScanRowGroupMetadata().
Referenced by fetchChunkMetadata().
|
private |
Definition at line 824 of file ParquetDataWrapper.cpp.
References shared::get_from_map(), getRowGroupMetadataMap(), and metadataScanRowGroupMetadata().
Referenced by removeMetadataForLastFile().
|
private |
Definition at line 414 of file ParquetDataWrapper.cpp.
References addNewFile(), addNewFragment(), CHECK_EQ, finalizeFragmentMap(), isNewFile(), last_fragment_index_, last_fragment_row_count_, last_row_group_, moveToNextFragment(), schema_, total_row_count_, and updateChunkMetadataForFragment().
Referenced by metadataScanFiles(), and metadataScanRowGroupIntervals().
|
private |
Definition at line 474 of file ParquetDataWrapper.cpp.
References foreign_table_, last_fragment_row_count_, and TableDescriptor::maxFragRows.
Referenced by metadataScanRowGroupMetadata().
|
overridevirtual |
Populates given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.
required_buffers | - chunk buffers that must always be populated |
optional_buffers | - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats) |
delete_buffer | - chunk buffer for the fragment's delete column; if non-null, the data wrapper is expected to mark deleted rows in the buffer and continue processing |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 620 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, CHUNK_KEY_COLUMN_IDX, CHUNK_KEY_FRAGMENT_IDX, foreign_storage::create_futures_for_workers(), db_id_, DEBUG_TIMER_NEW_THREAD, foreign_table_, foreign_storage::get_num_threads(), loadBuffersUsingLazyParquetChunkLoader(), schema_, logger::ThreadLocalIds::setNewThreadId(), TableDescriptor::tableId, logger::ThreadLocalIds::thread_id_, logger::thread_local_ids(), to_string(), and VLOG.
|
overridevirtual |
Populates given chunk metadata vector with metadata for all chunks in related foreign table.
chunk_metadata_vector | - vector that will be populated with chunk metadata |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 479 of file ParquetDataWrapper.cpp.
References chunk_metadata_map_, and fetchChunkMetadata().
|
private |
Definition at line 746 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, CHUNK_KEY_FRAGMENT_IDX, chunk_metadata_map_, fragment_to_row_group_interval_map_, shared::get_from_map(), last_fragment_index_, last_fragment_row_count_, last_row_group_, metadataScanRowGroupIntervals(), resetParquetMetadata(), and total_row_count_.
Referenced by fetchChunkMetadata().
|
private |
Definition at line 114 of file ParquetDataWrapper.cpp.
References file_reader_cache_, fragment_to_row_group_interval_map_, last_file_row_count_, last_fragment_index_, last_fragment_row_count_, last_row_group_, and total_row_count_.
Referenced by fetchChunkMetadata(), and removeMetadataForLastFile().
|
overridevirtual |
Restores the internal state of the data wrapper.
file_path | - location of file created by serializeMetadata |
chunk_metadata_vector | - vector of chunk metadata recovered from disk |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 707 of file ParquetDataWrapper.cpp.
References CHECK, chunk_metadata_map_, fragment_to_row_group_interval_map_, json_utils::get_value_from_object(), is_restored_, last_file_row_count_, last_fragment_index_, last_fragment_row_count_, last_row_group_, json_utils::read_from_file(), and total_row_count_.
|
private |
Definition at line 250 of file ParquetDataWrapper.cpp.
References file_reader_cache_, file_system_, and last_file_row_count_.
Referenced by addNewFile(), and addNewFragment().
|
private |
Definition at line 448 of file ParquetDataWrapper.cpp.
References CHECK, CHECK_EQ, chunk_metadata_map_, db_id_, foreign_storage::Interval< T >::end, foreign_table_, foreign_storage::anonymous_namespace{ParquetDataWrapper.cpp}::reduce_metadata(), schema_, foreign_storage::Interval< T >::start, and TableDescriptor::tableId.
Referenced by metadataScanRowGroupMetadata(), and updateMetadataForRolledOffFiles().
|
private |
Definition at line 313 of file ParquetDataWrapper.cpp.
References CHECK, CHUNK_KEY_FRAGMENT_IDX, chunk_metadata_map_, shared::contains(), fragment_to_row_group_interval_map_, shared::get_from_map(), getRowGroupMetadataMap(), schema_, and updateChunkMetadataForFragment().
Referenced by fetchChunkMetadata().
|
private |
Definition at line 132 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), populateChunkMetadata(), removeMetadataForLastFile(), restoreDataWrapperInternals(), updateChunkMetadataForFragment(), and updateMetadataForRolledOffFiles().
|
private |
Definition at line 133 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getColumnsToInitialize(), initializeChunkBuffers(), loadBuffersUsingLazyParquetChunkLoader(), populateChunkBuffers(), and updateChunkMetadataForFragment().
|
private |
Definition at line 145 of file ParquetDataWrapper.h.
Referenced by loadBuffersUsingLazyParquetChunkLoader().
|
private |
Definition at line 130 of file ParquetDataWrapper.h.
Referenced by getRowGroupMetadataForFilePaths().
|
private |
Definition at line 143 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getDataPreview(), getRowGroupMetadataForFilePaths(), loadBuffersUsingLazyParquetChunkLoader(), resetParquetMetadata(), and setLastFileRowCount().
|
private |
Definition at line 142 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getDataPreview(), getRowGroupMetadataForFilePaths(), loadBuffersUsingLazyParquetChunkLoader(), ParquetDataWrapper(), and setLastFileRowCount().
|
private |
Definition at line 134 of file ParquetDataWrapper.h.
Referenced by addNewFile(), fetchChunkMetadata(), getAllFilePaths(), getDataPreview(), getRowGroupMetadataForFilePaths(), initializeChunkBuffers(), isNewFile(), loadBuffersUsingLazyParquetChunkLoader(), moveToNextFragment(), populateChunkBuffers(), and updateChunkMetadataForFragment().
|
private |
Definition at line 131 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getOrderedProcessedFilePaths(), getSerializedDataWrapper(), isNewFile(), loadBuffersUsingLazyParquetChunkLoader(), removeMetadataForLastFile(), resetParquetMetadata(), restoreDataWrapperInternals(), and updateMetadataForRolledOffFiles().
|
private |
Definition at line 140 of file ParquetDataWrapper.h.
Referenced by isRestored(), and restoreDataWrapperInternals().
|
private |
Definition at line 138 of file ParquetDataWrapper.h.
Referenced by fetchChunkMetadata(), getSerializedDataWrapper(), resetParquetMetadata(), restoreDataWrapperInternals(), and setLastFileRowCount().
|
private |
Definition at line 135 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getSerializedDataWrapper(), isNewFile(), metadataScanRowGroupMetadata(), removeMetadataForLastFile(), resetParquetMetadata(), and restoreDataWrapperInternals().
|
private |
Definition at line 136 of file ParquetDataWrapper.h.
Referenced by addNewFragment(), getSerializedDataWrapper(), metadataScanRowGroupMetadata(), moveToNextFragment(), removeMetadataForLastFile(), resetParquetMetadata(), and restoreDataWrapperInternals().
|
private |
Definition at line 139 of file ParquetDataWrapper.h.
Referenced by addNewFile(), addNewFragment(), finalizeFragmentMap(), getSerializedDataWrapper(), metadataScanRowGroupMetadata(), removeMetadataForLastFile(), resetParquetMetadata(), and restoreDataWrapperInternals().
|
private |
Definition at line 141 of file ParquetDataWrapper.h.
Referenced by getColumnsToInitialize(), getRowGroupMetadataForFilePaths(), loadBuffersUsingLazyParquetChunkLoader(), metadataScanRowGroupMetadata(), populateChunkBuffers(), updateChunkMetadataForFragment(), and updateMetadataForRolledOffFiles().
|
private |
Definition at line 137 of file ParquetDataWrapper.h.
Referenced by getSerializedDataWrapper(), metadataScanRowGroupMetadata(), removeMetadataForLastFile(), resetParquetMetadata(), and restoreDataWrapperInternals().