OmniSciDB
a5dc49c757
|
#include <ParquetImporter.h>
Public Member Functions | |
ParquetImporter () | |
ParquetImporter (const int db_id, const ForeignTable *foreign_table, const UserMapping *user_mapping) | |
void | populateChunkMetadata (ChunkMetadataVector &chunk_metadata_vector) override |
void | populateChunkBuffers (const ChunkToBufferMap &required_buffers, const ChunkToBufferMap &optional_buffers, AbstractBuffer *delete_buffer) override |
std::string | getSerializedDataWrapper () const override |
void | restoreDataWrapperInternals (const std::string &file_path, const ChunkMetadataVector &chunk_metadata_vector) override |
bool | isRestored () const override |
ParallelismLevel | getCachedParallelismLevel () const override |
ParallelismLevel | getNonCachedParallelismLevel () const override |
std::unique_ptr < import_export::ImportBatchResult > | getNextImportBatch () |
std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > | getStringDictionaries () const |
int | getMaxNumUsefulThreads () const |
void | setNumThreads (const int num_threads) |
Public Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
AbstractFileStorageDataWrapper () | |
void | validateServerOptions (const ForeignServer *foreign_server) const override |
void | validateTableOptions (const ForeignTable *foreign_table) const override |
const std::set < std::string_view > & | getSupportedTableOptions () const override |
void | validateUserMappingOptions (const UserMapping *user_mapping, const ForeignServer *foreign_server) const override |
const std::set < std::string_view > & | getSupportedUserMappingOptions () const override |
const std::set< std::string > | getAlterableTableOptions () const override |
Public Member Functions inherited from foreign_storage::ForeignDataWrapper | |
ForeignDataWrapper ()=default | |
virtual | ~ForeignDataWrapper ()=default |
virtual void | validateSchema (const std::list< ColumnDescriptor > &columns) const |
virtual bool | isLazyFragmentFetchingEnabled () const |
Private Member Functions | |
std::set< std::string > | getAllFilePaths () |
Private Attributes | |
const int | db_id_ |
const ForeignTable * | foreign_table_ |
int | num_threads_ |
std::unique_ptr < AbstractRowGroupIntervalTracker > | row_group_interval_tracker_ |
std::unique_ptr < ForeignTableSchema > | schema_ |
std::shared_ptr < arrow::fs::FileSystem > | file_system_ |
std::unique_ptr< FileReaderMap > | file_reader_cache_ |
std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > | string_dictionaries_per_column_ |
std::shared_mutex | row_group_interval_tracker_mutex_ |
std::shared_mutex | string_dictionaries_per_column_mutex_ |
Additional Inherited Members | |
Public Types inherited from foreign_storage::ForeignDataWrapper | |
enum | ParallelismLevel { NONE, INTRA_FRAGMENT, INTER_FRAGMENT } |
Static Public Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static shared::FilePathOptions | getFilePathOptions (const ForeignTable *foreign_table) |
Static Public Attributes inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static const std::string | STORAGE_TYPE_KEY = "STORAGE_TYPE" |
static const std::string | BASE_PATH_KEY = "BASE_PATH" |
static const std::string | FILE_PATH_KEY = "FILE_PATH" |
static const std::string | REGEX_PATH_FILTER_KEY = "REGEX_PATH_FILTER" |
static const std::string | LOCAL_FILE_STORAGE_TYPE = "LOCAL_FILE" |
static const std::string | S3_STORAGE_TYPE = "AWS_S3" |
static const std::string | FILE_SORT_ORDER_BY_KEY = shared::FILE_SORT_ORDER_BY_KEY |
static const std::string | FILE_SORT_REGEX_KEY = shared::FILE_SORT_REGEX_KEY |
static const std::string | ALLOW_FILE_ROLL_OFF_KEY = "ALLOW_FILE_ROLL_OFF" |
static const std::string | THREADS_KEY = "THREADS" |
static const std::array < std::string, 1 > | supported_storage_types |
Static Protected Member Functions inherited from foreign_storage::AbstractFileStorageDataWrapper | |
static std::string | getFullFilePath (const ForeignTable *foreign_table) |
Returns the path to the source file/dir of the table. Depending on options this may result from a concatenation of server and table path options. More... | |
static bool | allowFileRollOff (const ForeignTable *foreign_table) |
Definition at line 40 of file ParquetImporter.h.
foreign_storage::ParquetImporter::ParquetImporter | ( | ) |
Definition at line 226 of file ParquetImporter.cpp.
foreign_storage::ParquetImporter::ParquetImporter | ( | const int | db_id, |
const ForeignTable * | foreign_table, | ||
const UserMapping * | user_mapping | ||
) |
Definition at line 229 of file ParquetImporter.cpp.
References file_system_, foreign_storage::ForeignTable::foreign_server, foreign_storage::AbstractFileStorageDataWrapper::LOCAL_FILE_STORAGE_TYPE, foreign_storage::OptionsContainer::options, foreign_storage::AbstractFileStorageDataWrapper::STORAGE_TYPE_KEY, and UNREACHABLE.
std::set< std::string > foreign_storage::ParquetImporter::getAllFilePaths | ( | ) |
private |
Definition at line 245 of file ParquetImporter.cpp.
References CHECK_EQ, DEBUG_TIMER, file_system_, foreign_table_, foreign_storage::AbstractFileStorageDataWrapper::getFullFilePath(), foreign_storage::throw_file_access_error(), and foreign_storage::throw_file_not_found_error().
Referenced by getNextImportBatch().
ParallelismLevel foreign_storage::ParquetImporter::getCachedParallelismLevel | ( | ) | const |
inline override virtual |
Gets the desired level of parallelism for the data wrapper when a cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 62 of file ParquetImporter.h.
References UNREACHABLE.
int foreign_storage::ParquetImporter::getMaxNumUsefulThreads | ( | ) | const |
Get the maximum number of threads that can do useful computation.
Definition at line 218 of file ParquetImporter.cpp.
References schema_.
std::unique_ptr< import_export::ImportBatchResult > foreign_storage::ParquetImporter::getNextImportBatch | ( | ) |
Produce the next ImportBatchResult for import. This is the only functionality of ParquetImporter that is required to be implemented.
Returns: the next ImportBatchResult for import.
Definition at line 307 of file ParquetImporter.cpp.
References db_id_, file_reader_cache_, file_system_, foreign_table_, getAllFilePaths(), num_threads_, row_group_interval_tracker_, row_group_interval_tracker_mutex_, schema_, string_dictionaries_per_column_, and string_dictionaries_per_column_mutex_.
ParallelismLevel foreign_storage::ParquetImporter::getNonCachedParallelismLevel | ( | ) | const |
inline override virtual |
Gets the desired level of parallelism for the data wrapper when no cache is in use. This affects the optional buffers that the data wrapper is made aware of during data requests.
Reimplemented from foreign_storage::ForeignDataWrapper.
Definition at line 67 of file ParquetImporter.h.
References UNREACHABLE.
std::string foreign_storage::ParquetImporter::getSerializedDataWrapper | ( | ) | const |
override virtual |
Serialize internal state of wrapper into file at given path if implemented
Implements foreign_storage::ForeignDataWrapper.
Definition at line 297 of file ParquetImporter.cpp.
References UNREACHABLE.
std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > foreign_storage::ParquetImporter::getStringDictionaries | ( | ) | const |
Return string dictionaries that are used per column.
Definition at line 303 of file ParquetImporter.cpp.
References string_dictionaries_per_column_.
bool foreign_storage::ParquetImporter::isRestored | ( | ) | const |
override virtual |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 360 of file ParquetImporter.cpp.
References UNREACHABLE.
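void foreign_storage::ParquetImporter::populateChunkBuffers | ( | const ChunkToBufferMap & | required_buffers, |
const ChunkToBufferMap & | optional_buffers, | ||
AbstractBuffer * | delete_buffer | ||
) |
override virtual |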
Populates given chunk buffers identified by chunk keys. All provided chunk buffers are expected to be for the same fragment.
required_buffers | - chunk buffers that must always be populated |
optional_buffers | - chunk buffers that can be optionally populated, if the data wrapper has to scan through chunk data anyway (typically for row-wise data formats) |
delete_buffer | - chunk buffer for the fragment's delete column; if non-null, the data wrapper is expected to mark deleted rows in the buffer and continue processing |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 291 of file ParquetImporter.cpp.
References UNREACHABLE.
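void foreign_storage::ParquetImporter::populateChunkMetadata | ( | ChunkMetadataVector & | chunk_metadata_vector | ) |
override virtual |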
Populates given chunk metadata vector with metadata for all chunks in related foreign table.
chunk_metadata_vector | - vector that will be populated with chunk metadata |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 287 of file ParquetImporter.cpp.
References UNREACHABLE.
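void foreign_storage::ParquetImporter::restoreDataWrapperInternals | ( | const std::string & | file_path, |
const ChunkMetadataVector & | chunk_metadata_vector | ||
) |
override virtual |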
Restore the internal state of the data wrapper.
file_path | - location of file created by serializeMetadata |
chunk_metadata_vector | - vector of chunk metadata recovered from disk |
Implements foreign_storage::ForeignDataWrapper.
Definition at line 354 of file ParquetImporter.cpp.
References UNREACHABLE.
void foreign_storage::ParquetImporter::setNumThreads | ( | const int | num_threads | ) |
Set the number of threads to use internally when reading batches.
Definition at line 222 of file ParquetImporter.cpp.
References num_threads_.
const int foreign_storage::ParquetImporter::db_id_ |
private |
Definition at line 99 of file ParquetImporter.h.
Referenced by getNextImportBatch().
std::unique_ptr< FileReaderMap > foreign_storage::ParquetImporter::file_reader_cache_ |
private |
Definition at line 109 of file ParquetImporter.h.
Referenced by getNextImportBatch().
std::shared_ptr< arrow::fs::FileSystem > foreign_storage::ParquetImporter::file_system_ |
private |
Definition at line 108 of file ParquetImporter.h.
Referenced by getAllFilePaths(), getNextImportBatch(), and ParquetImporter().
const ForeignTable * foreign_storage::ParquetImporter::foreign_table_ |
private |
Definition at line 100 of file ParquetImporter.h.
Referenced by getAllFilePaths(), and getNextImportBatch().
int foreign_storage::ParquetImporter::num_threads_ |
private |
Definition at line 101 of file ParquetImporter.h.
Referenced by getNextImportBatch(), and setNumThreads().
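std::unique_ptr< AbstractRowGroupIntervalTracker > foreign_storage::ParquetImporter::row_group_interval_tracker_ |
private |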
Definition at line 105 of file ParquetImporter.h.
Referenced by getNextImportBatch().
std::shared_mutex foreign_storage::ParquetImporter::row_group_interval_tracker_mutex_ |
private |
Definition at line 113 of file ParquetImporter.h.
Referenced by getNextImportBatch().
std::unique_ptr< ForeignTableSchema > foreign_storage::ParquetImporter::schema_ |
private |
Definition at line 107 of file ParquetImporter.h.
Referenced by getMaxNumUsefulThreads(), and getNextImportBatch().
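std::vector< std::pair< const ColumnDescriptor *, StringDictionary * > > foreign_storage::ParquetImporter::string_dictionaries_per_column_ |
private |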
Definition at line 111 of file ParquetImporter.h.
Referenced by getNextImportBatch(), and getStringDictionaries().
std::shared_mutex foreign_storage::ParquetImporter::string_dictionaries_per_column_mutex_ |
private |
Definition at line 114 of file ParquetImporter.h.
Referenced by getNextImportBatch().