24 #include <parquet/metadata.h>
26 namespace foreign_storage {
40 const int16_t* rep_levels,
41 const int64_t values_read,
42 const int64_t levels_read,
45 virtual void appendData(
const int16_t* def_levels,
46 const int16_t* rep_levels,
47 const int64_t values_read,
48 const int64_t levels_read,
52 const parquet::RowGroupMetaData* group_metadata,
53 const int parquet_column_index,
55 int64_t null_count{0};
60 auto column_metadata = group_metadata->ColumnChunk(parquet_column_index);
62 null_count =
stats->null_count();
67 metadata->chunkStats.has_nulls = null_count > 0;
70 metadata->numElements = group_metadata->num_rows();
97 auto metadata = std::make_shared<ChunkMetadata>();
101 auto encoder = buffer.getEncoder();
102 encoder->getMetadata(metadata);
103 metadata->sqlType = column_type;
108 std::stringstream error_message;
109 error_message <<
"A null value was detected in Parquet column '"
110 << parquet_column_name <<
"' but HeavyDB column is set to not null";
111 throw std::runtime_error(error_message.str());
117 bool has_nulls = null_count > 0;
132 const int16_t* rep_levels,
133 const int64_t values_read,
134 const int64_t levels_read,
144 virtual void setNull(int8_t* omnisci_data_bytes) = 0;
145 virtual void copy(
const int8_t* omnisci_data_bytes_source,
146 int8_t* omnisci_data_bytes_destination) = 0;
148 int8_t* omnisci_data_bytes) = 0;
151 int8_t* omnisci_data_bytes,
152 const size_t num_elements) = 0;
154 virtual void validate(
const int8_t* parquet_data,
159 const int64_t j)
const = 0;
ParquetEncoder(Data_Namespace::AbstractBuffer *buffer)
virtual void initializeErrorTracking()
bool is_error_tracking_enabled_
RejectedRowIndices invalid_indices_
virtual ~ParquetEncoder()=default
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
virtual std::string encodedDataToString(const int8_t *bytes) const =0
static void throwNotNullViolation(const std::string &parquet_column_name)
virtual void disableMetadataStatsValidation()
virtual void appendDataTrackErrors(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values)=0
RejectedRowIndices getRejectedRowIndices() const
virtual void encodeAndCopy(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes)=0
virtual void appendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values)=0
void initEncoder(const SQLTypeInfo &tmp_sql_type)
std::set< int64_t > InvalidRowGroupIndices
static void validateNullCount(const std::string &parquet_column_name, int64_t null_count, const SQLTypeInfo &column_type)
virtual void setNull(int8_t *omnisci_data_bytes)=0
An AbstractBuffer is a unit of data management for a data manager.
bool g_enable_smem_group_by true
virtual void validate(const int8_t *parquet_data, const int64_t j, const SQLTypeInfo &column_type) const =0
static std::shared_ptr< ChunkMetadata > createMetadata(const SQLTypeInfo &column_type)
virtual std::shared_ptr< ChunkMetadata > getRowGroupMetadata(const parquet::RowGroupMetaData *group_metadata, const int parquet_column_index, const SQLTypeInfo &column_type)
virtual void encodeAndCopyContiguous(const int8_t *parquet_data_bytes, int8_t *omnisci_data_bytes, const size_t num_elements)=0
virtual void validateUsingEncodersColumnType(const int8_t *parquet_data, const int64_t j) const =0
virtual void validateAndAppendData(const int16_t *def_levels, const int16_t *rep_levels, const int64_t values_read, const int64_t levels_read, int8_t *values, const SQLTypeInfo &column_type, InvalidRowGroupIndices &invalid_indices)=0
virtual void eraseInvalidIndicesInBuffer(const InvalidRowGroupIndices &invalid_indices)=0
std::set< int64_t > RejectedRowIndices
virtual void initializeColumnType(const SQLTypeInfo &column_type)
bool g_enable_watchdog false
ParquetScalarEncoder(Data_Namespace::AbstractBuffer *buffer)
HOST DEVICE bool get_notnull() const
bool validate_metadata_stats_
Data_Namespace::AbstractBuffer * buffer_
SQLTypeInfo get_elem_type() const
size_t current_chunk_offset_
virtual void copy(const int8_t *omnisci_data_bytes_source, int8_t *omnisci_data_bytes_destination)=0