19 #include <parquet/column_scanner.h>
20 #include <parquet/exception.h>
21 #include <parquet/platform.h>
24 namespace foreign_storage {
27 std::shared_ptr<arrow::fs::FileSystem>& file_system) {
29 auto file_result = file_system->OpenInputFile(file_path);
30 if (!file_result.ok()) {
31 throw std::runtime_error{
"Unable to access " + file_system->type_name() +
" file: " +
32 file_path +
". " + file_result.status().message()};
34 auto infile = file_result.ValueOrDie();
35 PARQUET_THROW_NOT_OK(OpenFile(infile, arrow::default_memory_pool(), &reader));
40 auto file_metadata = reader->parquet_reader()->metadata();
41 const auto num_row_groups = file_metadata->num_row_groups();
42 const auto num_columns = file_metadata->num_columns();
43 return std::make_pair(num_row_groups, num_columns);
47 const parquet::arrow::FileReader* reader,
48 const int logical_column_index) {
49 return reader->parquet_reader()->metadata()->schema()->Column(logical_column_index);
53 return reader->parquet_reader()
56 ->Column(logical_column_index)
61 const parquet::ColumnDescriptor* reference_descriptor,
62 const parquet::ColumnDescriptor* new_descriptor,
63 const std::string& reference_file_path,
64 const std::string& new_file_path) {
65 if (!reference_descriptor->Equals(*new_descriptor)) {
66 throw std::runtime_error{
"Parquet file \"" + new_file_path +
67 "\" has a different schema. Please ensure that all Parquet "
68 "files use the same schema. Reference Parquet file: " +
70 ", column name: " + reference_descriptor->name() +
71 ". New Parquet file: " + new_file_path +
72 ", column name: " + new_descriptor->name() +
"."};
79 if (column_type.get_size() == -1 && column_type.is_dict_encoded_string()) {
82 return std::make_unique<ColumnDescriptor>(
87 const parquet::ColumnChunkMetaData* column_metadata) {
88 CHECK(column_metadata->is_stats_set());
89 std::shared_ptr<parquet::Statistics>
stats = column_metadata->statistics();
std::shared_ptr< parquet::Statistics > validate_and_get_column_metadata_statistics(const parquet::ColumnChunkMetaData *column_metadata)
std::pair< int, int > get_parquet_table_size(const ReaderPtr &reader)
std::unique_ptr< ColumnDescriptor > get_sub_type_column_descriptor(const ColumnDescriptor *column)
parquet::Type::type get_physical_type(ReaderPtr &reader, const int logical_column_index)
void validate_equal_column_descriptor(const parquet::ColumnDescriptor *reference_descriptor, const parquet::ColumnDescriptor *new_descriptor, const std::string &reference_file_path, const std::string &new_file_path)
UniqueReaderPtr open_parquet_table(const std::string &file_path, std::shared_ptr< arrow::fs::FileSystem > &file_system)
const parquet::ColumnDescriptor * get_column_descriptor(const parquet::arrow::FileReader *reader, const int logical_column_index)
Specifies the in-memory content of a row in the column metadata table.
parquet::arrow::FileReader * ReaderPtr
std::unique_ptr< parquet::arrow::FileReader > UniqueReaderPtr
SQLTypeInfo get_elem_type() const