11#include " parquet_reader.hpp"
22
3- #include " reader/boolean_column_reader.hpp"
4- #include " reader/callback_column_reader.hpp"
3+ #include " duckdb/common/optional_ptr.hpp"
4+ #include " duckdb/function/partition_stats.hpp"
5+ #include " parquet_types.h"
56#include " column_reader.hpp"
6- #include " duckdb.hpp"
77#include " reader/expression_column_reader.hpp"
88#include " parquet_geometry.hpp"
99#include " reader/list_column_reader.hpp"
1010#include " parquet_crypto.hpp"
1111#include " parquet_file_metadata_cache.hpp"
1212#include " parquet_statistics.hpp"
13- #include " parquet_timestamp.hpp"
1413#include " mbedtls_wrapper.hpp"
1514#include " reader/row_number_column_reader.hpp"
16- #include " reader/string_column_reader.hpp"
1715#include " reader/variant_column_reader.hpp"
1816#include " reader/struct_column_reader.hpp"
19- #include " reader/templated_column_reader.hpp"
2017#include " thrift_tools.hpp"
2118#include " duckdb/main/config.hpp"
2219#include " duckdb/common/encryption_state.hpp"
2320#include " duckdb/common/file_system.hpp"
2421#include " duckdb/common/helper.hpp"
25- #include " duckdb/common/hive_partitioning.hpp"
2622#include " duckdb/common/string_util.hpp"
2723#include " duckdb/planner/table_filter.hpp"
2824#include " duckdb/storage/object_cache.hpp"
2925#include " duckdb/optimizer/statistics_propagator.hpp"
3026#include " duckdb/planner/table_filter_state.hpp"
3127#include " duckdb/common/multi_file/multi_file_reader.hpp"
32- #include " duckdb/logging/log_manager.hpp"
33- #include " duckdb/common/multi_file/multi_file_column_mapper.hpp"
34- #include " duckdb/common/encryption_functions.hpp"
35-
36- #include < cassert>
37- #include < chrono>
38- #include < cstring>
39- #include < sstream>
4028
4129namespace duckdb {
4230
@@ -176,7 +164,7 @@ LoadMetadata(ClientContext &context, Allocator &allocator, CachingFileHandle &fi
176164 }
177165 ParquetCrypto::GenerateAdditionalAuthenticatedData (allocator, aad_crypto_metadata);
178166 ParquetCrypto::Read (*metadata, *file_proto, encryption_config->GetFooterKey (), encryption_util,
179- std::move ( aad_crypto_metadata) );
167+ aad_crypto_metadata);
180168 } else {
181169 metadata->read (file_proto.get ());
182170 }
@@ -650,8 +638,8 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
650638 if (is_repeated) {
651639 auto list_type = LogicalType::LIST (result.type );
652640 vector<ParquetColumnSchema> list_child = {std::move (result)};
653- result = ParquetColumnSchema::FromChildSchemas (s_ele.name , std::move ( list_type) , max_define, max_repeat,
654- this_idx, next_file_idx, std::move (list_child));
641+ result = ParquetColumnSchema::FromChildSchemas (s_ele.name , list_type, max_define, max_repeat, this_idx ,
642+ next_file_idx, std::move (list_child));
655643 }
656644 result.parent_schema_index = this_idx;
657645 return result;
@@ -665,8 +653,8 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
665653 if (s_ele.repetition_type == FieldRepetitionType::REPEATED) {
666654 auto list_type = LogicalType::LIST (result.type );
667655 vector<ParquetColumnSchema> list_child = {std::move (result)};
668- return ParquetColumnSchema::FromChildSchemas (s_ele.name , std::move ( list_type) , max_define, max_repeat,
669- this_idx, next_file_idx, std::move (list_child));
656+ return ParquetColumnSchema::FromChildSchemas (s_ele.name , list_type, max_define, max_repeat, this_idx ,
657+ next_file_idx, std::move (list_child));
670658 }
671659
672660 return result;
@@ -1233,17 +1221,64 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanStat
12331221}
12341222
12351223void ParquetReader::GetPartitionStats (vector<PartitionStatistics> &result) {
1236- GetPartitionStats (*GetFileMetadata (), result);
1224+ GetPartitionStats (*GetFileMetadata (), result, *root_schema, parquet_options );
12371225}
12381226
1239- void ParquetReader::GetPartitionStats (const duckdb_parquet::FileMetaData &metadata,
1240- vector<PartitionStatistics> &result) {
1227+ struct ParquetPartitionRowGroup : public PartitionRowGroup {
1228+ ParquetPartitionRowGroup (const duckdb_parquet::FileMetaData &metadata_p,
1229+ optional_ptr<ParquetColumnSchema> root_schema_p,
1230+ optional_ptr<ParquetOptions> parquet_options_p, const idx_t row_group_idx_p)
1231+ : metadata(metadata_p), root_schema(root_schema_p), parquet_options(parquet_options_p),
1232+ row_group_idx (row_group_idx_p) {
1233+ }
1234+
1235+ const duckdb_parquet::FileMetaData &metadata;
1236+ const optional_ptr<ParquetColumnSchema> root_schema;
1237+ const optional_ptr<ParquetOptions> parquet_options;
1238+ const idx_t row_group_idx;
1239+
1240+ unique_ptr<BaseStatistics> GetColumnStatistics (const StorageIndex &storage_index) override {
1241+ const idx_t primary_index = storage_index.GetPrimaryIndex ();
1242+ D_ASSERT (metadata.row_groups .size () > row_group_idx);
1243+ D_ASSERT (root_schema->children .size () > primary_index);
1244+
1245+ const auto &row_group = metadata.row_groups [row_group_idx];
1246+ const auto &column_schema = root_schema->children [primary_index];
1247+ return column_schema.Stats (metadata, *parquet_options, row_group_idx, row_group.columns );
1248+ }
1249+
1250+ bool MinMaxIsExact (const BaseStatistics &, const StorageIndex &storage_index) override {
1251+ const idx_t primary_index = storage_index.GetPrimaryIndex ();
1252+ D_ASSERT (metadata.row_groups .size () > row_group_idx);
1253+ D_ASSERT (root_schema->children .size () > primary_index);
1254+
1255+ const auto &row_group = metadata.row_groups [row_group_idx];
1256+ const auto &column_chunk = row_group.columns [primary_index];
1257+
1258+ if (column_chunk.__isset .meta_data && column_chunk.meta_data .__isset .statistics &&
1259+ column_chunk.meta_data .statistics .__isset .is_min_value_exact &&
1260+ column_chunk.meta_data .statistics .__isset .is_max_value_exact ) {
1261+ const auto &stats = column_chunk.meta_data .statistics ;
1262+ return stats.is_min_value_exact && stats.is_max_value_exact ;
1263+ }
1264+ return false ;
1265+ }
1266+ };
1267+
1268+ void ParquetReader::GetPartitionStats (const duckdb_parquet::FileMetaData &metadata, vector<PartitionStatistics> &result,
1269+ optional_ptr<ParquetColumnSchema> root_schema,
1270+ optional_ptr<ParquetOptions> parquet_options) {
12411271 idx_t offset = 0 ;
1242- for (auto &row_group : metadata.row_groups ) {
1272+ for (idx_t i = 0 ; i < metadata.row_groups .size (); i++) {
1273+ auto &row_group = metadata.row_groups [i];
12431274 PartitionStatistics partition_stats;
12441275 partition_stats.row_start = offset;
12451276 partition_stats.count = row_group.num_rows ;
12461277 partition_stats.count_type = CountType::COUNT_EXACT;
1278+ if (root_schema && parquet_options) {
1279+ partition_stats.partition_row_group =
1280+ make_shared_ptr<ParquetPartitionRowGroup>(metadata, root_schema, parquet_options, i);
1281+ }
12471282 offset += row_group.num_rows ;
12481283 result.push_back (partition_stats);
12491284 }
0 commit comments