Skip to content

Commit 63f7e38

Browse files
committed
feat: metrics for parquet writer
1 parent fc80e4b commit 63f7e38

14 files changed

Lines changed: 1822 additions & 10 deletions

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ if(ICEBERG_BUILD_BUNDLE)
233233
avro/avro_schema_util.cc
234234
avro/avro_stream_internal.cc
235235
parquet/parquet_data_util.cc
236+
parquet/parquet_metrics.cc
236237
parquet/parquet_reader.cc
237238
parquet/parquet_register.cc
238239
parquet/parquet_schema_util.cc

src/iceberg/file_writer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "iceberg/arrow_c_data.h"
3131
#include "iceberg/file_format.h"
3232
#include "iceberg/metrics.h"
33+
#include "iceberg/metrics_config.h"
3334
#include "iceberg/result.h"
3435
#include "iceberg/type_fwd.h"
3536
#include "iceberg/util/config.h"
@@ -77,6 +78,8 @@ struct ICEBERG_EXPORT WriterOptions {
7778
std::shared_ptr<class FileIO> io;
7879
/// \brief Metadata to write to the file.
7980
std::unordered_map<std::string, std::string> metadata;
81+
/// \brief Metrics configuration.
82+
std::shared_ptr<MetricsConfig> metrics_config = MetricsConfig::Default();
8083
/// \brief Format-specific or implementation-specific properties.
8184
WriterProperties properties;
8285
};

src/iceberg/metrics.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,35 @@
3030

3131
namespace iceberg {
3232

33+
/// \brief Field-level metrics for a single column.
34+
///
35+
/// This structure captures value counts, null counts, NaN counts, and optional
36+
/// lower/upper bounds for a specific field identified by its field_id.
37+
struct ICEBERG_EXPORT FieldMetrics {
38+
/// \brief The field ID this metrics belongs to.
39+
int32_t field_id;
40+
41+
/// \brief The total number of values (including nulls) for this field.
42+
/// A negative value indicates the count is unknown.
43+
int64_t value_count = -1;
44+
45+
/// \brief The number of null values for this field.
46+
/// A negative value indicates the count is unknown.
47+
int64_t null_value_count = -1;
48+
49+
/// \brief The number of NaN values for this field.
50+
/// A negative value indicates the count is unknown.
51+
int64_t nan_value_count = -1;
52+
53+
/// \brief The lower bound value as a Literal.
54+
/// Empty if no lower bound is available.
55+
std::optional<Literal> lower_bound = std::nullopt;
56+
57+
/// \brief The upper bound value as a Literal.
58+
/// Empty if no upper bound is available.
59+
std::optional<Literal> upper_bound = std::nullopt;
60+
};
61+
3362
/// \brief Iceberg file format metrics
3463
struct ICEBERG_EXPORT Metrics {
3564
std::optional<int64_t> row_count;

src/iceberg/metrics_config.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "iceberg/metrics_config.h"
2121

22+
#include <limits>
2223
#include <string>
2324
#include <unordered_map>
2425

@@ -100,6 +101,19 @@ Result<MetricsMode> MetricsMode::FromString(std::string_view mode) {
100101
return InvalidArgument("Invalid metrics mode: {}", mode);
101102
}
102103

104+
int32_t MetricsMode::TruncateLength() const {
105+
switch (kind) {
106+
case Kind::kNone:
107+
case Kind::kCounts:
108+
return 0;
109+
case Kind::kTruncate:
110+
return std::get<int32_t>(length);
111+
case Kind::kFull:
112+
return std::numeric_limits<int32_t>::max();
113+
}
114+
return 0;
115+
}
116+
103117
MetricsConfig::MetricsConfig(ColumnModeMap column_modes, MetricsMode default_mode)
104118
: column_modes_(std::move(column_modes)), default_mode_(default_mode) {}
105119

@@ -116,6 +130,14 @@ Result<std::shared_ptr<MetricsConfig>> MetricsConfig::Make(const Table& table) {
116130
*sort_order.value_or(SortOrder::Unsorted()));
117131
}
118132

133+
/// \brief Build a MetricsConfig from a raw property map.
///
/// The map is converted with `TableProperties::FromMap` and handed to
/// `MakeInternal` together with an empty schema and the unsorted sort order.
///
/// \param properties Property key-value pairs; consumed (moved from) by this call.
/// \return Whatever `MakeInternal` returns for those inputs.
Result<std::shared_ptr<MetricsConfig>> MetricsConfig::Make(
    std::unordered_map<std::string, std::string> properties) {
  const TableProperties table_props = TableProperties::FromMap(std::move(properties));
  // No table is available on this path, so fall back to the unsorted order.
  const auto& unsorted_order = SortOrder::Unsorted();
  return MakeInternal(table_props, Schema({}), *unsorted_order);
}
140+
119141
Result<std::shared_ptr<MetricsConfig>> MetricsConfig::MakeInternal(
120142
const TableProperties& props, const Schema& schema, const SortOrder& order) {
121143
ColumnModeMap column_modes;

src/iceberg/metrics_config.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ struct ICEBERG_EXPORT MetricsMode {
5252

5353
Kind kind;
5454
std::variant<std::monostate, int32_t> length;
55+
56+
/// \brief Get the truncate length from this MetricsMode.
57+
/// \return 0 for None/Counts modes, the truncate length for Truncate mode,
58+
/// or std::numeric_limits<int32_t>::max() for Full mode.
59+
int32_t TruncateLength() const;
5560
};
5661

5762
/// \brief Configuration for collecting column metrics for an Iceberg table.
@@ -63,6 +68,12 @@ class ICEBERG_EXPORT MetricsConfig {
6368
/// \brief Creates a metrics config from a table.
6469
static Result<std::shared_ptr<MetricsConfig>> Make(const Table& table);
6570

71+
/// \brief Creates a metrics config from a property map, using an empty schema and an unsorted sort order (intended for testing)
72+
/// \param properties Map of property key-value pairs
73+
/// \return A shared pointer to the created MetricsConfig
74+
static Result<std::shared_ptr<MetricsConfig>> Make(
75+
std::unordered_map<std::string, std::string> properties);
76+
6677
/// \brief Get `limit` num of primitive field ids from schema
6778
static Result<std::unordered_set<int32_t>> LimitFieldIds(const Schema& schema,
6879
int32_t limit);

0 commit comments

Comments
 (0)