Skip to content

Commit abc7aa8

Browse files
author
xuan.zhao
committed
feat(puffin): add format constants, utilities, and JSON serialization
1 parent 133742d commit abc7aa8

File tree

13 files changed

+630
-27
lines changed

13 files changed

+630
-27
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ set(ICEBERG_SOURCES
6464
partition_spec.cc
6565
partition_summary.cc
6666
puffin/file_metadata.cc
67+
puffin/puffin_format.cc
68+
puffin/puffin_json_internal.cc
6769
row/arrow_array_wrapper.cc
6870
row/manifest_wrapper.cc
6971
row/partition_values.cc

src/iceberg/deletes/roaring_position_bitmap.cc

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,28 +49,6 @@ int64_t ToPosition(int32_t key, uint32_t pos32) {
4949
return (int64_t{key} << 32) | int64_t{pos32};
5050
}
5151

52-
void WriteLE64(char* buf, int64_t value) {
53-
auto le = ToLittleEndian(static_cast<uint64_t>(value));
54-
std::memcpy(buf, &le, sizeof(le));
55-
}
56-
57-
void WriteLE32(char* buf, int32_t value) {
58-
auto le = ToLittleEndian(static_cast<uint32_t>(value));
59-
std::memcpy(buf, &le, sizeof(le));
60-
}
61-
62-
int64_t ReadLE64(const char* buf) {
63-
uint64_t v;
64-
std::memcpy(&v, buf, sizeof(v));
65-
return static_cast<int64_t>(FromLittleEndian(v));
66-
}
67-
68-
int32_t ReadLE32(const char* buf) {
69-
uint32_t v;
70-
std::memcpy(&v, buf, sizeof(v));
71-
return static_cast<int32_t>(FromLittleEndian(v));
72-
}
73-
7452
Status ValidatePosition(int64_t pos) {
7553
if (pos < 0 || pos > RoaringPositionBitmap::kMaxPosition) {
7654
return InvalidArgument("Bitmap supports positions that are >= 0 and <= {}: {}",
@@ -189,12 +167,12 @@ Result<std::string> RoaringPositionBitmap::Serialize() const {
189167
char* buf = result.data();
190168

191169
// Write bitmap count (array length including empties)
192-
WriteLE64(buf, static_cast<int64_t>(impl_->bitmaps.size()));
170+
WriteLittleEndian(static_cast<int64_t>(impl_->bitmaps.size()), buf);
193171
buf += kBitmapCountSizeBytes;
194172

195173
// Write each bitmap with its key
196174
for (int32_t key = 0; std::cmp_less(key, impl_->bitmaps.size()); ++key) {
197-
WriteLE32(buf, key);
175+
WriteLittleEndian(key, buf);
198176
buf += kBitmapKeySizeBytes;
199177
size_t written = impl_->bitmaps[key].write(buf, /*portable=*/true);
200178
buf += written;
@@ -210,7 +188,7 @@ Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_vie
210188
ICEBERG_PRECHECK(remaining >= kBitmapCountSizeBytes,
211189
"Buffer too small for bitmap count: {} bytes", remaining);
212190

213-
int64_t bitmap_count = ReadLE64(buf);
191+
auto bitmap_count = ReadLittleEndian<int64_t>(buf);
214192
buf += kBitmapCountSizeBytes;
215193
remaining -= kBitmapCountSizeBytes;
216194

@@ -226,7 +204,7 @@ Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_vie
226204
ICEBERG_PRECHECK(remaining >= kBitmapKeySizeBytes,
227205
"Buffer too small for bitmap key: {} bytes", remaining);
228206

229-
int32_t key = ReadLE32(buf);
207+
auto key = ReadLittleEndian<int32_t>(buf);
230208
buf += kBitmapKeySizeBytes;
231209
remaining -= kBitmapKeySizeBytes;
232210

src/iceberg/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ iceberg_sources = files(
8282
'partition_spec.cc',
8383
'partition_summary.cc',
8484
'puffin/file_metadata.cc',
85+
'puffin/puffin_format.cc',
86+
'puffin/puffin_json_internal.cc',
8587
'row/arrow_array_wrapper.cc',
8688
'row/manifest_wrapper.cc',
8789
'row/partition_values.cc',

src/iceberg/puffin/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
install_headers(['file_metadata.h'], subdir: 'iceberg/puffin')
18+
install_headers(['file_metadata.h', 'puffin_format.h'], subdir: 'iceberg/puffin')
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/puffin_format.h"
21+
22+
#include <utility>
23+
24+
#include "iceberg/util/endian.h"
25+
#include "iceberg/util/macros.h"
26+
27+
namespace iceberg::puffin {
28+
29+
namespace {
30+
31+
/// \brief Locate a footer flag inside the 4-byte flags field.
///
/// \return A {byte index, bit index} pair for the given flag.
constexpr std::pair<int, int> GetFlagPosition(PuffinFlag flag) {
  switch (flag) {
    case PuffinFlag::kFooterPayloadCompressed: {
      // The only flag currently defined maps to bit 0 of byte 0.
      constexpr int kByteIndex = 0;
      constexpr int kBitIndex = 0;
      return std::pair<int, int>{kByteIndex, kBitIndex};
    }
  }
  std::unreachable();
}
38+
39+
} // namespace
40+
41+
/// \brief Test whether `flag` is set in the 4-byte footer flags field.
///
/// \param flags The four flag bytes to inspect.
/// \param flag The flag to look up.
/// \return true when the bit assigned to `flag` is 1.
bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag flag) {
  const std::pair<int, int> position = GetFlagPosition(flag);
  const int mask = 1 << position.second;
  return (flags[position.first] & mask) != 0;
}
45+
46+
/// \brief Turn on the bit assigned to `flag` in the 4-byte footer flags field.
///
/// \param flags The four flag bytes to modify in place; other bits are kept.
/// \param flag The flag to set.
void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag) {
  const std::pair<int, int> position = GetFlagPosition(flag);
  uint8_t& target = flags[position.first];
  target = static_cast<uint8_t>(target | (1 << position.second));
}
50+
51+
/// \brief Store `value` into the 4-byte `output` span via WriteLittleEndian.
void WriteInt32LittleEndian(int32_t value, std::span<uint8_t, 4> output) {
  uint8_t* destination = output.data();
  WriteLittleEndian(value, destination);
}
54+
55+
/// \brief Load a 32-bit integer from the 4-byte `input` span via ReadLittleEndian.
int32_t ReadInt32LittleEndian(std::span<const uint8_t, 4> input) {
  const uint8_t* source = input.data();
  return ReadLittleEndian<int32_t>(source);
}
58+
59+
/// \brief Load a 32-bit little-endian integer starting at `offset` within `data`.
///
/// \param data The buffer to read from.
/// \param offset Byte position of the first of the four bytes to read.
/// \return The decoded value.
///
/// NOTE(review): the range checks below use ICEBERG_DCHECK, which is presumably
/// active only in debug builds -- callers should validate `offset` themselves.
int32_t ReadInt32LittleEndian(std::span<const uint8_t> data, int32_t offset) {
  ICEBERG_DCHECK(offset >= 0, "Offset must be non-negative");
  ICEBERG_DCHECK(static_cast<size_t>(offset) + 4 <= data.size(), "Offset out of bounds");
  const uint8_t* start = data.data() + offset;
  const std::span<const uint8_t, 4> window(start, 4);
  return ReadInt32LittleEndian(window);
}
64+
65+
/// \brief Compress `input` with the given Puffin codec.
///
/// \param codec The compression codec to apply.
/// \param input The bytes to compress.
/// \return For kNone, a copy of `input`. For kLz4 and kZstd, a NotSupported
/// error because those codecs are not implemented yet. Any other codec value
/// also yields a NotSupported error.
Result<std::vector<uint8_t>> Compress(PuffinCompressionCodec codec,
                                      std::span<const uint8_t> input) {
  switch (codec) {
    case PuffinCompressionCodec::kNone:
      return std::vector<uint8_t>(input.begin(), input.end());
    case PuffinCompressionCodec::kLz4:
      return NotSupported("LZ4 compression is not yet supported");
    case PuffinCompressionCodec::kZstd:
      return NotSupported("Zstd compression is not yet supported");
  }
  // An enum object can hold a value that matches no enumerator (for example
  // after a cast from untrusted data). Report it as an error instead of
  // invoking undefined behaviour through std::unreachable().
  return NotSupported("Unknown Puffin compression codec: {}", static_cast<int>(codec));
}
77+
78+
/// \brief Decompress `input` with the given Puffin codec.
///
/// \param codec The compression codec the data was written with.
/// \param input The bytes to decompress.
/// \return For kNone, a copy of `input`. For kLz4 and kZstd, a NotSupported
/// error because those codecs are not implemented yet. Any other codec value
/// also yields a NotSupported error.
Result<std::vector<uint8_t>> Decompress(PuffinCompressionCodec codec,
                                        std::span<const uint8_t> input) {
  switch (codec) {
    case PuffinCompressionCodec::kNone:
      return std::vector<uint8_t>(input.begin(), input.end());
    case PuffinCompressionCodec::kLz4:
      return NotSupported("LZ4 decompression is not yet supported");
    case PuffinCompressionCodec::kZstd:
      return NotSupported("Zstd decompression is not yet supported");
  }
  // An enum object can hold a value that matches no enumerator (for example
  // after a cast from untrusted data). Report it as an error instead of
  // invoking undefined behaviour through std::unreachable().
  return NotSupported("Unknown Puffin compression codec: {}", static_cast<int>(codec));
}
90+
91+
} // namespace iceberg::puffin

src/iceberg/puffin/puffin_format.h

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/puffin_format.h
23+
/// Puffin file format constants and utilities.
24+
25+
#include <array>
26+
#include <cstdint>
27+
#include <span>
28+
#include <vector>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/puffin/file_metadata.h"
32+
#include "iceberg/result.h"
33+
34+
namespace iceberg::puffin {
35+
36+
/// \brief Puffin file format constants.
37+
struct ICEBERG_EXPORT PuffinFormat {
38+
/// Magic bytes: "PFA1" (Puffin Fratercula arctica, version 1)
39+
static constexpr std::array<uint8_t, 4> kMagic = {0x50, 0x46, 0x41, 0x31};
40+
41+
static constexpr int32_t kMagicLength = 4;
42+
static constexpr int32_t kFooterStartMagicOffset = 0;
43+
static constexpr int32_t kFooterStartMagicLength = kMagicLength;
44+
static constexpr int32_t kFooterStructPayloadSizeOffset = 0;
45+
static constexpr int32_t kFooterStructFlagsOffset = kFooterStructPayloadSizeOffset + 4;
46+
static constexpr int32_t kFooterStructFlagsLength = 4;
47+
static constexpr int32_t kFooterStructMagicOffset =
48+
kFooterStructFlagsOffset + kFooterStructFlagsLength;
49+
50+
/// Total length of the footer struct: payload_size(4) + flags(4) + magic(4)
51+
static constexpr int32_t kFooterStructLength = kFooterStructMagicOffset + kMagicLength;
52+
53+
/// Default compression codec for footer payload.
54+
static constexpr PuffinCompressionCodec kFooterCompressionCodec =
55+
PuffinCompressionCodec::kLz4;
56+
};
57+
58+
/// \brief Flags that can be recorded in the flags field of a Puffin footer.
enum class PuffinFlag : uint8_t {
  /// Indicates that the footer payload is stored compressed.
  kFooterPayloadCompressed = 0,
};
63+
64+
/// \brief Check if a flag is set in the flags bytes.
65+
ICEBERG_EXPORT bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag flag);
66+
67+
/// \brief Set a flag in the flags bytes.
68+
ICEBERG_EXPORT void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag);
69+
70+
/// \brief Write a 32-bit integer in little-endian format.
71+
ICEBERG_EXPORT void WriteInt32LittleEndian(int32_t value, std::span<uint8_t, 4> output);
72+
73+
/// \brief Read a 32-bit integer from a fixed-size span in little-endian format.
74+
ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t, 4> input);
75+
76+
/// \brief Read a 32-bit integer from a buffer at the given offset in little-endian
77+
/// format.
78+
ICEBERG_EXPORT int32_t ReadInt32LittleEndian(std::span<const uint8_t> data,
79+
int32_t offset);
80+
81+
/// \brief Compress data using the specified codec.
82+
ICEBERG_EXPORT Result<std::vector<uint8_t>> Compress(PuffinCompressionCodec codec,
83+
std::span<const uint8_t> input);
84+
85+
/// \brief Decompress data using the specified codec.
86+
ICEBERG_EXPORT Result<std::vector<uint8_t>> Decompress(PuffinCompressionCodec codec,
87+
std::span<const uint8_t> input);
88+
89+
} // namespace iceberg::puffin
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/puffin_json_internal.h"
21+
22+
#include <nlohmann/json.hpp>
23+
24+
#include "iceberg/util/json_util_internal.h"
25+
#include "iceberg/util/macros.h"
26+
27+
namespace iceberg::puffin {
28+
29+
namespace {

// Keys used by the file-level metadata object.
constexpr std::string_view kBlobs{"blobs"};
constexpr std::string_view kProperties{"properties"};

// Keys used by each blob metadata object.
constexpr std::string_view kType{"type"};
constexpr std::string_view kFields{"fields"};
constexpr std::string_view kSnapshotId{"snapshot-id"};
constexpr std::string_view kSequenceNumber{"sequence-number"};
constexpr std::string_view kOffset{"offset"};
constexpr std::string_view kLength{"length"};
constexpr std::string_view kCompressionCodec{"compression-codec"};

}  // namespace
40+
41+
/// \brief Serialize a single blob's metadata to a JSON object.
///
/// The type, fields, snapshot-id, sequence-number, offset and length entries
/// are always written. The compression codec and properties are passed to
/// SetOptionalStringField / SetContainerField respectively.
nlohmann::json ToJson(const BlobMetadata& blob_metadata) {
  nlohmann::json result;

  result[kType] = blob_metadata.type;
  result[kFields] = blob_metadata.input_fields;
  result[kSnapshotId] = blob_metadata.snapshot_id;
  result[kSequenceNumber] = blob_metadata.sequence_number;
  result[kOffset] = blob_metadata.offset;
  result[kLength] = blob_metadata.length;

  SetOptionalStringField(result, kCompressionCodec, blob_metadata.compression_codec);
  SetContainerField(result, kProperties, blob_metadata.properties);

  return result;
}
55+
56+
/// \brief Build a BlobMetadata from its JSON object representation.
///
/// Each field is read through the JSON helper functions; the first helper that
/// reports an error ends parsing and that error is returned to the caller.
Result<BlobMetadata> BlobMetadataFromJson(const nlohmann::json& json) {
  BlobMetadata parsed;

  ICEBERG_ASSIGN_OR_RAISE(parsed.type, GetJsonValue<std::string>(json, kType));
  ICEBERG_ASSIGN_OR_RAISE(parsed.input_fields,
                          GetJsonValue<std::vector<int32_t>>(json, kFields));
  ICEBERG_ASSIGN_OR_RAISE(parsed.snapshot_id, GetJsonValue<int64_t>(json, kSnapshotId));
  ICEBERG_ASSIGN_OR_RAISE(parsed.sequence_number,
                          GetJsonValue<int64_t>(json, kSequenceNumber));
  ICEBERG_ASSIGN_OR_RAISE(parsed.offset, GetJsonValue<int64_t>(json, kOffset));
  ICEBERG_ASSIGN_OR_RAISE(parsed.length, GetJsonValue<int64_t>(json, kLength));
  ICEBERG_ASSIGN_OR_RAISE(parsed.compression_codec,
                          GetJsonValueOrDefault<std::string>(json, kCompressionCodec));
  ICEBERG_ASSIGN_OR_RAISE(parsed.properties, FromJsonMap<std::string>(json, kProperties));

  return parsed;
}
75+
76+
/// \brief Serialize Puffin file metadata to a JSON object.
///
/// The "blobs" entry is always written as an array (possibly empty), with one
/// element per blob in order. Properties are passed to SetContainerField.
nlohmann::json ToJson(const FileMetadata& file_metadata) {
  nlohmann::json blob_array = nlohmann::json::array();
  for (const auto& entry : file_metadata.blobs) {
    blob_array.push_back(ToJson(entry));
  }

  nlohmann::json result;
  result[kBlobs] = std::move(blob_array);
  SetContainerField(result, kProperties, file_metadata.properties);

  return result;
}
89+
90+
/// \brief Build a FileMetadata from its JSON object representation.
///
/// Returns a JsonParseError when the "blobs" value is not an array. Errors from
/// reading "blobs", parsing any individual blob, or reading the properties are
/// propagated to the caller.
Result<FileMetadata> FileMetadataFromJson(const nlohmann::json& json) {
  ICEBERG_ASSIGN_OR_RAISE(auto blob_array, GetJsonValue<nlohmann::json>(json, kBlobs));
  if (!blob_array.is_array()) {
    return JsonParseError("Cannot parse blobs from non-array: {}",
                          SafeDumpJson(blob_array));
  }

  FileMetadata parsed;
  for (const auto& element : blob_array) {
    ICEBERG_ASSIGN_OR_RAISE(auto blob, BlobMetadataFromJson(element));
    parsed.blobs.push_back(std::move(blob));
  }

  ICEBERG_ASSIGN_OR_RAISE(parsed.properties, FromJsonMap<std::string>(json, kProperties));

  return parsed;
}
109+
110+
std::string ToJsonString(const FileMetadata& file_metadata, bool pretty) {
111+
auto json = ToJson(file_metadata);
112+
return pretty ? json.dump(2) : json.dump();
113+
}
114+
115+
/// \brief Parse Puffin file metadata from a JSON string.
///
/// Returns a JsonParseError when the string is empty or when nlohmann::json
/// raises a parse_error; otherwise returns the result of FileMetadataFromJson.
Result<FileMetadata> FileMetadataFromJsonString(std::string_view json_string) {
  if (json_string.empty()) {
    return JsonParseError("Cannot parse empty JSON string");
  }

  try {
    const nlohmann::json document = nlohmann::json::parse(json_string);
    return FileMetadataFromJson(document);
  } catch (const nlohmann::json::parse_error& error) {
    return JsonParseError("Failed to parse JSON: {}", error.what());
  }
}
126+
127+
} // namespace iceberg::puffin

0 commit comments

Comments
 (0)