Skip to content

Commit 80f7408

Browse files
authored
refactor: separate lazy-initialized fields into dedicated cache classes (#477)
- Split Schema's lazy-initialized fields into a SchemaCache class. - Rename CachedSnapshot to SnapshotCache. This refactoring improves consistency with TableMetadataCache.
1 parent 3743008 commit 80f7408

File tree

4 files changed

+138
-87
lines changed

4 files changed

+138
-87
lines changed

src/iceberg/schema.cc

Lines changed: 81 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@
3636
namespace iceberg {
3737

3838
Schema::Schema(std::vector<SchemaField> fields, int32_t schema_id)
39-
: StructType(std::move(fields)), schema_id_(schema_id) {}
39+
: StructType(std::move(fields)),
40+
schema_id_(schema_id),
41+
cache_(std::make_unique<SchemaCache>(this)) {}
4042

4143
Result<std::unique_ptr<Schema>> Schema::Make(std::vector<SchemaField> fields,
4244
int32_t schema_id,
@@ -156,54 +158,24 @@ bool Schema::Equals(const Schema& other) const {
156158
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
157159
std::string_view name, bool case_sensitive) const {
158160
if (case_sensitive) {
159-
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
161+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap());
160162
auto it = name_id_map.get().name_to_id.find(name);
161163
if (it == name_id_map.get().name_to_id.end()) {
162164
return std::nullopt;
163165
};
164166
return FindFieldById(it->second);
165167
}
166-
ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id, lowercase_name_to_id_.Get(*this));
168+
ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id, cache_->GetLowercaseNameToIdMap());
167169
auto it = lowercase_name_to_id.get().find(StringUtils::ToLower(name));
168170
if (it == lowercase_name_to_id.get().end()) {
169171
return std::nullopt;
170172
}
171173
return FindFieldById(it->second);
172174
}
173175

174-
Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
175-
Schema::InitIdToFieldMap(const Schema& self) {
176-
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>> id_to_field;
177-
IdToFieldVisitor visitor(id_to_field);
178-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor));
179-
return id_to_field;
180-
}
181-
182-
Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
183-
NameIdMap name_id_map;
184-
NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
185-
/*case_sensitive=*/true);
186-
ICEBERG_RETURN_UNEXPECTED(
187-
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
188-
visitor.Finish();
189-
return name_id_map;
190-
}
191-
192-
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
193-
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
194-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
195-
lowercase_name_to_id;
196-
NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
197-
/*case_sensitive=*/false);
198-
ICEBERG_RETURN_UNEXPECTED(
199-
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
200-
visitor.Finish();
201-
return lowercase_name_to_id;
202-
}
203-
204176
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
205177
int32_t field_id) const {
206-
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this));
178+
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap());
207179
auto it = id_to_field.get().find(field_id);
208180
if (it == id_to_field.get().end()) {
209181
return std::nullopt;
@@ -213,38 +185,17 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
213185

214186
Result<std::optional<std::string_view>> Schema::FindColumnNameById(
215187
int32_t field_id) const {
216-
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
188+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap());
217189
auto it = name_id_map.get().id_to_name.find(field_id);
218190
if (it == name_id_map.get().id_to_name.end()) {
219191
return std::nullopt;
220192
}
221193
return it->second;
222194
}
223195

224-
Result<std::unordered_map<int32_t, std::vector<size_t>>> Schema::InitIdToPositionPath(
225-
const Schema& self) {
226-
PositionPathVisitor visitor;
227-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor));
228-
return visitor.Finish();
229-
}
230-
231-
Result<int32_t> Schema::InitHighestFieldId(const Schema& self) {
232-
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, self.id_to_field_.Get(self));
233-
234-
if (id_to_field.get().empty()) {
235-
return kInitialColumnId;
236-
}
237-
238-
auto max_it = std::ranges::max_element(
239-
id_to_field.get(),
240-
[](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
241-
242-
return max_it->first;
243-
}
244-
245196
Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById(
246197
int32_t field_id) const {
247-
ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path, id_to_position_path_.Get(*this));
198+
ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path, cache_->GetIdToPositionPathMap());
248199
if (auto it = id_to_position_path.get().find(field_id);
249200
it != id_to_position_path.get().cend()) {
250201
ICEBERG_ASSIGN_OR_RAISE(auto field, FindFieldById(field_id));
@@ -322,15 +273,15 @@ Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
322273
return names;
323274
}
324275

325-
Result<int32_t> Schema::HighestFieldId() const { return highest_field_id_.Get(*this); }
276+
Result<int32_t> Schema::HighestFieldId() const { return cache_->GetHighestFieldId(); }
326277

327278
bool Schema::SameSchema(const Schema& other) const {
328279
return fields_ == other.fields_ && identifier_field_ids_ == other.identifier_field_ids_;
329280
}
330281

331282
Status Schema::Validate(int32_t format_version) const {
332283
// Get all fields including nested ones
333-
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this));
284+
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap());
334285

335286
// Check each field's type and defaults
336287
for (const auto& [field_id, field_ref] : id_to_field.get()) {
@@ -351,4 +302,75 @@ Status Schema::Validate(int32_t format_version) const {
351302
return {};
352303
}
353304

305+
Result<SchemaCache::IdToFieldMapRef> SchemaCache::GetIdToFieldMap() const {
306+
return id_to_field_.Get(schema_);
307+
}
308+
309+
Result<SchemaCache::NameIdMapRef> SchemaCache::GetNameIdMap() const {
310+
return name_id_map_.Get(schema_);
311+
}
312+
313+
Result<SchemaCache::LowercaseNameToIdMapRef> SchemaCache::GetLowercaseNameToIdMap()
314+
const {
315+
return lowercase_name_to_id_.Get(schema_);
316+
}
317+
318+
Result<SchemaCache::IdToPositionPathMapRef> SchemaCache::GetIdToPositionPathMap() const {
319+
return id_to_position_path_.Get(schema_);
320+
}
321+
322+
Result<int32_t> SchemaCache::GetHighestFieldId() const {
323+
return highest_field_id_.Get(schema_);
324+
}
325+
326+
Result<SchemaCache::IdToFieldMap> SchemaCache::InitIdToFieldMap(const Schema* schema) {
327+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>> id_to_field;
328+
IdToFieldVisitor visitor(id_to_field);
329+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor));
330+
return id_to_field;
331+
}
332+
333+
Result<SchemaCache::NameIdMap> SchemaCache::InitNameIdMap(const Schema* schema) {
334+
NameIdMap name_id_map;
335+
NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
336+
/*case_sensitive=*/true);
337+
ICEBERG_RETURN_UNEXPECTED(
338+
VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/""));
339+
visitor.Finish();
340+
return name_id_map;
341+
}
342+
343+
Result<SchemaCache::LowercaseNameToIdMap> SchemaCache::InitLowerCaseNameToIdMap(
344+
const Schema* schema) {
345+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
346+
lowercase_name_to_id;
347+
NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
348+
/*case_sensitive=*/false);
349+
ICEBERG_RETURN_UNEXPECTED(
350+
VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/""));
351+
visitor.Finish();
352+
return lowercase_name_to_id;
353+
}
354+
355+
Result<SchemaCache::IdToPositionPathMap> SchemaCache::InitIdToPositionPath(
356+
const Schema* schema) {
357+
PositionPathVisitor visitor;
358+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor));
359+
return visitor.Finish();
360+
}
361+
362+
Result<int32_t> SchemaCache::InitHighestFieldId(const Schema* schema) {
363+
ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, InitIdToFieldMap(schema));
364+
365+
if (id_to_field.empty()) {
366+
return Schema::kInitialColumnId;
367+
}
368+
369+
auto max_it = std::ranges::max_element(
370+
id_to_field,
371+
[](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
372+
373+
return max_it->first;
374+
}
375+
354376
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <cstdint>
2727
#include <optional>
2828
#include <string>
29+
#include <unordered_map>
2930
#include <unordered_set>
3031
#include <vector>
3132

@@ -38,6 +39,8 @@
3839

3940
namespace iceberg {
4041

42+
class SchemaCache;
43+
4144
/// \brief A schema for a Table.
4245
///
4346
/// A schema is a list of typed columns, along with a unique integer ID. A
@@ -187,6 +190,22 @@ class ICEBERG_EXPORT Schema : public StructType {
187190
/// \brief Compare two schemas for equality.
188191
bool Equals(const Schema& other) const;
189192

193+
const int32_t schema_id_;
194+
// Field IDs that uniquely identify rows in the table.
195+
std::vector<int32_t> identifier_field_ids_;
196+
// Cache for schema mappings to facilitate fast lookups.
197+
std::unique_ptr<SchemaCache> cache_;
198+
};
199+
200+
// Cache for schema mappings to facilitate fast lookups.
201+
class ICEBERG_EXPORT SchemaCache {
202+
public:
203+
explicit SchemaCache(const Schema* schema) : schema_(schema) {}
204+
205+
using IdToFieldMap =
206+
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
207+
using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;
208+
190209
struct NameIdMap {
191210
/// \brief Mapping from canonical field name to ID
192211
///
@@ -201,28 +220,38 @@ class ICEBERG_EXPORT Schema : public StructType {
201220
/// 'list.element.field' instead of 'list.field'.
202221
std::unordered_map<int32_t, std::string> id_to_name;
203222
};
223+
using NameIdMapRef = std::reference_wrapper<const NameIdMap>;
204224

205-
static Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
206-
InitIdToFieldMap(const Schema&);
207-
static Result<NameIdMap> InitNameIdMap(const Schema&);
208-
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
209-
InitLowerCaseNameToIdMap(const Schema&);
210-
static Result<std::unordered_map<int32_t, std::vector<size_t>>> InitIdToPositionPath(
211-
const Schema&);
212-
static Result<int32_t> InitHighestFieldId(const Schema&);
225+
using LowercaseNameToIdMap =
226+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
227+
using LowercaseNameToIdMapRef = std::reference_wrapper<const LowercaseNameToIdMap>;
213228

214-
const int32_t schema_id_;
215-
/// Field IDs that uniquely identify rows in the table.
216-
std::vector<int32_t> identifier_field_ids_;
217-
/// Mapping from field id to field.
229+
using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
230+
using IdToPositionPathMapRef = std::reference_wrapper<const IdToPositionPathMap>;
231+
232+
Result<IdToFieldMapRef> GetIdToFieldMap() const;
233+
Result<NameIdMapRef> GetNameIdMap() const;
234+
Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
235+
Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
236+
Result<int32_t> GetHighestFieldId() const;
237+
238+
private:
239+
static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
240+
static Result<NameIdMap> InitNameIdMap(const Schema* schema);
241+
static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema* schema);
242+
static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema* schema);
243+
static Result<int32_t> InitHighestFieldId(const Schema* schema);
244+
245+
const Schema* schema_;
246+
// Mapping from field id to field.
218247
Lazy<InitIdToFieldMap> id_to_field_;
219-
/// Mapping from field name to field id.
248+
// Mapping from field name to field id.
220249
Lazy<InitNameIdMap> name_id_map_;
221-
/// Mapping from lowercased field name to field id
250+
// Mapping from lowercased field name to field id.
222251
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
223-
/// Mapping from field id to (nested) position path to access the field.
252+
// Mapping from field id to (nested) position path to access the field.
224253
Lazy<InitIdToPositionPath> id_to_position_path_;
225-
/// Highest field ID in the schema.
254+
// Highest field ID in the schema.
226255
Lazy<InitHighestFieldId> highest_field_id_;
227256
};
228257

src/iceberg/snapshot.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,15 @@ bool Snapshot::Equals(const Snapshot& other) const {
8585
schema_id == other.schema_id;
8686
}
8787

88-
Result<CachedSnapshot::ManifestsCache> CachedSnapshot::InitManifestsCache(
89-
const Snapshot& snapshot, std::shared_ptr<FileIO> file_io) {
88+
Result<SnapshotCache::ManifestsCache> SnapshotCache::InitManifestsCache(
89+
const Snapshot* snapshot, std::shared_ptr<FileIO> file_io) {
9090
if (file_io == nullptr) {
9191
return InvalidArgument("Cannot cache manifests: FileIO is null");
9292
}
9393

9494
// Read manifest list
9595
ICEBERG_ASSIGN_OR_RAISE(auto reader,
96-
ManifestListReader::Make(snapshot.manifest_list, file_io));
96+
ManifestListReader::Make(snapshot->manifest_list, file_io));
9797
ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, reader->Files());
9898

9999
std::vector<ManifestFile> manifests;
@@ -118,21 +118,21 @@ Result<CachedSnapshot::ManifestsCache> CachedSnapshot::InitManifestsCache(
118118
return std::make_pair(std::move(manifests), data_manifests_count);
119119
}
120120

121-
Result<std::span<ManifestFile>> CachedSnapshot::Manifests(
121+
Result<std::span<ManifestFile>> SnapshotCache::Manifests(
122122
std::shared_ptr<FileIO> file_io) const {
123123
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io));
124124
auto& cache = cache_ref.get();
125125
return std::span<ManifestFile>(cache.first.data(), cache.first.size());
126126
}
127127

128-
Result<std::span<ManifestFile>> CachedSnapshot::DataManifests(
128+
Result<std::span<ManifestFile>> SnapshotCache::DataManifests(
129129
std::shared_ptr<FileIO> file_io) const {
130130
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io));
131131
auto& cache = cache_ref.get();
132132
return std::span<ManifestFile>(cache.first.data(), cache.second);
133133
}
134134

135-
Result<std::span<ManifestFile>> CachedSnapshot::DeleteManifests(
135+
Result<std::span<ManifestFile>> SnapshotCache::DeleteManifests(
136136
std::shared_ptr<FileIO> file_io) const {
137137
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, file_io));
138138
auto& cache = cache_ref.get();

src/iceberg/snapshot.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -265,13 +265,13 @@ struct ICEBERG_EXPORT Snapshot {
265265

266266
/// \brief A snapshot with cached manifest loading capabilities.
267267
///
268-
/// This class wraps a Snapshot reference and provides lazy-loading of manifests.
269-
class ICEBERG_EXPORT CachedSnapshot {
268+
/// This class wraps a Snapshot pointer and provides lazy-loading of manifests.
269+
class ICEBERG_EXPORT SnapshotCache {
270270
public:
271-
explicit CachedSnapshot(const Snapshot& snapshot) : snapshot_(snapshot) {}
271+
explicit SnapshotCache(const Snapshot* snapshot) : snapshot_(snapshot) {}
272272

273273
/// \brief Get the underlying Snapshot reference
274-
const Snapshot& snapshot() const { return snapshot_; }
274+
const Snapshot& snapshot() const { return *snapshot_; }
275275

276276
/// \brief Returns all ManifestFile instances for either data or delete manifests
277277
/// in this snapshot.
@@ -303,11 +303,11 @@ class ICEBERG_EXPORT CachedSnapshot {
303303
/// \param snapshot The snapshot to initialize the manifests cache for
304304
/// \param file_io The FileIO instance to use for reading the manifest list
305305
/// \return A result containing the manifests cache
306-
static Result<ManifestsCache> InitManifestsCache(const Snapshot& snapshot,
306+
static Result<ManifestsCache> InitManifestsCache(const Snapshot* snapshot,
307307
std::shared_ptr<FileIO> file_io);
308308

309309
/// The underlying snapshot data
310-
const Snapshot& snapshot_;
310+
const Snapshot* snapshot_;
311311

312312
/// Lazy-loaded manifests cache
313313
Lazy<InitManifestsCache> manifests_cache_;

0 commit comments

Comments
 (0)