Skip to content

Commit e7e0943

Browse files
duckdblabs-botgithub-actions[bot]
authored and committed
Update vendored DuckDB sources to b390a7c376
1 parent c037599 commit e7e0943

30 files changed

Lines changed: 246 additions & 41 deletions

src/duckdb/extension/parquet/column_writer.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
245245
//===--------------------------------------------------------------------===//
246246

247247
ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
248-
const LogicalType &type, const string &name,
248+
const LogicalType &type, const string &name, bool allow_geometry,
249249
optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat,
250250
idx_t max_define, bool can_have_nulls) {
251251
auto null_type = can_have_nulls ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
@@ -285,7 +285,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
285285
struct_column.children.reserve(child_types.size());
286286
for (auto &child_type : child_types) {
287287
struct_column.children.emplace_back(FillParquetSchema(schemas, child_type.second, child_type.first,
288-
child_field_ids, max_repeat, max_define + 1));
288+
allow_geometry, child_field_ids, max_repeat,
289+
max_define + 1));
289290
}
290291
return struct_column;
291292
}
@@ -321,8 +322,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
321322
schemas.push_back(std::move(repeated_element));
322323

323324
ParquetColumnSchema list_column(name, type, max_define, max_repeat, schema_idx, 0);
324-
list_column.children.push_back(
325-
FillParquetSchema(schemas, child_type, "element", child_field_ids, max_repeat + 1, max_define + 2));
325+
list_column.children.push_back(FillParquetSchema(schemas, child_type, "element", allow_geometry,
326+
child_field_ids, max_repeat + 1, max_define + 2));
326327
return list_column;
327328
}
328329
if (type.id() == LogicalTypeId::MAP) {
@@ -369,8 +370,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
369370
for (idx_t i = 0; i < 2; i++) {
370371
// key needs to be marked as REQUIRED
371372
bool is_key = i == 0;
372-
auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], child_field_ids, max_repeat + 1,
373-
max_define + 2, !is_key);
373+
auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], allow_geometry, child_field_ids,
374+
max_repeat + 1, max_define + 2, !is_key);
374375

375376
map_column.children.push_back(std::move(child_schema));
376377
}
@@ -388,7 +389,7 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
388389
schema_element.__isset.field_id = true;
389390
schema_element.field_id = field_id->field_id;
390391
}
391-
ParquetWriter::SetSchemaProperties(type, schema_element);
392+
ParquetWriter::SetSchemaProperties(type, schema_element, allow_geometry);
392393
schemas.push_back(std::move(schema_element));
393394
return ParquetColumnSchema(name, type, max_define, max_repeat, schema_idx, 0);
394395
}

src/duckdb/extension/parquet/geo_parquet.cpp

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,17 +208,19 @@ unique_ptr<GeoParquetFileMetadata> GeoParquetFileMetadata::TryRead(const duckdb_
208208
throw InvalidInputException("Geoparquet metadata is not an object");
209209
}
210210

211-
auto result = make_uniq<GeoParquetFileMetadata>();
211+
// We don't actually care about the version for now, as we only support V1+native
212+
auto result = make_uniq<GeoParquetFileMetadata>(GeoParquetVersion::BOTH);
212213

213214
// Check and parse the version
214215
const auto version_val = yyjson_obj_get(root, "version");
215216
if (!yyjson_is_str(version_val)) {
216217
throw InvalidInputException("Geoparquet metadata does not have a version");
217218
}
218-
result->version = yyjson_get_str(version_val);
219-
if (StringUtil::StartsWith(result->version, "2")) {
220-
// Guard against a breaking future 2.0 version
221-
throw InvalidInputException("Geoparquet version %s is not supported", result->version);
219+
220+
auto version = yyjson_get_str(version_val);
221+
if (StringUtil::StartsWith(version, "3")) {
222+
// Guard against a breaking future 3.0 version
223+
throw InvalidInputException("Geoparquet version %s is not supported", version);
222224
}
223225

224226
// Check and parse the geometry columns
@@ -344,7 +346,20 @@ void GeoParquetFileMetadata::Write(duckdb_parquet::FileMetaData &file_meta_data)
344346
yyjson_mut_doc_set_root(doc, root);
345347

346348
// Add the version
347-
yyjson_mut_obj_add_strncpy(doc, root, "version", version.c_str(), version.size());
349+
switch (version) {
350+
case GeoParquetVersion::V1:
351+
case GeoParquetVersion::BOTH:
352+
yyjson_mut_obj_add_strcpy(doc, root, "version", "1.0.0");
353+
break;
354+
case GeoParquetVersion::V2:
355+
yyjson_mut_obj_add_strcpy(doc, root, "version", "2.0.0");
356+
break;
357+
case GeoParquetVersion::NONE:
358+
default:
359+
// Should never happen, we should not be writing anything
360+
yyjson_mut_doc_free(doc);
361+
throw InternalException("GeoParquetVersion::NONE should not write metadata");
362+
}
348363

349364
// Add the primary column
350365
yyjson_mut_obj_add_strncpy(doc, root, "primary_column", primary_geometry_column.c_str(),

src/duckdb/extension/parquet/include/column_writer.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class ColumnWriter {
9494
}
9595

9696
static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
97-
const LogicalType &type, const string &name,
97+
const LogicalType &type, const string &name, bool allow_geometry,
9898
optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat = 0,
9999
idx_t max_define = 1, bool can_have_nulls = true);
100100
//! Create the column writer for a specific type recursively

src/duckdb/extension/parquet/include/geo_parquet.hpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,31 @@ enum class GeoParquetColumnEncoding : uint8_t {
199199
MULTIPOLYGON,
200200
};
201201

202+
enum class GeoParquetVersion : uint8_t {
203+
// Write GeoParquet 1.0 metadata
204+
// GeoParquet 1.0 has the widest support among readers and writers
205+
V1,
206+
207+
// Write GeoParquet 2.0
208+
// The GeoParquet 2.0 option is identical to GeoParquet 1.0 except the underlying storage
209+
// of spatial columns is Parquet native geometry, where the Parquet writer will include
210+
// native statistics according to the underlying Parquet options. Compared to 'BOTH', this will
211+
// actually write the metadata as containing GeoParquet version 2.0.0
212+
// However, V2 isn't standardized yet, so this option is still a bit experimental
213+
V2,
214+
215+
// Write GeoParquet 1.0 metadata, with native Parquet geometry types
216+
// This is a bit of a hold-over option for compatibility with systems that
217+
// reject GeoParquet 2.0 metadata, but can read Parquet native geometry types as they simply ignore the extra
218+
// logical type. DuckDB v1.4.0 falls into this category.
219+
BOTH,
220+
221+
// Do not write GeoParquet metadata
222+
// This option suppresses GeoParquet metadata; however, spatial types will be written as
223+
// Parquet native Geometry/Geography.
224+
NONE,
225+
};
226+
202227
struct GeoParquetColumnMetadata {
203228
// The encoding of the geometry column
204229
GeoParquetColumnEncoding geometry_encoding;
@@ -215,6 +240,8 @@ struct GeoParquetColumnMetadata {
215240

216241
class GeoParquetFileMetadata {
217242
public:
243+
GeoParquetFileMetadata(GeoParquetVersion geo_parquet_version) : version(geo_parquet_version) {
244+
}
218245
void AddGeoParquetStats(const string &column_name, const LogicalType &type, const GeometryStats &stats);
219246
void Write(duckdb_parquet::FileMetaData &file_meta_data);
220247

@@ -234,8 +261,8 @@ class GeoParquetFileMetadata {
234261

235262
private:
236263
mutex write_lock;
237-
string version = "1.1.0";
238264
unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
265+
GeoParquetVersion version;
239266
};
240267

241268
} // namespace duckdb

src/duckdb/extension/parquet/include/parquet_writer.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class ParquetWriter {
8585
shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
8686
idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
8787
double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
88-
ParquetVersion parquet_version);
88+
ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
8989
~ParquetWriter();
9090

9191
public:
@@ -95,7 +95,8 @@ class ParquetWriter {
9595
void Finalize();
9696

9797
static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
98-
static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele);
98+
static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
99+
bool allow_geometry);
99100

100101
ClientContext &GetContext() {
101102
return context;
@@ -139,6 +140,9 @@ class ParquetWriter {
139140
ParquetVersion GetParquetVersion() const {
140141
return parquet_version;
141142
}
143+
GeoParquetVersion GetGeoParquetVersion() const {
144+
return geoparquet_version;
145+
}
142146
const string &GetFileName() const {
143147
return file_name;
144148
}
@@ -175,6 +179,7 @@ class ParquetWriter {
175179
bool debug_use_openssl;
176180
shared_ptr<EncryptionUtil> encryption_util;
177181
ParquetVersion parquet_version;
182+
GeoParquetVersion geoparquet_version;
178183
vector<ParquetColumnSchema> column_schemas;
179184

180185
unique_ptr<BufferedFileWriter> writer;

src/duckdb/extension/parquet/parquet_extension.cpp

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ struct ParquetWriteBindData : public TableFunctionData {
238238

239239
//! Which encodings to include when writing
240240
ParquetVersion parquet_version = ParquetVersion::V1;
241+
242+
//! Which geo-parquet version to use when writing
243+
GeoParquetVersion geoparquet_version = GeoParquetVersion::V1;
241244
};
242245

243246
struct ParquetWriteGlobalState : public GlobalFunctionData {
@@ -291,6 +294,7 @@ static void ParquetListCopyOptions(ClientContext &context, CopyOptionsInput &inp
291294
copy_options["binary_as_string"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
292295
copy_options["file_row_number"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
293296
copy_options["can_have_nan"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
297+
copy_options["geoparquet_version"] = CopyOption(LogicalType::VARCHAR, CopyOptionMode::WRITE_ONLY);
294298
}
295299

296300
static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFunctionBindInput &input,
@@ -426,6 +430,19 @@ static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFun
426430
} else {
427431
throw BinderException("Expected parquet_version 'V1' or 'V2'");
428432
}
433+
} else if (loption == "geoparquet_version") {
434+
const auto roption = StringUtil::Upper(option.second[0].ToString());
435+
if (roption == "NONE") {
436+
bind_data->geoparquet_version = GeoParquetVersion::NONE;
437+
} else if (roption == "V1") {
438+
bind_data->geoparquet_version = GeoParquetVersion::V1;
439+
} else if (roption == "V2") {
440+
bind_data->geoparquet_version = GeoParquetVersion::V2;
441+
} else if (roption == "BOTH") {
442+
bind_data->geoparquet_version = GeoParquetVersion::BOTH;
443+
} else {
444+
throw BinderException("Expected geoparquet_version 'NONE', 'V1' or 'BOTH'");
445+
}
429446
} else {
430447
throw InternalException("Unrecognized option for PARQUET: %s", option.first.c_str());
431448
}
@@ -457,7 +474,8 @@ static unique_ptr<GlobalFunctionData> ParquetWriteInitializeGlobal(ClientContext
457474
parquet_bind.field_ids.Copy(), parquet_bind.kv_metadata, parquet_bind.encryption_config,
458475
parquet_bind.dictionary_size_limit, parquet_bind.string_dictionary_page_size_limit,
459476
parquet_bind.enable_bloom_filters, parquet_bind.bloom_filter_false_positive_ratio,
460-
parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version);
477+
parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version,
478+
parquet_bind.geoparquet_version);
461479
return std::move(global_state);
462480
}
463481

@@ -626,6 +644,39 @@ ParquetVersion EnumUtil::FromString<ParquetVersion>(const char *value) {
626644
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
627645
}
628646

647+
template <>
648+
const char *EnumUtil::ToChars<GeoParquetVersion>(GeoParquetVersion value) {
649+
switch (value) {
650+
case GeoParquetVersion::NONE:
651+
return "NONE";
652+
case GeoParquetVersion::V1:
653+
return "V1";
654+
case GeoParquetVersion::V2:
655+
return "V2";
656+
case GeoParquetVersion::BOTH:
657+
return "BOTH";
658+
default:
659+
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
660+
}
661+
}
662+
663+
template <>
664+
GeoParquetVersion EnumUtil::FromString<GeoParquetVersion>(const char *value) {
665+
if (StringUtil::Equals(value, "NONE")) {
666+
return GeoParquetVersion::NONE;
667+
}
668+
if (StringUtil::Equals(value, "V1")) {
669+
return GeoParquetVersion::V1;
670+
}
671+
if (StringUtil::Equals(value, "V2")) {
672+
return GeoParquetVersion::V2;
673+
}
674+
if (StringUtil::Equals(value, "BOTH")) {
675+
return GeoParquetVersion::BOTH;
676+
}
677+
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
678+
}
679+
629680
static optional_idx SerializeCompressionLevel(const int64_t compression_level) {
630681
return compression_level < 0 ? NumericLimits<idx_t>::Maximum() - NumericCast<idx_t>(AbsValue(compression_level))
631682
: NumericCast<idx_t>(compression_level);
@@ -679,6 +730,8 @@ static void ParquetCopySerialize(Serializer &serializer, const FunctionData &bin
679730
serializer.WritePropertyWithDefault(115, "string_dictionary_page_size_limit",
680731
bind_data.string_dictionary_page_size_limit,
681732
default_value.string_dictionary_page_size_limit);
733+
serializer.WritePropertyWithDefault(116, "geoparquet_version", bind_data.geoparquet_version,
734+
default_value.geoparquet_version);
682735
}
683736

684737
static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserializer, CopyFunction &function) {
@@ -711,6 +764,8 @@ static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserialize
711764
deserializer.ReadPropertyWithExplicitDefault(114, "parquet_version", default_value.parquet_version);
712765
data->string_dictionary_page_size_limit = deserializer.ReadPropertyWithExplicitDefault(
713766
115, "string_dictionary_page_size_limit", default_value.string_dictionary_page_size_limit);
767+
data->geoparquet_version =
768+
deserializer.ReadPropertyWithExplicitDefault(116, "geoparquet_version", default_value.geoparquet_version);
714769

715770
return std::move(data);
716771
}

src/duckdb/extension/parquet/parquet_writer.cpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,15 +166,16 @@ Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type
166166
throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
167167
}
168168

169-
void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele) {
169+
void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
170+
bool allow_geometry) {
170171
if (duckdb_type.IsJSONType()) {
171172
schema_ele.converted_type = ConvertedType::JSON;
172173
schema_ele.__isset.converted_type = true;
173174
schema_ele.__isset.logicalType = true;
174175
schema_ele.logicalType.__set_JSON(duckdb_parquet::JsonType());
175176
return;
176177
}
177-
if (duckdb_type.GetAlias() == "WKB_BLOB") {
178+
if (duckdb_type.GetAlias() == "WKB_BLOB" && allow_geometry) {
178179
schema_ele.__isset.logicalType = true;
179180
schema_ele.logicalType.__isset.GEOMETRY = true;
180181
// TODO: Set CRS in the future
@@ -356,14 +357,16 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
356357
shared_ptr<ParquetEncryptionConfig> encryption_config_p,
357358
optional_idx dictionary_size_limit_p, idx_t string_dictionary_page_size_limit_p,
358359
bool enable_bloom_filters_p, double bloom_filter_false_positive_ratio_p,
359-
int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version)
360+
int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version,
361+
GeoParquetVersion geoparquet_version)
360362
: context(context), file_name(std::move(file_name_p)), sql_types(std::move(types_p)),
361363
column_names(std::move(names_p)), codec(codec), field_ids(std::move(field_ids_p)),
362364
encryption_config(std::move(encryption_config_p)), dictionary_size_limit(dictionary_size_limit_p),
363365
string_dictionary_page_size_limit(string_dictionary_page_size_limit_p),
364366
enable_bloom_filters(enable_bloom_filters_p),
365367
bloom_filter_false_positive_ratio(bloom_filter_false_positive_ratio_p), compression_level(compression_level_p),
366-
debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), total_written(0), num_row_groups(0) {
368+
debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), geoparquet_version(geoparquet_version),
369+
total_written(0), num_row_groups(0) {
367370

368371
// initialize the file writer
369372
writer = make_uniq<BufferedFileWriter>(fs, file_name.c_str(),
@@ -416,10 +419,13 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
416419
auto &unique_names = column_names;
417420
VerifyUniqueNames(unique_names);
418421

422+
// V1 GeoParquet stores geometries as blobs, no logical type
423+
auto allow_geometry = geoparquet_version != GeoParquetVersion::V1;
424+
419425
// construct the child schemas
420426
for (idx_t i = 0; i < sql_types.size(); i++) {
421-
auto child_schema =
422-
ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i], &field_ids);
427+
auto child_schema = ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i],
428+
allow_geometry, &field_ids);
423429
column_schemas.push_back(std::move(child_schema));
424430
}
425431
// now construct the writers based on the schemas
@@ -975,7 +981,8 @@ void ParquetWriter::Finalize() {
975981
}
976982

977983
// Add geoparquet metadata to the file metadata
978-
if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context)) {
984+
if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context) &&
985+
geoparquet_version != GeoParquetVersion::NONE) {
979986
geoparquet_data->Write(file_meta_data);
980987
}
981988

@@ -1005,7 +1012,7 @@ void ParquetWriter::Finalize() {
10051012

10061013
GeoParquetFileMetadata &ParquetWriter::GetGeoParquetData() {
10071014
if (!geoparquet_data) {
1008-
geoparquet_data = make_uniq<GeoParquetFileMetadata>();
1015+
geoparquet_data = make_uniq<GeoParquetFileMetadata>(geoparquet_version);
10091016
}
10101017
return *geoparquet_data;
10111018
}

0 commit comments

Comments
 (0)