Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ set(ARROW_SRCS
extension_type.cc
extension/bool8.cc
extension/json.cc
extension/parquet_variant.cc
extension/uuid.cc
pretty_print.cc
record_batch.cc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,19 @@
// specific language governing permissions and limitations
// under the License.

#include "parquet/arrow/variant_internal.h"
#include "arrow/extension/parquet_variant.h"

#include <string>

#include "arrow/extension_type.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/logging_internal.h"

namespace parquet::arrow {
namespace arrow::extension {

using ::arrow::Array;
using ::arrow::ArrayData;
using ::arrow::DataType;
using ::arrow::ExtensionType;
using ::arrow::Result;
using ::arrow::Type;

VariantExtensionType::VariantExtensionType(
const std::shared_ptr<::arrow::DataType>& storage_type)
: ::arrow::ExtensionType(storage_type) {
VariantExtensionType::VariantExtensionType(const std::shared_ptr<DataType>& storage_type)
: ExtensionType(storage_type) {
// GH-45948: Shredded variants will need to handle an optional shredded_value as
// well as value_ becoming optional.

Expand Down Expand Up @@ -66,14 +57,13 @@ std::string VariantExtensionType::Serialize() const { return ""; }
std::shared_ptr<Array> VariantExtensionType::MakeArray(
std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("parquet.variant",
::arrow::internal::checked_cast<const ExtensionType&>(*data->type)
.extension_name());
DCHECK_EQ("arrow.parquet.variant",
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<VariantArray>(data);
}

namespace {
bool IsBinaryField(const std::shared_ptr<::arrow::Field> field) {
bool IsBinaryField(const std::shared_ptr<Field> field) {
return field->type()->storage_id() == Type::BINARY ||
field->type()->storage_id() == Type::LARGE_BINARY;
}
Expand Down Expand Up @@ -116,8 +106,8 @@ bool VariantExtensionType::IsSupportedStorageType(
Result<std::shared_ptr<DataType>> VariantExtensionType::Make(
std::shared_ptr<DataType> storage_type) {
if (!IsSupportedStorageType(storage_type)) {
return ::arrow::Status::Invalid("Invalid storage type for VariantExtensionType: ",
storage_type->ToString());
return Status::Invalid("Invalid storage type for VariantExtensionType: ",
storage_type->ToString());
}

return std::make_shared<VariantExtensionType>(std::move(storage_type));
Expand All @@ -130,4 +120,4 @@ std::shared_ptr<DataType> variant(std::shared_ptr<DataType> storage_type) {
return VariantExtensionType::Make(std::move(storage_type)).ValueOrDie();
}

} // namespace parquet::arrow
} // namespace arrow::extension
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,16 @@

#pragma once

#include <stdexcept>
#include <string>

#include "arrow/extension_type.h"
#include "parquet/platform.h"
#include "arrow/util/visibility.h"

namespace parquet::arrow {
namespace arrow::extension {

class PARQUET_EXPORT VariantArray : public ::arrow::ExtensionArray {
class ARROW_EXPORT VariantArray : public ExtensionArray {
public:
using ::arrow::ExtensionArray::ExtensionArray;
using ExtensionArray::ExtensionArray;
};

/// EXPERIMENTAL: Variant is not yet fully supported.
Expand All @@ -46,41 +45,37 @@ class PARQUET_EXPORT VariantArray : public ::arrow::ExtensionArray {
///
/// To read more about variant shredding, see the variant shredding spec at
/// https://github.com/apache/parquet-format/blob/master/VariantShredding.md
class PARQUET_EXPORT VariantExtensionType : public ::arrow::ExtensionType {
class ARROW_EXPORT VariantExtensionType : public ExtensionType {
public:
explicit VariantExtensionType(const std::shared_ptr<::arrow::DataType>& storage_type);
explicit VariantExtensionType(const std::shared_ptr<DataType>& storage_type);

std::string extension_name() const override { return "parquet.variant"; }
std::string extension_name() const override { return "arrow.parquet.variant"; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change looks good.

Do you think it is worth adding a kVariantExtensionName in this file and move this file to cpp/src/arrow/extension/parquet_variant.h?

WDYT? @pitrou

Copy link
Contributor Author

@HuaHuaY HuaHuaY Jan 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JsonExtensionType also uses hardcoded extension name. If we want to define string literals, I suggest to move other's extension types' names into a same file.

Copy link
Contributor Author

@HuaHuaY HuaHuaY Jan 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to move variant_internal.h to cpp/src/arrow/extension/ like other extension types?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's exactly what I proposed above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have moved variant_internal.cc to cpp/src/arrow/extension/parquet_variant.cc.


bool ExtensionEquals(const ::arrow::ExtensionType& other) const override;
bool ExtensionEquals(const ExtensionType& other) const override;

::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
std::shared_ptr<::arrow::DataType> storage_type,
Result<std::shared_ptr<DataType>> Deserialize(
std::shared_ptr<DataType> storage_type,
const std::string& serialized_data) const override;

std::string Serialize() const override;

std::shared_ptr<::arrow::Array> MakeArray(
std::shared_ptr<::arrow::ArrayData> data) const override;
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

static ::arrow::Result<std::shared_ptr<::arrow::DataType>> Make(
std::shared_ptr<::arrow::DataType> storage_type);
static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType> storage_type);

static bool IsSupportedStorageType(
const std::shared_ptr<::arrow::DataType>& storage_type);
static bool IsSupportedStorageType(const std::shared_ptr<DataType>& storage_type);

std::shared_ptr<::arrow::Field> metadata() const { return metadata_; }
std::shared_ptr<Field> metadata() const { return metadata_; }

std::shared_ptr<::arrow::Field> value() const { return value_; }
std::shared_ptr<Field> value() const { return value_; }

private:
// TODO GH-45948 added shredded_value
std::shared_ptr<::arrow::Field> metadata_;
std::shared_ptr<::arrow::Field> value_;
std::shared_ptr<Field> metadata_;
std::shared_ptr<Field> value_;
};

/// \brief Return a VariantExtensionType instance.
PARQUET_EXPORT std::shared_ptr<::arrow::DataType> variant(
std::shared_ptr<::arrow::DataType> storage_type);
ARROW_EXPORT std::shared_ptr<DataType> variant(std::shared_ptr<DataType> storage_type);

} // namespace parquet::arrow
} // namespace arrow::extension
1 change: 0 additions & 1 deletion cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,6 @@ set(PARQUET_SRCS
arrow/reader_internal.cc
arrow/schema.cc
arrow/schema_internal.cc
arrow/variant_internal.cc
arrow/writer.cc
bloom_filter.cc
bloom_filter_reader.cc
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include "parquet/arrow/reader.h"
#include "parquet/arrow/reader_internal.h"
#include "parquet/arrow/schema.h"
#include "parquet/arrow/variant_internal.h"
#include "parquet/file_reader.h"
#include "parquet/schema.h"
#include "parquet/schema_internal.h"
Expand All @@ -34,6 +33,7 @@

#include "arrow/array.h"
#include "arrow/extension/json.h"
#include "arrow/extension/parquet_variant.h"
#include "arrow/extension/uuid.h"
#include "arrow/ipc/writer.h"
#include "arrow/testing/extension_type.h"
Expand Down Expand Up @@ -950,7 +950,7 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) {
auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false);
auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false);
auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value});
auto variant_extension = std::make_shared<VariantExtensionType>(arrow_variant);
auto variant_extension = ::arrow::extension::variant(arrow_variant);

{
// Parquet file does not contain Arrow schema.
Expand Down
23 changes: 13 additions & 10 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <vector>

#include "arrow/extension/json.h"
#include "arrow/extension/parquet_variant.h"
#include "arrow/extension/uuid.h"
#include "arrow/extension_type.h"
#include "arrow/io/memory.h"
Expand All @@ -36,7 +37,6 @@
#include "arrow/util/value_parsing.h"

#include "parquet/arrow/schema_internal.h"
#include "parquet/arrow/variant_internal.h"
#include "parquet/exception.h"
#include "parquet/geospatial/util_json_internal.h"
#include "parquet/metadata.h"
Expand Down Expand Up @@ -129,10 +129,11 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::strin
return Status::OK();
}

Status VariantToNode(const std::shared_ptr<VariantExtensionType>& type,
const std::string& name, bool nullable, int field_id,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties, NodePtr* out) {
Status VariantToNode(
const std::shared_ptr<::arrow::extension::VariantExtensionType>& type,
const std::string& name, bool nullable, int field_id,
const WriterProperties& properties, const ArrowWriterProperties& arrow_properties,
NodePtr* out) {
NodePtr metadata_node;
RETURN_NOT_OK(FieldToNode("metadata", type->metadata(), properties, arrow_properties,
&metadata_node));
Expand Down Expand Up @@ -485,8 +486,10 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
ARROW_ASSIGN_OR_RAISE(logical_type,
LogicalTypeFromGeoArrowMetadata(ext_type->Serialize()));
break;
} else if (ext_type->extension_name() == std::string("parquet.variant")) {
auto variant_type = std::static_pointer_cast<VariantExtensionType>(field->type());
} else if (ext_type->extension_name() == std::string("arrow.parquet.variant")) {
auto variant_type =
std::static_pointer_cast<::arrow::extension::VariantExtensionType>(
field->type());

return VariantToNode(variant_type, name, field->nullable(), field_id, properties,
arrow_properties, out);
Expand Down Expand Up @@ -597,7 +600,7 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
auto struct_type = ::arrow::struct_(arrow_fields);
if (ctx->properties.get_arrow_extensions_enabled() &&
node.logical_type()->is_variant()) {
auto extension_type = ::arrow::GetExtensionType("parquet.variant");
auto extension_type = ::arrow::GetExtensionType("arrow.parquet.variant");
if (extension_type) {
ARROW_ASSIGN_OR_RAISE(
struct_type,
Expand Down Expand Up @@ -1147,10 +1150,10 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer
extension_supports_inferred_storage =
arrow_extension_inferred ||
::arrow::extension::UuidType::IsSupportedStorageType(inferred_type);
} else if (origin_extension_name == "parquet.variant") {
} else if (origin_extension_name == "arrow.parquet.variant") {
extension_supports_inferred_storage =
arrow_extension_inferred ||
VariantExtensionType::IsSupportedStorageType(inferred_type);
::arrow::extension::VariantExtensionType::IsSupportedStorageType(inferred_type);
} else {
extension_supports_inferred_storage =
origin_extension_type.storage_type()->Equals(*inferred_type);
Expand Down
21 changes: 12 additions & 9 deletions cpp/src/parquet/arrow/variant_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@
// specific language governing permissions and limitations
// under the License.

#include "parquet/arrow/variant_internal.h"

#include "arrow/array/validate.h"
#include "arrow/extension/parquet_variant.h"
#include "arrow/ipc/test_common.h"
#include "arrow/record_batch.h"
#include "arrow/testing/gtest_util.h"
Expand All @@ -29,16 +28,20 @@ using ::arrow::binary;
using ::arrow::struct_;

TEST(TestVariantExtensionType, StorageTypeValidation) {
auto variant1 = variant(struct_({field("metadata", binary(), /*nullable=*/false),
field("value", binary(), /*nullable=*/false)}));
auto variant2 = variant(struct_({field("metadata", binary(), /*nullable=*/false),
field("value", binary(), /*nullable=*/false)}));
auto variant1 = ::arrow::extension::variant(
struct_({field("metadata", binary(), /*nullable=*/false),
field("value", binary(), /*nullable=*/false)}));
auto variant2 = ::arrow::extension::variant(
struct_({field("metadata", binary(), /*nullable=*/false),
field("value", binary(), /*nullable=*/false)}));

ASSERT_TRUE(variant1->Equals(variant2));

// Metadata and value fields can be provided in either order
auto variantFieldsFlipped = std::dynamic_pointer_cast<VariantExtensionType>(
variant(struct_({field("value", binary(), /*nullable=*/false),
auto variantFieldsFlipped =
std::dynamic_pointer_cast<::arrow::extension::VariantExtensionType>(
::arrow::extension::variant(
struct_({field("value", binary(), /*nullable=*/false),
field("metadata", binary(), /*nullable=*/false)})));

ASSERT_EQ("metadata", variantFieldsFlipped->metadata()->name());
Expand All @@ -62,7 +65,7 @@ TEST(TestVariantExtensionType, StorageTypeValidation) {
Invalid,
"Invalid: Invalid storage type for VariantExtensionType: " +
storage_type->ToString(),
VariantExtensionType::Make(storage_type));
::arrow::extension::VariantExtensionType::Make(storage_type));
}
}

Expand Down
Loading