Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 57 additions & 30 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}

// Deeply nested two-level encoding List<List<List<Integer>>>:
// optional group my_list (LIST) {
// repeated group array (LIST) {
// repeated group array (LIST) {
// repeated int32 array;
// }
// }
// }
{
// Parquet side: a repeated int32 leaf wrapped by two repeated LIST-annotated
// groups, all named "array", under the optional LIST-annotated "my_list" group.
auto inner_array =
PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
auto middle_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array},
ConvertedType::LIST);
auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {middle_array},
ConvertedType::LIST);
parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
{outer_array}, ConvertedType::LIST));
// Expected Arrow side: every nested "array" field is non-nullable; only the
// top-level "my_list" field is nullable (matching the OPTIONAL outer group).
auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false);
auto arrow_middle_array = ::arrow::field(
"array", list_case.type_factory(arrow_inner_array), /*nullable=*/false);
auto arrow_outer_array = ::arrow::field(
"array", list_case.type_factory(arrow_middle_array), /*nullable=*/false);
auto arrow_list = list_case.type_factory(arrow_outer_array);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}

// List<Map<String, String>> in three-level list encoding:
// optional group my_list (LIST) {
// repeated group list {
Expand Down Expand Up @@ -681,6 +707,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
}

// List<Map<String, String>> in two-level list encoding:
//
// optional group my_list (LIST) {
// repeated group array (MAP) {
// repeated group key_value {
// required binary key (STRING);
// optional binary value (STRING);
// }
// }
// }
{
// Parquet side: a repeated MAP-annotated group "array" placed directly under the
// optional LIST-annotated "my_list" group, i.e. the map is the list element.
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
auto array =
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));

// Expected Arrow side: a non-nullable map element field named "array" (required
// key, nullable value) inside a nullable "my_list" list field.
auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
auto arrow_element = ::arrow::field(
"array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
/*nullable=*/false);
auto arrow_list = list_case.type_factory(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
}

auto arrow_schema = ::arrow::schema(arrow_fields);

ArrowReaderProperties props;
Expand Down Expand Up @@ -844,34 +900,6 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
}

TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
// List<Map<String, String>> in two-level list encoding:
//
// optional group my_list (LIST) {
// repeated group array (MAP) {
// repeated group key_value {
// required binary key (STRING);
// optional binary value (STRING);
// }
// }
// }
{
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
auto array =
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
std::vector<NodePtr> parquet_fields;
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("Group with one repeated child must be LIST-annotated."),
ConvertSchema(parquet_fields));
}

// List<List<String>>: outer list is two-level encoding, inner list is three-level
//
// optional group my_list (LIST) {
Expand Down Expand Up @@ -912,8 +940,7 @@ TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("LIST-annotated groups must have at least one child."),
Invalid, testing::HasSubstr("Group must have at least one child."),
ConvertSchema(parquet_fields));
}
}
Expand Down
159 changes: 71 additions & 88 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/api.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/base64.h"
#include "arrow/util/checked_cast.h"
Expand Down Expand Up @@ -620,7 +621,8 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
if (group.field_count() != 1) {
return Status::Invalid("MAP-annotated groups must have a single child.");
}
if (group.is_repeated()) {
if (group.is_repeated() &&
(group.parent() == nullptr || !group.parent()->logical_type()->is_list())) {
return Status::Invalid("MAP-annotated groups must not be repeated.");
}

Expand Down Expand Up @@ -651,7 +653,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
return ListToSchemaField(group, current_levels, ctx, parent, out);
}

current_levels.Increment(group);
if (group.is_optional()) {
current_levels.IncrementOptional();
}
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();

out->children.resize(1);
Expand Down Expand Up @@ -694,17 +698,76 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
return Status::OK();
}

// Builds the element field of a LIST-annotated group.
//
// `group` is the LIST-annotated group and `list_node` is its single repeated child.
// The resolved element is written into `child_field`; `out` is forwarded to the
// per-node converters as the parent schema field. Returns Status::Invalid when the
// structure is not an accepted list layout.
Status ResolveList(const GroupNode& group, const Node& list_node,
                   LevelInfo current_levels, SchemaTreeContext* ctx,
                   const SchemaField* out, SchemaField* child_field) {
  // A repeated LIST-annotated group using the 2-level structure is accepted only
  // as the element of another LIST-annotated group; anything else is rejected.
  const auto reject_stray_repeated_list = [](const GroupNode& annotated) -> Status {
    if (!annotated.is_repeated()) {
      return Status::OK();
    }
    const auto* enclosing = annotated.parent();
    if (enclosing != nullptr && enclosing->logical_type()->is_list()) {
      return Status::OK();
    }
    return Status::Invalid("LIST-annotated groups must not be repeated.");
  };

  // Repeated primitive child: the primitive itself is the (non-nullable) element.
  if (!list_node.is_group()) {
    RETURN_NOT_OK(reject_stray_repeated_list(group));
    const auto& leaf = static_cast<const PrimitiveNode&>(list_node);
    int column_index = ctx->schema->GetColumnIndex(leaf);
    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> leaf_type,
                          GetTypeForNode(column_index, leaf, ctx));
    auto element = ::arrow::field(list_node.name(), leaf_type, /*nullable=*/false,
                                  FieldIdMetadata(list_node.field_id()));
    return PopulateLeaf(column_index, element, current_levels, ctx, out, child_field);
  }

  const auto& element_group = static_cast<const GroupNode&>(list_node);
  const auto child_count = element_group.field_count();

  if (child_count > 1) {
    // Several fields inside the repeated group: the element is a struct.
    RETURN_NOT_OK(reject_stray_repeated_list(group));
    return GroupToStruct(element_group, current_levels, ctx, out, child_field);
  }
  if (child_count == 0) {
    return Status::Invalid("Group must have at least one child.");
  }

  // Exactly one child from here on.
  const bool unannotated = element_group.logical_type()->is_none();
  if (unannotated && HasListElementName(element_group, group)) {
    // Backward-compatibility rule 4
    RETURN_NOT_OK(reject_stray_repeated_list(group));
    return GroupToStruct(element_group, current_levels, ctx, out, child_field);
  }

  const auto& only_child = element_group.field(0);
  if (!unannotated || only_child->is_repeated()) {
    // The repeated group is itself the element: dispatch on its annotation.
    RETURN_NOT_OK(reject_stray_repeated_list(group));
    const auto& annotation = element_group.logical_type();
    if (annotation->is_list()) {
      return ListToSchemaField(element_group, current_levels, ctx, out, child_field);
    }
    if (annotation->is_map()) {
      return MapToSchemaField(element_group, current_levels, ctx, out, child_field);
    }
    return GroupToStruct(element_group, current_levels, ctx, out, child_field);
  }

  // Remaining layout: an unannotated repeated group wrapping one non-repeated
  // child, so the child is the element. Here `group` may never be repeated, not
  // even as the element of an enclosing list.
  if (group.is_repeated()) {
    return Status::Invalid("LIST-annotated groups must not be repeated.");
  }
  return NodeToSchemaField(*only_child, current_levels, ctx, out, child_field);
}

Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
SchemaTreeContext* ctx, const SchemaField* parent,
SchemaField* out) {
if (group.field_count() != 1) {
return Status::Invalid("LIST-annotated groups must have a single child.");
}
if (group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}

current_levels.Increment(group);
if (group.is_optional()) {
current_levels.IncrementOptional();
}

out->children.resize(group.field_count());
SchemaField* child_field = &out->children[0];
Expand All @@ -720,88 +783,8 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
}

int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
if (list_node.is_group()) {
const auto& list_group = static_cast<const GroupNode&>(list_node);
if (list_group.field_count() > 1) {
// The inner type of the list should be a struct when there are multiple fields
// in the repeated group
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
} else if (list_group.field_count() == 1) {
const auto& repeated_field = list_group.field(0);
if (repeated_field->is_repeated()) {
// Special case where the inner type might be a list with two-level encoding
// like below:
//
// required/optional group name=SOMETHING (LIST) {
// repeated group array (LIST) {
// repeated TYPE item;
// }
// }
//
// yields list<item: list<item: TYPE not null> not null> ?nullable
if (!list_group.logical_type()->is_list()) {
return Status::Invalid("Group with one repeated child must be LIST-annotated.");
}
// LIST-annotated group with three-level encoding cannot be repeated.
if (repeated_field->is_group()) {
auto& repeated_group_field = static_cast<const GroupNode&>(*repeated_field);
if (repeated_group_field.field_count() == 0) {
return Status::Invalid("LIST-annotated groups must have at least one child.");
}
if (!repeated_group_field.field(0)->is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}
}
RETURN_NOT_OK(
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
} else if (HasListElementName(list_group, group)) {
// We distinguish the special case that we have
//
// required/optional group name=SOMETHING {
// repeated group name=array or $SOMETHING_tuple {
// required/optional TYPE item;
// }
// }
//
// The inner type of the list should be a struct rather than a primitive value
//
// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
} else {
// Resolve 3-level encoding
//
// required/optional group name=whatever {
// repeated group name=list {
// required/optional TYPE item;
// }
// }
//
// yields list<item: TYPE ?nullable> ?nullable
RETURN_NOT_OK(
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
}
} else {
return Status::Invalid("Group must have at least one child.");
}
} else {
// Two-level list encoding
//
// required/optional group LIST {
// repeated TYPE;
// }
//
// TYPE is a primitive type
//
// yields list<item: TYPE not null> ?nullable
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
int column_index = ctx->schema->GetColumnIndex(primitive_node);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
GetTypeForNode(column_index, primitive_node, ctx));
auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
FieldIdMetadata(list_node.field_id()));
RETURN_NOT_OK(
PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
}
RETURN_NOT_OK(ResolveList(group, list_node, current_levels, ctx, out, child_field));

ARROW_ASSIGN_OR_RAISE(auto list_type,
MakeArrowList(child_field->field, ctx->properties));
out->field = ::arrow::field(group.name(), std::move(list_type), group.is_optional(),
Expand Down
Loading