[C++][Parquet] Failed to read deep nested two-level encoding list structure #49114

@HuaHuaY

Description

Describe the bug, including details regarding any error messages, version, and platform.

An error with the message "Invalid: LIST-annotated groups must not be repeated" occurs when converting the schema of a Parquet file containing a deeply nested two-level-encoded list into an Arrow schema. I provide a method for generating such a Parquet file below. The issue can also be reproduced by adding this code to the unit test TestConvertParquetSchema.ParquetLists:

    // Deep nested two-level encoding List<List<List<Integer>>>:
    // optional group my_list (LIST) {
    //   repeated group array (LIST) {
    //     repeated group array (LIST) {
    //       repeated int32 array;
    //     }
    //   }
    // }
    {
      auto inner_array =
          PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
      auto middle_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array},
                                          ConvertedType::LIST);
      auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {middle_array},
                                         ConvertedType::LIST);
      parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
                                               {outer_array}, ConvertedType::LIST));
      auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false);
      auto arrow_middle_array = ::arrow::field(
          "array", list_case.type_factory(arrow_inner_array), /*nullable=*/false);
      auto arrow_outer_array = ::arrow::field(
          "array", list_case.type_factory(arrow_middle_array), /*nullable=*/false);
      auto arrow_list = list_case.type_factory(arrow_outer_array);
      arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
    }
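
For reference, here is a minimal standalone sketch (my own, not part of the original test) that builds the same two-level-encoded schema and converts it with parquet::arrow::FromParquetSchema; on affected builds I expect it to report the same "Invalid: LIST-annotated groups must not be repeated" error:

    #include <iostream>
    #include <memory>

    #include "arrow/api.h"
    #include "parquet/arrow/schema.h"
    #include "parquet/schema.h"

    int main() {
      using parquet::ConvertedType;
      using parquet::Repetition;
      using parquet::schema::GroupNode;
      using parquet::schema::PrimitiveNode;

      // Same two-level encoded List<List<List<Integer>>> as in the test snippet above.
      auto inner = PrimitiveNode::Make("array", Repetition::REPEATED, parquet::Type::INT32);
      auto middle = GroupNode::Make("array", Repetition::REPEATED, {inner}, ConvertedType::LIST);
      auto outer = GroupNode::Make("array", Repetition::REPEATED, {middle}, ConvertedType::LIST);
      auto my_list = GroupNode::Make("my_list", Repetition::OPTIONAL, {outer}, ConvertedType::LIST);
      auto root = GroupNode::Make("schema", Repetition::REQUIRED, {my_list});

      parquet::SchemaDescriptor descriptor;
      descriptor.Init(root);

      // Converting the Parquet schema to an Arrow schema is where the error surfaces.
      std::shared_ptr<arrow::Schema> arrow_schema;
      auto status = parquet::arrow::FromParquetSchema(&descriptor, &arrow_schema);
      std::cout << status.ToString() << std::endl;
      return status.ok() ? 0 : 1;
    }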

Schema of the Parquet file:

message complex_record {
  required group list_of_lists_of_lists (LIST) {
    repeated group array (LIST) {
      repeated group array (LIST) {
        repeated binary array (STRING);
      }
    }
  }
}

The Parquet file can be produced by this Java program:

package org.example;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.io.LocalOutputFile;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public class ComplexParquetWriter {

  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse("{"
        + "\"type\":\"record\","
        + "\"name\":\"complex_record\","
        + "\"fields\":["
        + "  {\"name\":\"list_of_lists_of_lists\", \"type\":{\"type\":\"array\", \"items\":{\"type\":\"array\", \"items\":{\"type\":\"array\", \"items\":\"string\"}}}}"
        + "]"
        + "}");

    GenericRecord record = new GenericData.Record(schema);

    Schema listOfListsOfLists = schema.getField("list_of_lists_of_lists").schema();
    record.put("list_of_lists_of_lists", new GenericData.Array<List<List<String>>>(2, listOfListsOfLists) {
      {
        add(new ArrayList<List<String>>() {
          {
            add(new ArrayList<String>() {
              {
                add("deep1");
              }
            });
          }
        });
        add(new ArrayList<List<String>>() {
          {
            add(new ArrayList<String>() {
              {
                add("deep2");
                add("deep3");
              }
            });
            add(new ArrayList<String>() {
              {
                add("deep4");
              }
            });
          }
        });
      }
    });

    Configuration conf = new Configuration();
    Path file = new File("./complex_structure.parquet").toPath();
    LocalOutputFile outputFile = new LocalOutputFile(file);
    try {
      ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(outputFile)
          .withSchema(schema)
          .withConf(conf)
          .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
          .build();
      writer.write(record);
      writer.close();
      System.out.println("Successfully written to " + file.toString());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
}
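
Reading the generated complex_structure.parquet back with the Arrow C++ Parquet reader hits the same error during schema conversion. A minimal sketch of that read path (my own, assuming the file written above, the usual parquet::arrow::OpenFile / FileReader::GetSchema API, and an affected Arrow/Parquet C++ build):

    #include <iostream>
    #include <memory>

    #include "arrow/api.h"
    #include "arrow/io/file.h"
    #include "parquet/arrow/reader.h"

    int main() {
      // Open the file produced by the Java writer above.
      auto maybe_input = arrow::io::ReadableFile::Open("complex_structure.parquet");
      if (!maybe_input.ok()) {
        std::cerr << maybe_input.status().ToString() << std::endl;
        return 1;
      }

      std::unique_ptr<parquet::arrow::FileReader> reader;
      auto status = parquet::arrow::OpenFile(*maybe_input, arrow::default_memory_pool(), &reader);

      if (status.ok()) {
        // Schema conversion happens here and fails with
        // "Invalid: LIST-annotated groups must not be repeated" on affected builds.
        std::shared_ptr<arrow::Schema> schema;
        status = reader->GetSchema(&schema);
      }
      std::cout << status.ToString() << std::endl;
      return status.ok() ? 0 : 1;
    }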

Component(s)

C++, Parquet
