@@ -20,9 +20,6 @@ void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &
2020}
2121
2222void DeltaByteArrayDecoder::InitializePage () {
23- if (reader.Type ().InternalType () != PhysicalType::VARCHAR) {
24- throw std::runtime_error (" Delta Byte Array encoding is only supported for string/blob data" );
25- }
2623 auto &block = *reader.block ;
2724 auto &allocator = reader.reader .allocator ;
2825 idx_t prefix_count, suffix_count;
@@ -33,71 +30,77 @@ void DeltaByteArrayDecoder::InitializePage() {
3330 if (prefix_count != suffix_count) {
3431 throw std::runtime_error (" DELTA_BYTE_ARRAY - prefix and suffix counts are different - corrupt file?" );
3532 }
33+
34+ auto prefix_data = reinterpret_cast <uint32_t *>(prefix_buffer.ptr );
35+ auto suffix_data = reinterpret_cast <uint32_t *>(suffix_buffer.ptr );
36+
37+ // Allocate the plain data buffer
38+ if (!plain_data) {
39+ plain_data = make_shared_ptr<ResizeableBuffer>();
40+ }
41+ plain_data->reset ();
42+
3643 if (prefix_count == 0 ) {
37- // no values
38- byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, nullptr );
44+ plain_data->resize (allocator, 0 );
3945 return ;
4046 }
41- auto prefix_data = reinterpret_cast <uint32_t *>(prefix_buffer.ptr );
42- auto suffix_data = reinterpret_cast <uint32_t *>(suffix_buffer.ptr );
43- byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, prefix_count);
44- byte_array_count = prefix_count;
45- delta_offset = 0 ;
46- auto string_data = FlatVector::GetData<string_t >(*byte_array_data);
47+
48+ // Decode DELTA_BYTE_ARRAY into plain Parquet page format
49+ // Plain format for BYTE_ARRAY: [4-byte length][data] repeated
50+ // Plain format for FIXED_LEN_BYTE_ARRAY: [data] repeated (no length prefix)
51+ auto &schema = reader.Schema ();
52+ bool is_fixed_len = (schema.parquet_type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY);
53+ idx_t fixed_len = is_fixed_len ? schema.type_length : 0 ;
54+
55+ // Calculate total buffer size and max value length in one pass
56+ idx_t total_size = 0 ;
57+ idx_t max_len = 0 ;
4758 for (idx_t i = 0 ; i < prefix_count; i++) {
48- auto str_len = prefix_data[i] + suffix_data[i];
49- block.available (suffix_data[i]);
50- string_data[i] = StringVector::EmptyString (*byte_array_data, str_len);
51- auto result_data = string_data[i].GetDataWriteable ();
52- if (prefix_data[i] > 0 ) {
53- if (i == 0 || prefix_data[i] > string_data[i - 1 ].GetSize ()) {
54- throw std::runtime_error (" DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?" );
55- }
56- memcpy (result_data, string_data[i - 1 ].GetData (), prefix_data[i]);
59+ idx_t len = prefix_data[i] + suffix_data[i];
60+ if (is_fixed_len && len != fixed_len) {
61+ throw std::runtime_error (
62+ " DELTA_BYTE_ARRAY on FIXED_LEN_BYTE_ARRAY: decoded length does not match type length" );
5763 }
58- memcpy (result_data + prefix_data[i], block.ptr , suffix_data[i]);
59- block.inc (suffix_data[i]);
60- string_data[i].Finalize ();
64+ total_size += len + (is_fixed_len ? 0 : sizeof (uint32_t ));
65+ max_len = MaxValue (max_len, len);
6166 }
62- }
6367
64- void DeltaByteArrayDecoder::Read (uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
65- if (!byte_array_data) {
66- throw std::runtime_error (" Internal error - DeltaByteArray called but there was no byte_array_data set" );
67- }
68- auto result_ptr = FlatVector::GetData<string_t >(result);
69- auto &result_mask = FlatVector::Validity (result);
70- auto string_data = FlatVector::GetData<string_t >(*byte_array_data);
71- for (idx_t row_idx = 0 ; row_idx < read_count; row_idx++) {
72- if (defines && defines[row_idx + result_offset] != reader.MaxDefine ()) {
73- result_mask.SetInvalid (row_idx + result_offset);
74- continue ;
68+ plain_data->resize (allocator, total_size);
69+ unsafe_vector<uint8_t > prev_value (max_len);
70+ idx_t prev_len = 0 ;
71+
72+ auto output = plain_data->ptr ;
73+ for (idx_t i = 0 ; i < prefix_count; i++) {
74+ auto prefix_len = prefix_data[i];
75+ auto suffix_len = suffix_data[i];
76+ auto value_len = prefix_len + suffix_len;
77+
78+ if (prefix_len > prev_len) {
79+ throw std::runtime_error (" DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?" );
7580 }
76- if (delta_offset >= byte_array_count) {
77- throw IOException ( " DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
78- " read of %d from %d entries) - corrupt file? " ,
79- delta_offset + 1 , byte_array_count );
81+
82+ if (!is_fixed_len) {
83+ Store< uint32_t >( static_cast < uint32_t >(value_len), output);
84+ output += sizeof ( uint32_t );
8085 }
81- result_ptr[row_idx + result_offset] = string_data[delta_offset++];
86+
87+ memcpy (output, prev_value.data (), prefix_len);
88+ block.available (suffix_len);
89+ memcpy (output + prefix_len, block.ptr , suffix_len);
90+ block.inc (suffix_len);
91+
92+ memcpy (prev_value.data (), output, value_len);
93+ prev_len = value_len;
94+ output += value_len;
8295 }
83- StringVector::AddHeapReference (result, *byte_array_data);
96+ }
97+
98+ void DeltaByteArrayDecoder::Read (uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
99+ reader.Plain (plain_data, defines, read_count, result_offset, result);
84100}
85101
86102void DeltaByteArrayDecoder::Skip (uint8_t *defines, idx_t skip_count) {
87- if (!byte_array_data) {
88- throw std::runtime_error (" Internal error - DeltaByteArray called but there was no byte_array_data set" );
89- }
90- for (idx_t row_idx = 0 ; row_idx < skip_count; row_idx++) {
91- if (defines && defines[row_idx] != reader.MaxDefine ()) {
92- continue ;
93- }
94- if (delta_offset >= byte_array_count) {
95- throw IOException (" DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
96- " read of %d from %d entries) - corrupt file?" ,
97- delta_offset + 1 , byte_array_count);
98- }
99- delta_offset++;
100- }
103+ reader.PlainSkip (*plain_data, defines, skip_count);
101104}
102105
103106} // namespace duckdb
0 commit comments