Skip to content

Commit 16e352f

Browse files
duckdblabs-botgithub-actions[bot]
authored andcommitted
Update vendored DuckDB sources to 4b7a6b7bd0
1 parent b521b12 commit 16e352f

23 files changed

Lines changed: 549 additions & 177 deletions

File tree

src/duckdb/extension/parquet/decoder/delta_byte_array_decoder.cpp

Lines changed: 58 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &
2020
}
2121

2222
void DeltaByteArrayDecoder::InitializePage() {
23-
if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
24-
throw std::runtime_error("Delta Byte Array encoding is only supported for string/blob data");
25-
}
2623
auto &block = *reader.block;
2724
auto &allocator = reader.reader.allocator;
2825
idx_t prefix_count, suffix_count;
@@ -33,71 +30,77 @@ void DeltaByteArrayDecoder::InitializePage() {
3330
if (prefix_count != suffix_count) {
3431
throw std::runtime_error("DELTA_BYTE_ARRAY - prefix and suffix counts are different - corrupt file?");
3532
}
33+
34+
auto prefix_data = reinterpret_cast<uint32_t *>(prefix_buffer.ptr);
35+
auto suffix_data = reinterpret_cast<uint32_t *>(suffix_buffer.ptr);
36+
37+
// Allocate the plain data buffer
38+
if (!plain_data) {
39+
plain_data = make_shared_ptr<ResizeableBuffer>();
40+
}
41+
plain_data->reset();
42+
3643
if (prefix_count == 0) {
37-
// no values
38-
byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, nullptr);
44+
plain_data->resize(allocator, 0);
3945
return;
4046
}
41-
auto prefix_data = reinterpret_cast<uint32_t *>(prefix_buffer.ptr);
42-
auto suffix_data = reinterpret_cast<uint32_t *>(suffix_buffer.ptr);
43-
byte_array_data = make_uniq<Vector>(LogicalType::VARCHAR, prefix_count);
44-
byte_array_count = prefix_count;
45-
delta_offset = 0;
46-
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
47+
48+
// Decode DELTA_BYTE_ARRAY into plain Parquet page format
49+
// Plain format for BYTE_ARRAY: [4-byte length][data] repeated
50+
// Plain format for FIXED_LEN_BYTE_ARRAY: [data] repeated (no length prefix)
51+
auto &schema = reader.Schema();
52+
bool is_fixed_len = (schema.parquet_type == duckdb_parquet::Type::FIXED_LEN_BYTE_ARRAY);
53+
idx_t fixed_len = is_fixed_len ? schema.type_length : 0;
54+
55+
// Calculate total buffer size and max value length in one pass
56+
idx_t total_size = 0;
57+
idx_t max_len = 0;
4758
for (idx_t i = 0; i < prefix_count; i++) {
48-
auto str_len = prefix_data[i] + suffix_data[i];
49-
block.available(suffix_data[i]);
50-
string_data[i] = StringVector::EmptyString(*byte_array_data, str_len);
51-
auto result_data = string_data[i].GetDataWriteable();
52-
if (prefix_data[i] > 0) {
53-
if (i == 0 || prefix_data[i] > string_data[i - 1].GetSize()) {
54-
throw std::runtime_error("DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?");
55-
}
56-
memcpy(result_data, string_data[i - 1].GetData(), prefix_data[i]);
59+
idx_t len = prefix_data[i] + suffix_data[i];
60+
if (is_fixed_len && len != fixed_len) {
61+
throw std::runtime_error(
62+
"DELTA_BYTE_ARRAY on FIXED_LEN_BYTE_ARRAY: decoded length does not match type length");
5763
}
58-
memcpy(result_data + prefix_data[i], block.ptr, suffix_data[i]);
59-
block.inc(suffix_data[i]);
60-
string_data[i].Finalize();
64+
total_size += len + (is_fixed_len ? 0 : sizeof(uint32_t));
65+
max_len = MaxValue(max_len, len);
6166
}
62-
}
6367

64-
void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
65-
if (!byte_array_data) {
66-
throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
67-
}
68-
auto result_ptr = FlatVector::GetData<string_t>(result);
69-
auto &result_mask = FlatVector::Validity(result);
70-
auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
71-
for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
72-
if (defines && defines[row_idx + result_offset] != reader.MaxDefine()) {
73-
result_mask.SetInvalid(row_idx + result_offset);
74-
continue;
68+
plain_data->resize(allocator, total_size);
69+
unsafe_vector<uint8_t> prev_value(max_len);
70+
idx_t prev_len = 0;
71+
72+
auto output = plain_data->ptr;
73+
for (idx_t i = 0; i < prefix_count; i++) {
74+
auto prefix_len = prefix_data[i];
75+
auto suffix_len = suffix_data[i];
76+
auto value_len = prefix_len + suffix_len;
77+
78+
if (prefix_len > prev_len) {
79+
throw std::runtime_error("DELTA_BYTE_ARRAY - prefix is out of range - corrupt file?");
7580
}
76-
if (delta_offset >= byte_array_count) {
77-
throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
78-
"read of %d from %d entries) - corrupt file?",
79-
delta_offset + 1, byte_array_count);
81+
82+
if (!is_fixed_len) {
83+
Store<uint32_t>(static_cast<uint32_t>(value_len), output);
84+
output += sizeof(uint32_t);
8085
}
81-
result_ptr[row_idx + result_offset] = string_data[delta_offset++];
86+
87+
memcpy(output, prev_value.data(), prefix_len);
88+
block.available(suffix_len);
89+
memcpy(output + prefix_len, block.ptr, suffix_len);
90+
block.inc(suffix_len);
91+
92+
memcpy(prev_value.data(), output, value_len);
93+
prev_len = value_len;
94+
output += value_len;
8295
}
83-
StringVector::AddHeapReference(result, *byte_array_data);
96+
}
97+
98+
void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
99+
reader.Plain(plain_data, defines, read_count, result_offset, result);
84100
}
85101

86102
void DeltaByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
87-
if (!byte_array_data) {
88-
throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
89-
}
90-
for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
91-
if (defines && defines[row_idx] != reader.MaxDefine()) {
92-
continue;
93-
}
94-
if (delta_offset >= byte_array_count) {
95-
throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
96-
"read of %d from %d entries) - corrupt file?",
97-
delta_offset + 1, byte_array_count);
98-
}
99-
delta_offset++;
100-
}
103+
reader.PlainSkip(*plain_data, defines, skip_count);
101104
}
102105

103106
} // namespace duckdb

src/duckdb/extension/parquet/include/decoder/delta_byte_array_decoder.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ class DeltaByteArrayDecoder {
3030

3131
private:
3232
ColumnReader &reader;
33-
unique_ptr<Vector> byte_array_data;
34-
idx_t byte_array_count = 0;
35-
idx_t delta_offset = 0;
33+
34+
//! Decoded data in plain Parquet page format
35+
shared_ptr<ResizeableBuffer> plain_data;
3636
};
3737

3838
} // namespace duckdb

src/duckdb/src/common/enum_util.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@
133133
#include "duckdb/main/extension.hpp"
134134
#include "duckdb/main/extension_helper.hpp"
135135
#include "duckdb/main/extension_install_info.hpp"
136+
#include "duckdb/main/profiling_info.hpp"
136137
#include "duckdb/main/query_parameters.hpp"
137138
#include "duckdb/main/query_profiler.hpp"
138139
#include "duckdb/main/query_result.hpp"
@@ -3806,6 +3807,27 @@ ProfilingCoverage EnumUtil::FromString<ProfilingCoverage>(const char *value) {
38063807
return static_cast<ProfilingCoverage>(StringUtil::StringToEnum(GetProfilingCoverageValues(), 2, "ProfilingCoverage", value));
38073808
}
38083809

3810+
const StringUtil::EnumStringLiteral *GetProfilingParameterNamesValues() {
3811+
static constexpr StringUtil::EnumStringLiteral values[] {
3812+
{ static_cast<uint32_t>(ProfilingParameterNames::FORMAT), "FORMAT" },
3813+
{ static_cast<uint32_t>(ProfilingParameterNames::COVERAGE), "COVERAGE" },
3814+
{ static_cast<uint32_t>(ProfilingParameterNames::SAVE_LOCATION), "SAVE_LOCATION" },
3815+
{ static_cast<uint32_t>(ProfilingParameterNames::MODE), "MODE" },
3816+
{ static_cast<uint32_t>(ProfilingParameterNames::METRICS), "METRICS" }
3817+
};
3818+
return values;
3819+
}
3820+
3821+
template<>
3822+
const char* EnumUtil::ToChars<ProfilingParameterNames>(ProfilingParameterNames value) {
3823+
return StringUtil::EnumToString(GetProfilingParameterNamesValues(), 5, "ProfilingParameterNames", static_cast<uint32_t>(value));
3824+
}
3825+
3826+
template<>
3827+
ProfilingParameterNames EnumUtil::FromString<ProfilingParameterNames>(const char *value) {
3828+
return static_cast<ProfilingParameterNames>(StringUtil::StringToEnum(GetProfilingParameterNamesValues(), 5, "ProfilingParameterNames", value));
3829+
}
3830+
38093831
const StringUtil::EnumStringLiteral *GetPushdownExtractSupportValues() {
38103832
static constexpr StringUtil::EnumStringLiteral values[] {
38113833
{ static_cast<uint32_t>(PushdownExtractSupport::UNCHECKED), "UNCHECKED" },

src/duckdb/src/common/file_system.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ bool FileSystem::IsPathAbsolute(const string &path) {
228228

229229
string FileSystem::NormalizeAbsolutePath(const string &path) {
230230
D_ASSERT(IsPathAbsolute(path));
231-
auto result = StringUtil::Lower(FileSystem::ConvertSeparators(path));
231+
auto result = FileSystem::ConvertSeparators(path);
232232
if (StartsWithSingleBackslash(result)) {
233233
// Path starts with a single backslash or forward slash
234234
// prepend drive letter

src/duckdb/src/execution/expression_executor/execute_operator.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,18 @@ void ExpressionExecutor::Execute(const BoundOperatorExpression &expr, Expression
113113
}
114114
} else if (expression_type == ExpressionType::OPERATOR_TRY) {
115115
auto &child_state = *state->child_states[0];
116+
Vector try_result(result.GetType());
116117
try {
117-
Execute(*expr.children[0], &child_state, sel, count, result);
118+
Execute(*expr.children[0], &child_state, sel, count, try_result);
119+
if (try_result.GetVectorType() == VectorType::CONSTANT_VECTOR) {
120+
result.Reference(try_result);
121+
return;
122+
}
123+
if (sel) {
124+
VectorOperations::Copy(try_result, result, *sel, count, 0, 0, count);
125+
} else {
126+
VectorOperations::Copy(try_result, result, count, 0, 0);
127+
}
118128
return;
119129
} catch (std::exception &ex) {
120130
ErrorData error(ex);

src/duckdb/src/execution/physical_plan/plan_set_operation.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,15 @@ static vector<unique_ptr<Expression>> CreatePartitionedRowNumExpression(const ve
2323
return res;
2424
}
2525

26-
static JoinCondition CreateNotDistinctComparison(const LogicalType &type, idx_t i) {
26+
static JoinCondition CreateNotDistinctComparison(ClientContext &context, const LogicalType &type, idx_t i) {
2727
JoinCondition cond;
2828
cond.left = make_uniq<BoundReferenceExpression>(type, i);
2929
cond.right = make_uniq<BoundReferenceExpression>(type, i);
3030
cond.comparison = ExpressionType::COMPARE_NOT_DISTINCT_FROM;
31+
32+
ExpressionBinder::PushCollation(context, cond.left, type);
33+
ExpressionBinder::PushCollation(context, cond.right, type);
34+
3135
return cond;
3236
}
3337

@@ -59,7 +63,7 @@ PhysicalOperator &PhysicalPlanGenerator::CreatePlan(LogicalSetOperation &op) {
5963
vector<JoinCondition> conditions;
6064
// create equality condition for all columns
6165
for (idx_t i = 0; i < types.size(); i++) {
62-
conditions.push_back(CreateNotDistinctComparison(types[i], i));
66+
conditions.push_back(CreateNotDistinctComparison(context, types[i], i));
6367
}
6468
// For EXCEPT ALL / INTERSECT ALL we push a window operator with a ROW_NUMBER into the scans and join to get bag
6569
// semantics.
@@ -80,7 +84,7 @@ PhysicalOperator &PhysicalPlanGenerator::CreatePlan(LogicalSetOperation &op) {
8084
right = right_window;
8185

8286
// add window expression result to join condition
83-
conditions.push_back(CreateNotDistinctComparison(LogicalType::BIGINT, types.size()));
87+
conditions.push_back(CreateNotDistinctComparison(context, LogicalType::BIGINT, types.size()));
8488
// join (created below) now includes the row number result column
8589
op.types.push_back(LogicalType::BIGINT);
8690
}

0 commit comments

Comments
 (0)