Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/duckdb/extension/json/include/json_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,9 @@ class JSONReader : public BaseFileReader {

void Initialize(Allocator &allocator, idx_t buffer_size);
bool InitializeScan(JSONReaderScanState &state, JSONFileReadType file_read_type);
void ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
bool ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
const idx_t remaining);
void ParseNextChunk(JSONReaderScanState &scan_state);
bool ParseNextChunk(JSONReaderScanState &scan_state);
idx_t Scan(JSONReaderScanState &scan_state);
bool ReadNextBuffer(JSONReaderScanState &scan_state);
bool PrepareBufferForRead(JSONReaderScanState &scan_state);
Expand Down
25 changes: 16 additions & 9 deletions src/duckdb/extension/json/json_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(char *const bu
return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
}

void JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
bool JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_start, const idx_t json_size,
const idx_t remaining) {
yyjson_doc *doc;
yyjson_read_err err;
Expand All @@ -640,7 +640,7 @@ void JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_sta
}
if (!can_ignore_this_error) {
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, extra);
return;
return false;
}
}

Expand All @@ -652,7 +652,7 @@ void JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_sta
err.msg = "unexpected end of data";
err.pos = json_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format");
return;
return false;
} else if (!options.ignore_errors && read_size < json_size) {
idx_t off = read_size;
idx_t rem = json_size;
Expand All @@ -662,20 +662,21 @@ void JSONReader::ParseJSON(JSONReaderScanState &scan_state, char *const json_sta
err.msg = "unexpected content after document";
err.pos = read_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err, "Try auto-detecting the JSON format");
return;
return false;
}
}

scan_state.lines_or_objects_in_buffer++;
if (!doc) {
scan_state.values[scan_state.scan_count] = nullptr;
return;
return true;
}

// Set the JSONLine and trim
scan_state.units[scan_state.scan_count] = JSONString(json_start, json_size);
TrimWhitespace(scan_state.units[scan_state.scan_count]);
scan_state.values[scan_state.scan_count] = doc->root;
return true;
}

void JSONReader::AutoDetect(Allocator &allocator, idx_t buffer_capacity) {
Expand Down Expand Up @@ -762,7 +763,7 @@ bool JSONReader::CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state
return true;
}

void JSONReader::ParseNextChunk(JSONReaderScanState &scan_state) {
bool JSONReader::ParseNextChunk(JSONReaderScanState &scan_state) {
const auto format = GetFormat();
auto &buffer_ptr = scan_state.buffer_ptr;
auto &buffer_offset = scan_state.buffer_offset;
Expand Down Expand Up @@ -796,7 +797,9 @@ void JSONReader::ParseNextChunk(JSONReaderScanState &scan_state) {
}

idx_t json_size = json_end - json_start;
ParseJSON(scan_state, json_start, json_size, remaining);
if (!ParseJSON(scan_state, json_start, json_size, remaining)) {
return false;
}
buffer_offset += json_size;

if (format == JSONFormat::ARRAY) {
Expand All @@ -809,11 +812,12 @@ void JSONReader::ParseNextChunk(JSONReaderScanState &scan_state) {
err.msg = "unexpected character";
err.pos = json_size;
AddParseError(scan_state, scan_state.lines_or_objects_in_buffer, err);
return;
return false;
}
}
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
}
return true;
}

void JSONReader::Initialize(Allocator &allocator, idx_t buffer_size) {
Expand Down Expand Up @@ -868,7 +872,10 @@ idx_t JSONReader::Scan(JSONReaderScanState &scan_state) {
return 0;
}
}
ParseNextChunk(scan_state);
if (!ParseNextChunk(scan_state)) {
// found an error but we can't handle it - return
return 0;
}
}
return scan_state.scan_count;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ class StringColumnReader : public ColumnReader {
const StringColumnType string_column_type;

public:
static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
static bool IsValid(const char *str_data, uint32_t str_len, bool is_varchar);
static bool IsValid(const string &str, bool is_varchar);
static void VerifyString(const char *str_data, uint32_t str_len, bool is_varchar);
void VerifyString(const char *str_data, uint32_t str_len) const;

static void ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block);
Expand Down
12 changes: 4 additions & 8 deletions src/duckdb/extension/parquet/parquet_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,18 +395,14 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
break;
case LogicalTypeId::VARCHAR: {
auto string_stats = StringStats::CreateUnknown(type);
if (parquet_stats.__isset.min_value) {
StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
if (parquet_stats.__isset.min_value && StringColumnReader::IsValid(parquet_stats.min_value, true)) {
StringStats::SetMin(string_stats, parquet_stats.min_value);
} else if (parquet_stats.__isset.min) {
StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
} else if (parquet_stats.__isset.min && StringColumnReader::IsValid(parquet_stats.min, true)) {
StringStats::SetMin(string_stats, parquet_stats.min);
}
if (parquet_stats.__isset.max_value) {
StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
if (parquet_stats.__isset.max_value && StringColumnReader::IsValid(parquet_stats.max_value, true)) {
StringStats::SetMax(string_stats, parquet_stats.max_value);
} else if (parquet_stats.__isset.max) {
StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
} else if (parquet_stats.__isset.max && StringColumnReader::IsValid(parquet_stats.max, true)) {
StringStats::SetMax(string_stats, parquet_stats.max);
}
row_group_stats = string_stats.ToUnique();
Expand Down
13 changes: 10 additions & 3 deletions src/duckdb/extension/parquet/reader/string_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,23 @@ StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColum
}
}

void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
bool StringColumnReader::IsValid(const char *str_data, uint32_t str_len, const bool is_varchar) {
if (!is_varchar) {
return;
return true;
}
// verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
// technically Parquet should guarantee this, but reality is often disappointing
UnicodeInvalidReason reason;
size_t pos;
auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
if (utf_type == UnicodeType::INVALID) {
return utf_type != UnicodeType::INVALID;
}

// Convenience overload: validate a std::string as UTF-8 (only when is_varchar).
// NOTE(review): the char* overload takes a uint32_t length, so str.size() (size_t)
// is narrowed here; make that narrowing explicit. Values >= 4GiB would truncate,
// which is not a realistic concern for Parquet statistics strings.
bool StringColumnReader::IsValid(const string &str, bool is_varchar) {
	return IsValid(str.c_str(), static_cast<uint32_t>(str.size()), is_varchar);
}
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
if (!IsValid(str_data, str_len, is_varchar)) {
throw InvalidInputException("Invalid string encoding found in Parquet file: value \"%s\" is not valid UTF8!",
Blob::ToString(string_t(str_data, str_len)));
}
Expand Down
19 changes: 13 additions & 6 deletions src/duckdb/src/catalog/catalog_search_path.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,15 @@ void CatalogSearchPath::Set(CatalogSearchEntry new_value, CatalogSetPathType set
Set(std::move(new_paths), set_type);
}

const vector<CatalogSearchEntry> &CatalogSearchPath::Get() const {
return paths;
// Returns a copy of the search path with catalog-only entries (empty schema)
// filtered out, so callers only ever see fully-qualified catalog.schema entries.
vector<CatalogSearchEntry> CatalogSearchPath::Get() const {
	vector<CatalogSearchEntry> res;
	// reserve up front: at most every entry survives the filter, and this avoids
	// repeated reallocation while pushing (clang-tidy performance-inefficient-vector-operation)
	res.reserve(paths.size());
	for (auto &path : paths) {
		if (path.schema.empty()) {
			// catalog-only entry - excluded from the externally visible path
			continue;
		}
		res.emplace_back(path);
	}
	return res;
}

string CatalogSearchPath::GetDefaultSchema(const string &catalog) const {
Expand Down Expand Up @@ -250,7 +257,7 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) con
catalogs.push_back(SYSTEM_CATALOG);
} else {
for (auto &path : paths) {
if (StringUtil::CIEquals(path.schema, schema)) {
if (StringUtil::CIEquals(path.schema, schema) || path.schema.empty()) {
catalogs.push_back(path.catalog);
}
}
Expand All @@ -261,24 +268,24 @@ vector<string> CatalogSearchPath::GetCatalogsForSchema(const string &schema) con
vector<string> CatalogSearchPath::GetSchemasForCatalog(const string &catalog) const {
vector<string> schemas;
for (auto &path : paths) {
if (StringUtil::CIEquals(path.catalog, catalog)) {
if (!path.schema.empty() && StringUtil::CIEquals(path.catalog, catalog)) {
schemas.push_back(path.schema);
}
}
return schemas;
}

// Returns the default search-path entry: the second entry with a non-empty
// schema (index 0 is always the temp catalog inserted by SetPathsInternal).
//
// BUG FIX: Get() now returns the filtered vector BY VALUE. The previous code
// did `const auto &paths = Get(); return paths[1];`, returning a reference
// into a temporary vector that is destroyed when this function returns - a
// dangling reference for every caller. Instead, walk the long-lived member
// vector and return a reference to the element that Get()[1] denotes.
const CatalogSearchEntry &CatalogSearchPath::GetDefault() const {
	int valid_entries = 0;
	for (auto &path : paths) {
		if (path.schema.empty()) {
			continue;
		}
		if (++valid_entries == 2) {
			// same element as Get()[1], but owned by this object
			return path;
		}
	}
	// SetPathsInternal always installs at least two entries with schemas
	// (temp + main), so this should be unreachable.
	D_ASSERT(false);
	return paths.back();
}

void CatalogSearchPath::SetPathsInternal(vector<CatalogSearchEntry> new_paths) {
this->set_paths = std::move(new_paths);

paths.clear();
paths.reserve(set_paths.size() + 3);
paths.reserve(set_paths.size() + 4);
paths.emplace_back(TEMP_CATALOG, DEFAULT_SCHEMA);
for (auto &path : set_paths) {
paths.push_back(path);
Expand Down
Loading