Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions parquet-variant/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,15 @@ use std::{borrow::Cow, ops::Deref};
/// assert_eq!(path[1], VariantPathElement::field("bar"));
/// ```
///
/// # Example: Accessing filed with bracket
/// # Example: Accessing field with bracket
/// ```
/// # use parquet_variant::{VariantPath, VariantPathElement};
/// let path = VariantPath::try_from("a[b.c].d[2]").unwrap();
/// let path = VariantPath::try_from("a['b.c'].d[2]['3']").unwrap();
/// let expected = VariantPath::from_iter([VariantPathElement::field("a"),
/// VariantPathElement::field("b.c"),
/// VariantPathElement::field("d"),
/// VariantPathElement::index(2)]);
/// VariantPathElement::index(2),
/// VariantPathElement::field("3")]);
/// assert_eq!(path, expected)
#[derive(Debug, Clone, PartialEq, Default)]
pub struct VariantPath<'a>(Vec<VariantPathElement<'a>>);
Expand Down Expand Up @@ -287,11 +288,22 @@ mod tests {
assert_eq!(path, expected);

// invalid index will be treated as field
let path = VariantPath::try_from("foo.bar[abc]").unwrap();
let path = VariantPath::try_from("foo.bar['abc'][\"def\"]").unwrap();
let expected = VariantPath::from_iter([
VariantPathElement::field("foo"),
VariantPathElement::field("bar"),
VariantPathElement::field("abc"),
VariantPathElement::field("def"),
]);
assert_eq!(path, expected);

// a number quoted with `'` is treated as field, not index
let path = VariantPath::try_from("foo['0'].bar[\"1\"]").unwrap();
let expected = VariantPath::from_iter([
VariantPathElement::field("foo"),
VariantPathElement::field("0"),
VariantPathElement::field("bar"),
VariantPathElement::field("1"),
]);
assert_eq!(path, expected);
}
Expand Down Expand Up @@ -321,5 +333,18 @@ mod tests {
// No '[' before ']'
let err = VariantPath::try_from("foo.bar]baz").unwrap_err();
assert_eq!(err.to_string(), "Parser error: Unexpected ']' at byte 7");

// Invalid number(without quote) parse
let err = VariantPath::try_from("foo.bar[123abc]").unwrap_err();
assert_eq!(
err.to_string(),
"Parser error: Invalid token in bracket request: `123abc`. Expected a quoted string or a number(e.g., `['field']` or `[123]`)"
);

let err = VariantPath::try_from("foo.bar[abc]").unwrap_err();
assert_eq!(
err.to_string(),
"Parser error: Invalid token in bracket request: `abc`. Expected a quoted string or a number(e.g., `['field']` or `[123]`)"
);
}
}
25 changes: 20 additions & 5 deletions parquet-variant/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,10 @@ pub(crate) fn fits_precision<const N: u32>(n: impl Into<i64>) -> bool {
/// - `"foo"` -> single field `foo`
/// - `"foo.bar"` -> nested fields `foo`, `bar`
/// - `"[1]"` -> array index 1
/// - `"['1']"` or `"["1"]"`-> field `1`
/// - `"foo[1].bar"` -> field `foo`, index 1, field `bar`
/// - `"[a.b]"` -> field `a.b` (dot is literal inside bracket)
/// - `"[a\\]b]"` -> field `a]b` (escaped `]`
/// - `"['a.b']"` -> field `a.b` (dot is literal inside bracket)
/// - `"['a\]b']"` -> field `a]b` (escaped `]`
/// - etc.
///
/// # Errors
Expand Down Expand Up @@ -267,9 +268,23 @@ fn parse_in_bracket(s: &str, i: usize) -> Result<(VariantPathElement<'_>, usize)
}
};

let element = match unescaped.parse() {
Ok(idx) => VariantPathElement::index(idx),
Err(_) => VariantPathElement::field(unescaped),
let element = if let Some(inner) = unescaped
.strip_prefix('\'')
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JSONpath allows either ' or " as the string delimiter. Do we want to support both? Or take the intersection between SQL and JSONpath and only support single quotes?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Followed the examples on databricks web previously, this is a valid json syntax, not don't add much more complexity, added " now.

.and_then(|s| s.strip_suffix('\''))
.or_else(|| {
unescaped
.strip_prefix('"')
.and_then(|s| s.strip_suffix('"'))
}) {
// Quoted field name, e.g., ['field'] or ['123'] or ["123"]
VariantPathElement::field(inner.to_string())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the string-escape rules here? Can the user embed a single quote as \'? as ''?
Or do they have to uri-encode it as %27? (do we even handle that?)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update: Here's the relevant grammar from JSONpath spec:
https://www.rfc-editor.org/rfc/rfc9535#name-selector

AI web search overview suggests the actual handling of delimiters and escapes tends to be vendor-specific (e.g. many database engines double up the delimiter as an escape)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah! This function already handles \' escape syntax, L247 above.

} else {
let Ok(idx) = unescaped.parse() else {
return Err(ArrowError::ParseError(format!(
"Invalid token in bracket request: `{unescaped}`. Expected a quoted string or a number(e.g., `['field']` or `[123]`)"
)));
};
VariantPathElement::index(idx)
};

Ok((element, end + 1))
Expand Down
Loading