Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,8 @@ case class ListAgg(
inputAggBufferOffset: Int = 0)
extends Collect[mutable.ArrayBuffer[Any]]
with SupportsOrderingWithinGroup
with ImplicitCastInputTypes {
with ImplicitCastInputTypes
with AliasHelper {

override def orderingFilled: Boolean = orderExpressions.nonEmpty

Expand Down Expand Up @@ -588,7 +589,8 @@ case class ListAgg(
if (someOrder.isEmpty) {
return true
}
if (someOrder.size == 1 && someOrder.head.child.semanticEquals(child)) {
if (someOrder.size == 1 &&
trimAliases(someOrder.head.child).semanticEquals(trimAliases(child))) {
Comment thread
mihailoale-db marked this conversation as resolved.
return true
}
false
Expand Down Expand Up @@ -679,7 +681,7 @@ case class ListAgg(
if (orderExpressions.size != 1) return OrderDeterminismResult.NonDeterministicMismatch
child match {
case Cast(castChild, castType, _, _)
if orderExpressions.head.child.semanticEquals(castChild) =>
if trimAliases(orderExpressions.head.child).semanticEquals(trimAliases(castChild)) =>
if (isCastEqualityPreserving(castChild.dataType) &&
isCastTargetEqualityPreserving(castType)) {
OrderDeterminismResult.Deterministic
Expand Down
103 changes: 103 additions & 0 deletions sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -702,3 +702,106 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) DESC NULLS LAST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS LAST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": "x"}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: +- OneRowRelation
+- Project [parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'))
-- !query analysis
Aggregate [listagg(distinct cast(variant_get(v#x, $.a.b, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a.b, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Project [parse_json({"a": {"b": "x"}}, true) AS v#x]
: : +- OneRowRelation
: +- Project [parse_json({"a": {"b": "y"}}, true) AS parse_json({"a": {"b": "y"}})#x]
: +- OneRowRelation
+- Project [parse_json({"a": {"b": "x"}}, true) AS parse_json({"a": {"b": "x"}})#x]
+- OneRowRelation


-- !query
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp
-- !query analysis
Aggregate [grp#x], [grp#x, listagg(distinct cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string), ,, cast(variant_get(v#x, $.a, VariantType, true, Some(America/Los_Angeles)) as string) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST)#x]
+- SubqueryAlias __auto_generated_subquery_name
+- Union false, false
:- Union false, false
: :- Union false, false
: : :- Union false, false
: : : :- Project [1 AS grp#x, parse_json({"a": "x"}, true) AS v#x]
: : : : +- OneRowRelation
: : : +- Project [1 AS 1#x, parse_json({"a": "y"}, true) AS parse_json({"a": "y"})#x]
: : : +- OneRowRelation
: : +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
: : +- OneRowRelation
: +- Project [2 AS 2#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
: +- OneRowRelation
+- Project [1 AS 1#x, parse_json({"a": "x"}, true) AS parse_json({"a": "x"})#x]
+- OneRowRelation


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE",
"sqlState" : "42K09",
"messageParameters" : {
"dataType" : "\"VARIANT\"",
"functionName" : "`sortorder`",
"sqlExpr" : "\"variant_get(v, $.a) ASC NULLS FIRST\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 66,
"stopIndex" : 68,
"fragment" : "v:a"
} ]
}
15 changes: 15 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/listagg.sql
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,18 @@ SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df;
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3 as double)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col);
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'), (TIMESTAMP'2024-01-01 10:00:00') AS t(col);

-- LISTAGG with semi-structured extract (parser wraps v:a in Alias with fresh ExprId)
-- Tests that isOrderCompatible strips Alias wrappers before comparing via semanticEquals
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
Comment thread
mihailoale-db marked this conversation as resolved.
-- Semi-structured extract without DISTINCT
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
-- Semi-structured extract with DESC ordering
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
-- Semi-structured extract with nested path
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'));
-- Semi-structured extract with GROUP BY
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp;
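-- Semi-structured extract: DISTINCT cast with non-equality-preserving order (variant)
-- Tests that checkOrderValueDeterminism strips Alias wrappers before comparing via semanticEquals
-- Negative test: the ORDER BY expression v:a is of type VARIANT, so analysis is expected to fail
-- with DATATYPE_MISMATCH.INVALID_ORDERING_TYPE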
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'));
65 changes: 65 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/listagg.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -563,3 +563,68 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"inputType" : "\"TIMESTAMP\""
}
}


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
x,y


-- !query
SELECT listagg(v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
x,x,y


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string DESC) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) DESC NULLS LAST):string>
-- !query output
y,x


-- !query
SELECT listagg(DISTINCT v:a.b::string, ',') WITHIN GROUP (ORDER BY v:a.b::string) FROM (SELECT parse_json('{"a": {"b": "x"}}') v UNION ALL SELECT parse_json('{"a": {"b": "y"}}') UNION ALL SELECT parse_json('{"a": {"b": "x"}}'))
-- !query schema
struct<listagg(DISTINCT CAST(variant_get(v, $.a.b) AS b AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a.b) AS b AS STRING) ASC NULLS FIRST):string>
-- !query output
x,y


-- !query
SELECT grp, listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a::string) FROM (SELECT 1 grp, parse_json('{"a": "x"}') v UNION ALL SELECT 1, parse_json('{"a": "y"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 2, parse_json('{"a": "x"}') UNION ALL SELECT 1, parse_json('{"a": "x"}')) GROUP BY grp
-- !query schema
struct<grp:int,listagg(DISTINCT CAST(variant_get(v, $.a) AS a AS STRING), ,) WITHIN GROUP (ORDER BY CAST(variant_get(v, $.a) AS a AS STRING) ASC NULLS FIRST):string>
-- !query output
1 x,y
2 x


-- !query
SELECT listagg(DISTINCT v:a::string, ',') WITHIN GROUP (ORDER BY v:a) FROM (SELECT parse_json('{"a": "x"}') v UNION ALL SELECT parse_json('{"a": "y"}') UNION ALL SELECT parse_json('{"a": "x"}'))
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE",
"sqlState" : "42K09",
"messageParameters" : {
"dataType" : "\"VARIANT\"",
"functionName" : "`sortorder`",
"sqlExpr" : "\"variant_get(v, $.a) ASC NULLS FIRST\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 66,
"stopIndex" : 68,
"fragment" : "v:a"
} ]
}