Skip to content

Commit 93f4c34

Browse files
ntjohnson1 and claude authored
Add docstring examples for Aggregate basic and bitwise/boolean functions (#1416)
* Add docstring examples for Aggregate basic and bitwise/boolean functions

  Add example usage to docstrings for Aggregate basic and bitwise/boolean functions to improve documentation.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add tighter bound on approx_distinct for small sizes

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e524121 commit 93f4c34

File tree

1 file changed

+141
-0
lines changed

1 file changed

+141
-0
lines changed

python/datafusion/functions.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2370,6 +2370,15 @@ def approx_distinct(
23702370
Args:
23712371
expression: Values to check for distinct entries
23722372
filter: If provided, only compute against rows for which the filter is True
2373+
2374+
Examples:
2375+
---------
2376+
>>> ctx = dfn.SessionContext()
2377+
>>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
2378+
>>> result = df.aggregate(
2379+
... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")])
2380+
>>> result.collect_column("v")[0].as_py() == 3
2381+
True
23732382
"""
23742383
filter_raw = filter.expr if filter is not None else None
23752384

@@ -2388,6 +2397,15 @@ def approx_median(expression: Expr, filter: Expr | None = None) -> Expr:
23882397
Args:
23892398
expression: Values to find the median for
23902399
filter: If provided, only compute against rows for which the filter is True
2400+
2401+
Examples:
2402+
---------
2403+
>>> ctx = dfn.SessionContext()
2404+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2405+
>>> result = df.aggregate(
2406+
... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")])
2407+
>>> result.collect_column("v")[0].as_py()
2408+
2.0
23912409
"""
23922410
filter_raw = filter.expr if filter is not None else None
23932411
return Expr(f.approx_median(expression.expr, filter=filter_raw))
@@ -2419,6 +2437,15 @@ def approx_percentile_cont(
24192437
percentile: This must be between 0.0 and 1.0, inclusive
24202438
num_centroids: Max bin size for the t-digest algorithm
24212439
filter: If provided, only compute against rows for which the filter is True
2440+
2441+
Examples:
2442+
---------
2443+
>>> ctx = dfn.SessionContext()
2444+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
2445+
>>> result = df.aggregate(
2446+
... [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")])
2447+
>>> result.collect_column("v")[0].as_py()
2448+
3.0
24222449
"""
24232450
sort_expr_raw = sort_or_default(sort_expression)
24242451
filter_raw = filter.expr if filter is not None else None
@@ -2451,6 +2478,15 @@ def approx_percentile_cont_with_weight(
24512478
num_centroids: Max bin size for the t-digest algorithm
24522479
filter: If provided, only compute against rows for which the filter is True
24532480
2481+
Examples:
2482+
---------
2483+
>>> ctx = dfn.SessionContext()
2484+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
2485+
>>> result = df.aggregate(
2486+
... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"),
2487+
... dfn.col("w"), 0.5).alias("v")])
2488+
>>> result.collect_column("v")[0].as_py()
2489+
2.0
24542490
"""
24552491
sort_expr_raw = sort_or_default(sort_expression)
24562492
filter_raw = filter.expr if filter is not None else None
@@ -2514,6 +2550,14 @@ def avg(
25142550
Args:
25152551
expression: Values to compute the average of
25162552
filter: If provided, only compute against rows for which the filter is True
2553+
2554+
Examples:
2555+
---------
2556+
>>> ctx = dfn.SessionContext()
2557+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2558+
>>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")])
2559+
>>> result.collect_column("v")[0].as_py()
2560+
2.0
25172561
"""
25182562
filter_raw = filter.expr if filter is not None else None
25192563
return Expr(f.avg(expression.expr, filter=filter_raw))
@@ -2552,6 +2596,14 @@ def count(
25522596
expressions: Argument to count
25532597
distinct: If True, a single entry for each distinct value will be in the result
25542598
filter: If provided, only compute against rows for which the filter is True
2599+
2600+
Examples:
2601+
---------
2602+
>>> ctx = dfn.SessionContext()
2603+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2604+
>>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")])
2605+
>>> result.collect_column("v")[0].as_py()
2606+
3
25552607
"""
25562608
filter_raw = filter.expr if filter is not None else None
25572609

@@ -2616,6 +2668,14 @@ def max(expression: Expr, filter: Expr | None = None) -> Expr:
26162668
Args:
26172669
expression: The value to find the maximum of
26182670
filter: If provided, only compute against rows for which the filter is True
2671+
2672+
Examples:
2673+
---------
2674+
>>> ctx = dfn.SessionContext()
2675+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2676+
>>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")])
2677+
>>> result.collect_column("v")[0].as_py()
2678+
3
26192679
"""
26202680
filter_raw = filter.expr if filter is not None else None
26212681
return Expr(f.max(expression.expr, filter=filter_raw))
@@ -2625,6 +2685,14 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr:
26252685
"""Returns the average (mean) value of the argument.
26262686
26272687
This is an alias for :py:func:`avg`.
2688+
2689+
Examples:
2690+
---------
2691+
>>> ctx = dfn.SessionContext()
2692+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2693+
>>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")])
2694+
>>> result.collect_column("v")[0].as_py()
2695+
2.0
26282696
"""
26292697
return avg(expression, filter)
26302698

@@ -2644,6 +2712,14 @@ def median(
26442712
expression: The value to compute the median of
26452713
distinct: If True, a single entry for each distinct value will be in the result
26462714
filter: If provided, only compute against rows for which the filter is True
2715+
2716+
Examples:
2717+
---------
2718+
>>> ctx = dfn.SessionContext()
2719+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
2720+
>>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")])
2721+
>>> result.collect_column("v")[0].as_py()
2722+
2.0
26472723
"""
26482724
filter_raw = filter.expr if filter is not None else None
26492725
return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw))
@@ -2658,6 +2734,14 @@ def min(expression: Expr, filter: Expr | None = None) -> Expr:
26582734
Args:
26592735
expression: The value to find the minimum of
26602736
filter: If provided, only compute against rows for which the filter is True
2737+
2738+
Examples:
2739+
---------
2740+
>>> ctx = dfn.SessionContext()
2741+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2742+
>>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")])
2743+
>>> result.collect_column("v")[0].as_py()
2744+
1
26612745
"""
26622746
filter_raw = filter.expr if filter is not None else None
26632747
return Expr(f.min(expression.expr, filter=filter_raw))
@@ -2677,6 +2761,14 @@ def sum(
26772761
Args:
26782762
expression: Values to compute the sum of
26792763
filter: If provided, only compute against rows for which the filter is True
2764+
2765+
Examples:
2766+
---------
2767+
>>> ctx = dfn.SessionContext()
2768+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
2769+
>>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")])
2770+
>>> result.collect_column("v")[0].as_py()
2771+
6
26802772
"""
26812773
filter_raw = filter.expr if filter is not None else None
26822774
return Expr(f.sum(expression.expr, filter=filter_raw))
@@ -3094,6 +3186,14 @@ def bit_and(expression: Expr, filter: Expr | None = None) -> Expr:
30943186
Args:
30953187
expression: Argument to perform bitwise calculation on
30963188
filter: If provided, only compute against rows for which the filter is True
3189+
3190+
Examples:
3191+
---------
3192+
>>> ctx = dfn.SessionContext()
3193+
>>> df = ctx.from_pydict({"a": [7, 3]})
3194+
>>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")])
3195+
>>> result.collect_column("v")[0].as_py()
3196+
3
30973197
"""
30983198
filter_raw = filter.expr if filter is not None else None
30993199
return Expr(f.bit_and(expression.expr, filter=filter_raw))
@@ -3110,6 +3210,14 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr:
31103210
Args:
31113211
expression: Argument to perform bitwise calculation on
31123212
filter: If provided, only compute against rows for which the filter is True
3213+
3214+
Examples:
3215+
---------
3216+
>>> ctx = dfn.SessionContext()
3217+
>>> df = ctx.from_pydict({"a": [1, 2]})
3218+
>>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")])
3219+
>>> result.collect_column("v")[0].as_py()
3220+
3
31133221
"""
31143222
filter_raw = filter.expr if filter is not None else None
31153223
return Expr(f.bit_or(expression.expr, filter=filter_raw))
@@ -3129,6 +3237,14 @@ def bit_xor(
31293237
expression: Argument to perform bitwise calculation on
31303238
distinct: If True, evaluate each unique value of expression only once
31313239
filter: If provided, only compute against rows for which the filter is True
3240+
3241+
Examples:
3242+
---------
3243+
>>> ctx = dfn.SessionContext()
3244+
>>> df = ctx.from_pydict({"a": [5, 3]})
3245+
>>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")])
3246+
>>> result.collect_column("v")[0].as_py()
3247+
6
31323248
"""
31333249
filter_raw = filter.expr if filter is not None else None
31343250
return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw))
@@ -3146,6 +3262,14 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr:
31463262
Args:
31473263
expression: Argument to perform calculation on
31483264
filter: If provided, only compute against rows for which the filter is True
3265+
3266+
Examples:
3267+
---------
3268+
>>> ctx = dfn.SessionContext()
3269+
>>> df = ctx.from_pydict({"a": [True, True, False]})
3270+
>>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")])
3271+
>>> result.collect_column("v")[0].as_py()
3272+
False
31493273
"""
31503274
filter_raw = filter.expr if filter is not None else None
31513275
return Expr(f.bool_and(expression.expr, filter=filter_raw))
@@ -3163,6 +3287,14 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr:
31633287
Args:
31643288
expression: Argument to perform calculation on
31653289
filter: If provided, only compute against rows for which the filter is True
3290+
3291+
Examples:
3292+
---------
3293+
>>> ctx = dfn.SessionContext()
3294+
>>> df = ctx.from_pydict({"a": [False, False, True]})
3295+
>>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")])
3296+
>>> result.collect_column("v")[0].as_py()
3297+
True
31663298
"""
31673299
filter_raw = filter.expr if filter is not None else None
31683300
return Expr(f.bool_or(expression.expr, filter=filter_raw))
@@ -3553,6 +3685,15 @@ def string_agg(
35533685
For example::
35543686
35553687
df.aggregate([], string_agg(col("a"), ",", order_by="b"))
3688+
3689+
Examples:
3690+
---------
3691+
>>> ctx = dfn.SessionContext()
3692+
>>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
3693+
>>> result = df.aggregate(
3694+
... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")])
3695+
>>> result.collect_column("s")[0].as_py()
3696+
'x,y,z'
35563697
"""
35573698
order_by_raw = sort_list_to_raw_sort_list(order_by)
35583699
filter_raw = filter.expr if filter is not None else None

0 commit comments

Comments
 (0)