Skip to content

Commit 74b3221

Browse files
ntjohnson1 and claude authored
Add docstring examples for Aggregate statistical and regression functions (#1417)
* Add docstring examples for Aggregate statistical and regression functions

  Add example usage to docstrings for Aggregate statistical and regression functions to improve documentation.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Simplify covar

* Make sure everything is google doc style

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f01f30c commit 74b3221

File tree

1 file changed

+162
-11
lines changed

1 file changed

+162
-11
lines changed

python/datafusion/functions.py

Lines changed: 162 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -823,14 +823,11 @@ def cot(arg: Expr) -> Expr:
823823
>>> from math import pi
824824
>>> ctx = dfn.SessionContext()
825825
>>> df = ctx.from_pydict({"a": [pi / 4]})
826-
>>> import builtins
827826
>>> result = df.select(
828827
... dfn.functions.cot(dfn.col("a")).alias("cot")
829828
... )
830-
>>> builtins.round(
831-
... result.collect_column("cot")[0].as_py(), 1
832-
... )
833-
1.0
829+
>>> result.collect_column("cot")[0].as_py()
830+
1.0...
834831
"""
835832
return Expr(f.cot(arg.expr))
836833

@@ -1171,14 +1168,11 @@ def radians(arg: Expr) -> Expr:
11711168
>>> from math import pi
11721169
>>> ctx = dfn.SessionContext()
11731170
>>> df = ctx.from_pydict({"a": [180.0]})
1174-
>>> import builtins
11751171
>>> result = df.select(
11761172
... dfn.functions.radians(dfn.col("a")).alias("rad")
11771173
... )
1178-
>>> builtins.round(
1179-
... result.collect_column("rad")[0].as_py(), 6
1180-
... )
1181-
3.141593
1174+
>>> result.collect_column("rad")[0].as_py() == pi
1175+
True
11821176
"""
11831177
return Expr(f.radians(arg.expr))
11841178

@@ -2737,6 +2731,14 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
27372731
value_y: The dependent variable for correlation
27382732
value_x: The independent variable for correlation
27392733
filter: If provided, only compute against rows for which the filter is True
2734+
2735+
Examples:
2736+
>>> ctx = dfn.SessionContext()
2737+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
2738+
>>> result = df.aggregate(
2739+
... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")])
2740+
>>> result.collect_column("v")[0].as_py()
2741+
1.0
27402742
"""
27412743
filter_raw = filter.expr if filter is not None else None
27422744
return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2791,6 +2793,18 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
27912793
value_y: The dependent variable for covariance
27922794
value_x: The independent variable for covariance
27932795
filter: If provided, only compute against rows for which the filter is True
2796+
2797+
Examples:
2798+
>>> ctx = dfn.SessionContext()
2799+
>>> df = ctx.from_pydict({"a": [1.0, 5.0, 10.0], "b": [1.0, 2.0, 3.0]})
2800+
>>> result = df.aggregate(
2801+
... [],
2802+
... [dfn.functions.covar_pop(
2803+
... dfn.col("a"), dfn.col("b")
2804+
... ).alias("v")]
2805+
... )
2806+
>>> result.collect_column("v")[0].as_py()
2807+
3.0
27942808
"""
27952809
filter_raw = filter.expr if filter is not None else None
27962810
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2808,6 +2822,14 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
28082822
value_y: The dependent variable for covariance
28092823
value_x: The independent variable for covariance
28102824
filter: If provided, only compute against rows for which the filter is True
2825+
2826+
Examples:
2827+
>>> ctx = dfn.SessionContext()
2828+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
2829+
>>> result = df.aggregate(
2830+
... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")])
2831+
>>> result.collect_column("v")[0].as_py()
2832+
1.0
28112833
"""
28122834
filter_raw = filter.expr if filter is not None else None
28132835
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
@@ -2816,7 +2838,8 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
28162838
def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
28172839
"""Computes the sample covariance.
28182840
2819-
This is an alias for :py:func:`covar_samp`.
2841+
See Also:
2842+
This is an alias for :py:func:`covar_samp`.
28202843
"""
28212844
return covar_samp(value_y, value_x, filter)
28222845

@@ -2945,6 +2968,13 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr:
29452968
Args:
29462969
expression: The value to compute the standard deviation of
29472970
filter: If provided, only compute against rows for which the filter is True
2971+
2972+
Examples:
2973+
>>> ctx = dfn.SessionContext()
2974+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
2975+
>>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")])
2976+
>>> result.collect_column("v")[0].as_py()
2977+
2.0
29482978
"""
29492979
filter_raw = filter.expr if filter is not None else None
29502980
return Expr(f.stddev(expression.expr, filter=filter_raw))
@@ -2959,6 +2989,15 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr:
29592989
Args:
29602990
expression: The value to compute the population standard deviation of
29612991
filter: If provided, only compute against rows for which the filter is True
2992+
2993+
Examples:
2994+
>>> ctx = dfn.SessionContext()
2995+
>>> df = ctx.from_pydict({"a": [1.0, 3.0]})
2996+
>>> result = df.aggregate(
2997+
... [], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")]
2998+
... )
2999+
>>> result.collect_column("v")[0].as_py()
3000+
1.0
29623001
"""
29633002
filter_raw = filter.expr if filter is not None else None
29643003
return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
@@ -2968,6 +3007,15 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
29683007
"""Computes the sample standard deviation of the argument.
29693008
29703009
This is an alias for :py:func:`stddev`.
3010+
3011+
Examples:
3012+
>>> ctx = dfn.SessionContext()
3013+
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
3014+
>>> result = df.aggregate(
3015+
... [], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")]
3016+
... )
3017+
>>> result.collect_column("v")[0].as_py()
3018+
2.0
29713019
"""
29723020
return stddev(arg, filter=filter)
29733021

@@ -2976,6 +3024,13 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr:
29763024
"""Computes the sample variance of the argument.
29773025
29783026
This is an alias for :py:func:`var_samp`.
3027+
3028+
Examples:
3029+
>>> ctx = dfn.SessionContext()
3030+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
3031+
>>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")])
3032+
>>> result.collect_column("v")[0].as_py()
3033+
1.0
29793034
"""
29803035
return var_samp(expression, filter)
29813036

@@ -2989,6 +3044,13 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr:
29893044
Args:
29903045
expression: The variable to compute the variance for
29913046
filter: If provided, only compute against rows for which the filter is True
3047+
3048+
Examples:
3049+
>>> ctx = dfn.SessionContext()
3050+
>>> df = ctx.from_pydict({"a": [0.0, 2.0]})
3051+
>>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")])
3052+
>>> result.collect_column("v")[0].as_py()
3053+
1.0
29923054
"""
29933055
filter_raw = filter.expr if filter is not None else None
29943056
return Expr(f.var_pop(expression.expr, filter=filter_raw))
@@ -3003,6 +3065,13 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr:
30033065
Args:
30043066
expression: The variable to compute the variance for
30053067
filter: If provided, only compute against rows for which the filter is True
3068+
3069+
Examples:
3070+
>>> ctx = dfn.SessionContext()
3071+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
3072+
>>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")])
3073+
>>> result.collect_column("v")[0].as_py()
3074+
1.0
30063075
"""
30073076
filter_raw = filter.expr if filter is not None else None
30083077
return Expr(f.var_sample(expression.expr, filter=filter_raw))
@@ -3012,6 +3081,15 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
30123081
"""Computes the sample variance of the argument.
30133082
30143083
This is an alias for :py:func:`var_samp`.
3084+
3085+
Examples:
3086+
>>> ctx = dfn.SessionContext()
3087+
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
3088+
>>> result = df.aggregate(
3089+
... [], [dfn.functions.var_sample(dfn.col("a")).alias("v")]
3090+
... )
3091+
>>> result.collect_column("v")[0].as_py()
3092+
1.0
30153093
"""
30163094
return var_samp(expression, filter)
30173095

@@ -3033,6 +3111,14 @@ def regr_avgx(
30333111
y: The linear regression dependent variable
30343112
x: The linear regression independent variable
30353113
filter: If provided, only compute against rows for which the filter is True
3114+
3115+
Examples:
3116+
>>> ctx = dfn.SessionContext()
3117+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
3118+
>>> result = df.aggregate(
3119+
... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")])
3120+
>>> result.collect_column("v")[0].as_py()
3121+
5.0
30363122
"""
30373123
filter_raw = filter.expr if filter is not None else None
30383124

@@ -3056,6 +3142,14 @@ def regr_avgy(
30563142
y: The linear regression dependent variable
30573143
x: The linear regression independent variable
30583144
filter: If provided, only compute against rows for which the filter is True
3145+
3146+
Examples:
3147+
>>> ctx = dfn.SessionContext()
3148+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
3149+
>>> result = df.aggregate(
3150+
... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")])
3151+
>>> result.collect_column("v")[0].as_py()
3152+
2.0
30593153
"""
30603154
filter_raw = filter.expr if filter is not None else None
30613155

@@ -3079,6 +3173,14 @@ def regr_count(
30793173
y: The linear regression dependent variable
30803174
x: The linear regression independent variable
30813175
filter: If provided, only compute against rows for which the filter is True
3176+
3177+
Examples:
3178+
>>> ctx = dfn.SessionContext()
3179+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
3180+
>>> result = df.aggregate(
3181+
... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")])
3182+
>>> result.collect_column("v")[0].as_py()
3183+
3
30823184
"""
30833185
filter_raw = filter.expr if filter is not None else None
30843186

@@ -3102,6 +3204,15 @@ def regr_intercept(
31023204
y: The linear regression dependent variable
31033205
x: The linear regression independent variable
31043206
filter: If provided, only compute against rows for which the filter is True
3207+
3208+
Examples:
3209+
>>> ctx = dfn.SessionContext()
3210+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
3211+
>>> result = df.aggregate(
3212+
... [],
3213+
... [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")])
3214+
>>> result.collect_column("v")[0].as_py()
3215+
0.0
31053216
"""
31063217
filter_raw = filter.expr if filter is not None else None
31073218

@@ -3125,6 +3236,14 @@ def regr_r2(
31253236
y: The linear regression dependent variable
31263237
x: The linear regression independent variable
31273238
filter: If provided, only compute against rows for which the filter is True
3239+
3240+
Examples:
3241+
>>> ctx = dfn.SessionContext()
3242+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
3243+
>>> result = df.aggregate(
3244+
... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")])
3245+
>>> result.collect_column("v")[0].as_py()
3246+
1.0
31283247
"""
31293248
filter_raw = filter.expr if filter is not None else None
31303249

@@ -3148,6 +3267,14 @@ def regr_slope(
31483267
y: The linear regression dependent variable
31493268
x: The linear regression independent variable
31503269
filter: If provided, only compute against rows for which the filter is True
3270+
3271+
Examples:
3272+
>>> ctx = dfn.SessionContext()
3273+
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
3274+
>>> result = df.aggregate(
3275+
... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")])
3276+
>>> result.collect_column("v")[0].as_py()
3277+
2.0
31513278
"""
31523279
filter_raw = filter.expr if filter is not None else None
31533280

@@ -3171,6 +3298,14 @@ def regr_sxx(
31713298
y: The linear regression dependent variable
31723299
x: The linear regression independent variable
31733300
filter: If provided, only compute against rows for which the filter is True
3301+
3302+
Examples:
3303+
>>> ctx = dfn.SessionContext()
3304+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
3305+
>>> result = df.aggregate(
3306+
... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")])
3307+
>>> result.collect_column("v")[0].as_py()
3308+
2.0
31743309
"""
31753310
filter_raw = filter.expr if filter is not None else None
31763311

@@ -3194,6 +3329,14 @@ def regr_sxy(
31943329
y: The linear regression dependent variable
31953330
x: The linear regression independent variable
31963331
filter: If provided, only compute against rows for which the filter is True
3332+
3333+
Examples:
3334+
>>> ctx = dfn.SessionContext()
3335+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
3336+
>>> result = df.aggregate(
3337+
... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")])
3338+
>>> result.collect_column("v")[0].as_py()
3339+
2.0
31973340
"""
31983341
filter_raw = filter.expr if filter is not None else None
31993342

@@ -3217,6 +3360,14 @@ def regr_syy(
32173360
y: The linear regression dependent variable
32183361
x: The linear regression independent variable
32193362
filter: If provided, only compute against rows for which the filter is True
3363+
3364+
Examples:
3365+
>>> ctx = dfn.SessionContext()
3366+
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
3367+
>>> result = df.aggregate(
3368+
... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")])
3369+
>>> result.collect_column("v")[0].as_py()
3370+
2.0
32203371
"""
32213372
filter_raw = filter.expr if filter is not None else None
32223373

0 commit comments

Comments
 (0)