Skip to content

Commit 430d4ba

Browse files
timsaucerclaude
andcommitted
docs(udf): document SessionContext UDF lookup with worked examples
Add Examples docstrings (doctest) for `udf` / `udaf` / `udwf` / `udfs` / `udafs` / `udwfs` that demonstrate the lookup pattern, including a late-binding example where the function name comes from configuration. Add tests covering config-driven dispatch and built-in UDAF / UDWF lookup so the documented patterns are exercised end-to-end. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c2e0881 commit 430d4ba

2 files changed

Lines changed: 140 additions & 3 deletions

File tree

python/datafusion/context.py

Lines changed: 90 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,11 +1313,44 @@ def deregister_udwf(self, name: str) -> None:
13131313
def udf(self, name: str) -> ScalarUDF:
13141314
"""Look up a registered scalar UDF by name.
13151315
1316+
Returns the same :py:class:`~datafusion.user_defined.ScalarUDF`
1317+
wrapper that :py:meth:`register_udf` accepts, so it can be invoked
1318+
as an expression in the DataFrame API or re-registered into a
1319+
different :py:class:`SessionContext`. Built-in scalar functions
1320+
from the session's function registry are also looked up.
1321+
13161322
Args:
13171323
name: Name of the registered scalar UDF.
13181324
13191325
Raises:
13201326
Exception: If no scalar UDF is registered under ``name``.
1327+
1328+
Examples:
1329+
Register a UDF, then look it up by name and use it in the
1330+
DataFrame API:
1331+
1332+
>>> ctx = dfn.SessionContext()
1333+
>>> nullcheck = dfn.udf(
1334+
... lambda x: x.is_null(),
1335+
... [pa.int64()],
1336+
... pa.bool_(),
1337+
... volatility="immutable",
1338+
... name="nullcheck",
1339+
... )
1340+
>>> ctx.register_udf(nullcheck)
1341+
>>> fn = ctx.udf("nullcheck")
1342+
>>> df = ctx.from_pydict({"a": [1, None, 3]})
1343+
>>> df.select(fn(col("a")).alias("is_null")).to_pydict()
1344+
{'is_null': [False, True, False]}
1345+
1346+
Late-binding: the function name can come from configuration
1347+
rather than an imported symbol, which is useful when the set
1348+
of UDFs is plugin-driven or chosen at runtime:
1349+
1350+
>>> config = {"null_check": "nullcheck"}
1351+
>>> fn = ctx.udf(config["null_check"])
1352+
>>> df.select(fn(col("a")).alias("is_null")).to_pydict()
1353+
{'is_null': [False, True, False]}
13211354
"""
13221355
from datafusion.user_defined import ScalarUDF as _ScalarUDF # noqa: PLC0415
13231356

@@ -1328,11 +1361,27 @@ def udf(self, name: str) -> ScalarUDF:
13281361
def udaf(self, name: str) -> AggregateUDF:
13291362
"""Look up a registered aggregate UDF by name.
13301363
1364+
Returns the same :py:class:`~datafusion.user_defined.AggregateUDF`
1365+
wrapper that :py:meth:`register_udaf` accepts. Built-in aggregate
1366+
functions such as ``sum`` or ``avg`` are also discoverable through
1367+
this lookup. See :py:meth:`udf` for a worked late-binding example;
1368+
the pattern is identical for aggregates.
1369+
13311370
Args:
13321371
name: Name of the registered aggregate UDF.
13331372
13341373
Raises:
13351374
Exception: If no aggregate UDF is registered under ``name``.
1375+
1376+
Examples:
1377+
Look up a built-in aggregate by name and use it in
1378+
:py:meth:`~datafusion.DataFrame.aggregate`:
1379+
1380+
>>> ctx = dfn.SessionContext()
1381+
>>> sum_fn = ctx.udaf("sum")
1382+
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
1383+
>>> df.aggregate([], [sum_fn(col("a")).alias("total")]).to_pydict()
1384+
{'total': [6]}
13361385
"""
13371386
from datafusion.user_defined import ( # noqa: PLC0415
13381387
AggregateUDF as _AggregateUDF,
@@ -1345,11 +1394,27 @@ def udaf(self, name: str) -> AggregateUDF:
13451394
def udwf(self, name: str) -> WindowUDF:
13461395
"""Look up a registered window UDF by name.
13471396
1397+
Returns the same :py:class:`~datafusion.user_defined.WindowUDF`
1398+
wrapper that :py:meth:`register_udwf` accepts. Built-in window
1399+
functions such as ``row_number`` or ``rank`` are also discoverable
1400+
through this lookup. See :py:meth:`udf` for a worked late-binding
1401+
example; the pattern is identical for window functions.
1402+
13481403
Args:
13491404
name: Name of the registered window UDF.
13501405
13511406
Raises:
13521407
Exception: If no window UDF is registered under ``name``.
1408+
1409+
Examples:
1410+
Look up a built-in window function by name and use it in
1411+
``select``:
1412+
1413+
>>> ctx = dfn.SessionContext()
1414+
>>> rn = ctx.udwf("row_number")
1415+
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
1416+
>>> df.select(col("a"), rn().alias("rn")).to_pydict()
1417+
{'a': [10, 20, 30], 'rn': [1, 2, 3]}
13531418
"""
13541419
from datafusion.user_defined import WindowUDF as _WindowUDF # noqa: PLC0415
13551420

@@ -1358,15 +1423,37 @@ def udwf(self, name: str) -> WindowUDF:
13581423
return wrapper
13591424

13601425
def udfs(self) -> list[str]:
1361-
"""Return the sorted names of all registered scalar UDFs."""
1426+
"""Return the sorted names of all registered scalar UDFs.
1427+
1428+
Includes both user-registered and built-in scalar functions. Pair
1429+
with :py:meth:`udf` to drive discovery, validation, or config-based
1430+
dispatch.
1431+
1432+
Examples:
1433+
>>> ctx = dfn.SessionContext()
1434+
>>> "abs" in ctx.udfs()
1435+
True
1436+
"""
13621437
return self.ctx.udfs()
13631438

13641439
def udafs(self) -> list[str]:
1365-
"""Return the sorted names of all registered aggregate UDFs."""
1440+
"""Return the sorted names of all registered aggregate UDFs.
1441+
1442+
Examples:
1443+
>>> ctx = dfn.SessionContext()
1444+
>>> "sum" in ctx.udafs()
1445+
True
1446+
"""
13661447
return self.ctx.udafs()
13671448

13681449
def udwfs(self) -> list[str]:
1369-
"""Return the sorted names of all registered window UDFs."""
1450+
"""Return the sorted names of all registered window UDFs.
1451+
1452+
Examples:
1453+
>>> ctx = dfn.SessionContext()
1454+
>>> "row_number" in ctx.udwfs()
1455+
True
1456+
"""
13701457
return self.ctx.udwfs()
13711458

13721459
def catalog(self, name: str = "datafusion") -> Catalog:

python/tests/test_udf.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,56 @@ def test_udf_lookup(ctx, df) -> None:
9797
ctx.udf("does_not_exist")
9898

9999

100+
def test_udf_late_binding_dispatch(ctx, df) -> None:
101+
"""Resolve a UDF chosen by configuration string, then invoke it."""
102+
late_is_null = udf(
103+
lambda x: x.is_null(),
104+
[pa.int64()],
105+
pa.bool_(),
106+
volatility="immutable",
107+
name="late_is_null",
108+
)
109+
late_is_not_null = udf(
110+
lambda x: pc.invert(x.is_null()),
111+
[pa.int64()],
112+
pa.bool_(),
113+
volatility="immutable",
114+
name="late_is_not_null",
115+
)
116+
117+
ctx.register_udf(late_is_null)
118+
ctx.register_udf(late_is_not_null)
119+
120+
# Pretend this came from a config file / API request — only a string.
121+
runtime_config = {"check_fn": "late_is_not_null"}
122+
123+
assert runtime_config["check_fn"] in ctx.udfs()
124+
125+
fn = ctx.udf(runtime_config["check_fn"])
126+
result = df.select(fn(column("b")).alias("ok")).collect()[0].column(0)
127+
assert result == pa.array([True, True, False])
128+
129+
130+
def test_udaf_lookup_builtin(ctx, df) -> None:
131+
assert "sum" in ctx.udafs()
132+
sum_fn = ctx.udaf("sum")
133+
result = df.aggregate([], [sum_fn(column("a")).alias("total")]).collect()
134+
assert result[0].column(0).to_pylist() == [6]
135+
136+
with pytest.raises(Exception, match="no UDAF named"):
137+
ctx.udaf("does_not_exist")
138+
139+
140+
def test_udwf_lookup_builtin(ctx, df) -> None:
141+
assert "row_number" in ctx.udwfs()
142+
rn = ctx.udwf("row_number")
143+
result = df.select(column("a"), rn().alias("rn")).collect()
144+
assert result[0].column(1).to_pylist() == [1, 2, 3]
145+
146+
with pytest.raises(Exception, match="no UDWF named"):
147+
ctx.udwf("does_not_exist")
148+
149+
100150
class OverThresholdUDF:
101151
def __init__(self, threshold: int = 0) -> None:
102152
self.threshold = threshold

0 commit comments

Comments
 (0)