Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docs/how-tos/run-data-quality-checks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,47 @@ Async validators

For validation logic that requires async operations (e.g., async database queries or API calls), use ``AsyncDataValidator`` or ``AsyncBaseDefaultValidator`` from ``hamilton.data_quality.base``. These define ``async def validate()`` and work with ``AsyncDriver``. You can mix sync and async validators in a single ``@check_output_custom`` call.

Disabling validators at runtime
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Validators are useful during development but may be unnecessary overhead in a trusted production pipeline. You can disable all ``@check_output`` and ``@check_output_custom`` validators at graph-construction time, so no extra nodes are ever created:

.. code-block:: python

dr = (
hamilton.driver.Builder()
.with_modules(my_pipeline)
.with_data_quality_disabled()
.build()
)

This is equivalent to passing ``{"hamilton.data_quality.disable_checks": True}`` via ``.with_config()``, which is useful when the flag is controlled dynamically (e.g., from an environment variable):

.. code-block:: python

import os

dr = (
hamilton.driver.Builder()
.with_modules(my_pipeline)
.with_config({"hamilton.data_quality.disable_checks": os.getenv("DISABLE_DQ", "false") == "true"})
.build()
)

Because the flag is resolved at graph-construction time, a driver built with checks disabled carries zero runtime overhead from validation — no validator nodes are created at all.

A second use case is graph visualization. Each decorated function normally expands into several nodes (``{name}_raw``, one per validator, and the final ``{name}`` node), which can clutter a visualization when you want to communicate pipeline structure rather than validation wiring. Building a driver with ``with_data_quality_disabled()`` gives a clean visualization with only the business-logic nodes:

.. code-block:: python

dr_viz = (
hamilton.driver.Builder()
.with_modules(my_pipeline)
.with_data_quality_disabled()
.build()
)
dr_viz.display_all_functions("pipeline.png")

Note that this requires a separate driver instance from the one used for execution if you still want validations to run.

See the :doc:`check_output reference <../reference/decorators/check_output>` and `data quality writeup <https://github.com/apache/hamilton/blob/main/writeups/data_quality.md>`_ for details and examples.
19 changes: 19 additions & 0 deletions docs/reference/decorators/check_output.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,25 @@ from ``hamilton.data_quality.base`` as your base class instead of the sync varia
See `data_quality <https://github.com/apache/hamilton/blob/main/data\_quality.md>`_ for more information on
available validators and how to build custom ones.

Disabling validators
~~~~~~~~~~~~~~~~~~~~

All ``@check_output`` and ``@check_output_custom`` validators can be disabled at graph-construction
time using ``Builder.with_data_quality_disabled()``:

.. code-block:: python

dr = (
hamilton.driver.Builder()
.with_modules(my_pipeline)
.with_data_quality_disabled()
.build()
)

This eliminates all validator nodes from the graph — no ``_raw`` or validator nodes are created, so
there is zero runtime cost. It is equivalent to ``.with_config({"hamilton.data_quality.disable_checks": True})``.
See :doc:`../drivers/Driver` for full ``Builder`` documentation.

Note that we also have plugins that allow for validation with the pandera and pydantic libraries. There are two ways to access these:

1. ``@check_output(schema=pandera_schema)`` or ``@check_output(model=pydantic_model)``
Expand Down
14 changes: 14 additions & 0 deletions hamilton/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1843,6 +1843,20 @@ def with_adapters(self, *adapters: lifecycle_base.LifecycleAdapter) -> Self:
self.adapters.extend(adapters)
return self

def with_data_quality_disabled(self) -> Self:
    """Turn off every ``@check_output`` / ``@check_output_custom`` validator for this build.

    The flag is applied while the graph is being constructed, so the validation
    decorators add no validator nodes and validation has no runtime cost.

    Calling this is the same as ``.with_config({DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY: True})``.
    Because it is just a config write, a later
    ``.with_config({DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY: False})`` turns validation
    back on: the most recent write to the key is the one that takes effect.

    :return: self
    """
    # Function-scope import -- presumably to avoid an import cycle with the
    # function_modifiers package; confirm before hoisting to module level.
    from hamilton.function_modifiers.validation import DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY

    disable_flag = {DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY: True}
    return self.with_config(disable_flag)

def with_materializers(self, *materializers: ExtractorFactory | MaterializerFactory) -> Self:
"""Add materializer nodes to the `Driver`
The generated nodes can be referenced by name in `.execute()`
Expand Down
6 changes: 6 additions & 0 deletions hamilton/function_modifiers/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

IS_DATA_VALIDATOR_TAG = "hamilton.data_quality.contains_dq_results"
DATA_VALIDATOR_ORIGINAL_OUTPUT_TAG = "hamilton.data_quality.source_node"
DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY = "hamilton.data_quality.disable_checks"


class BaseDataValidationDecorator(base.NodeTransformer):
Expand All @@ -42,9 +43,14 @@ def get_validators(self, node_to_validate: node.Node) -> list[dq_base.DataValida
"""
pass

def optional_config(self) -> dict[str, Any]:
    """Return the optional config this decorator reads, with its default.

    The only entry is the data-quality disable flag, which defaults to ``False``
    (validation stays enabled unless the flag is explicitly set).

    :return: mapping of the disable-checks config key to its default value.
    """
    defaults: dict[str, Any] = {DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY: False}
    return defaults

def transform_node(
self, node_: node.Node, config: dict[str, Any], fn: Callable
) -> Collection[node.Node]:
if config.get(DISABLE_DATA_QUALITY_CHECKS_CONFIG_KEY, False):
return [node_]
raw_node = node.Node(
name=node_.name
+ "_raw", # TODO -- make this unique -- this will break with multiple validation decorators, which we *don't* want
Expand Down
38 changes: 38 additions & 0 deletions tests/function_modifiers/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,44 @@ def fn(input: int) -> int:
assert result_fail.passes is False


def test_check_output_disabled_via_config_returns_original_node():
    """check_output_custom must pass the node through untouched when checks are disabled.

    With ``hamilton.data_quality.disable_checks`` set to True, ``transform_node`` is
    expected to return a single-element collection holding the very same node object.
    """

    def fn(input: pd.Series) -> pd.Series:
        return input

    custom_decorator = check_output_custom(
        SampleDataValidator2(dataset_length=1, importance="warn"),
        SampleDataValidator3(dtype=np.int64, importance="warn"),
    )
    original_node = node.Node.from_fn(fn)
    disabled_config = {"hamilton.data_quality.disable_checks": True}

    produced = custom_decorator.transform_node(original_node, config=disabled_config, fn=fn)

    assert len(produced) == 1
    # Identity, not equality: the decorator must not copy or wrap the node.
    assert produced[0] is original_node


def test_check_output_builtin_disabled_via_config_returns_original_node():
    """The built-in check_output decorator honors the disable flag like the custom one.

    With ``hamilton.data_quality.disable_checks`` set to True, ``transform_node`` is
    expected to return a single-element collection holding the very same node object.
    """

    def fn(input: pd.Series) -> pd.Series:
        return input

    builtin_decorator = check_output(
        importance="warn",
        default_validator_candidates=DUMMY_VALIDATORS_FOR_TESTING,
        dataset_length=1,
        dtype=np.int64,
    )
    original_node = node.Node.from_fn(fn)
    disabled_config = {"hamilton.data_quality.disable_checks": True}

    produced = builtin_decorator.transform_node(original_node, config=disabled_config, fn=fn)

    assert len(produced) == 1
    # Identity, not equality: the decorator must not copy or wrap the node.
    assert produced[0] is original_node


def test_sync_wrapper_guards_against_unawaited_coroutine():
"""Sync wrapper should raise TypeError if validator accidentally returns a coroutine."""

Expand Down
11 changes: 5 additions & 6 deletions tests/plugins/test_sklearn_plot_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

import inspect
import pathlib

import numpy as np
Expand Down Expand Up @@ -141,12 +142,10 @@ def decision_boundary_display() -> DecisionBoundaryDisplay:
grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
tree = DecisionTreeClassifier().fit(iris.data[:, :2], iris.target)
y_pred = np.reshape(tree.predict(grid), feature_1.shape)
kwargs = dict(xx0=feature_1, xx1=feature_2, response=y_pred)
# sklearn 1.8+ requires n_classes
sig = inspection.DecisionBoundaryDisplay.__init__
if "n_classes" in sig.__code__.co_varnames:
kwargs["n_classes"] = 3
decision_curve = inspection.DecisionBoundaryDisplay(**kwargs)
dbd_kwargs = dict(xx0=feature_1, xx1=feature_2, response=y_pred)
if "n_classes" in inspect.signature(inspection.DecisionBoundaryDisplay.__init__).parameters:
dbd_kwargs["n_classes"] = len(np.unique(iris.target))
decision_curve = inspection.DecisionBoundaryDisplay(**dbd_kwargs)
return decision_curve


Expand Down
42 changes: 42 additions & 0 deletions tests/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,3 +600,45 @@ def test_driver_v2_inputs_can_be_none():
with pytest.raises(ValueError):
# validate that None doesn't cause issues
dr.execute(["e"], inputs=None)


def test_builder_with_data_quality_disabled_removes_validator_nodes():
    """A driver built via with_data_quality_disabled() exposes no data-quality result nodes."""
    builder = driver.Builder().with_modules(tests.resources.data_quality)
    dr = builder.with_data_quality_disabled().build()

    # Validator nodes are identified by this tag being truthy on the variable.
    dq_tag = "hamilton.data_quality.contains_dq_results"
    tagged_vars = []
    for var in dr.list_available_variables():
        if var.tags.get(dq_tag, False):
            tagged_vars.append(var)

    assert len(tagged_vars) == 0


def test_builder_with_data_quality_disabled_still_executes_correctly():
    """Execution succeeds and yields the expected value when data quality is disabled.

    The run is given ``data_quality_should_fail=True`` as input and must still
    return ``[10.0]`` for ``data_might_be_in_range`` without raising.
    """
    builder = driver.Builder().with_modules(tests.resources.data_quality)
    dr = builder.with_data_quality_disabled().build()

    target = "data_might_be_in_range"
    result = dr.execute([target], inputs={"data_quality_should_fail": True})

    assert list(result[target]) == [10.0]


def test_disable_data_quality_checks_config_key_works_directly():
    """Setting hamilton.data_quality.disable_checks through with_config also removes validator nodes."""
    disabled_config = {"hamilton.data_quality.disable_checks": True}
    builder = driver.Builder().with_modules(tests.resources.data_quality)
    dr = builder.with_config(disabled_config).build()

    # Validator nodes are identified by this tag being truthy on the variable.
    dq_tag = "hamilton.data_quality.contains_dq_results"
    tagged_vars = []
    for var in dr.list_available_variables():
        if var.tags.get(dq_tag, False):
            tagged_vars.append(var)

    assert len(tagged_vars) == 0
Loading