Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions demo/data_designer_demo_processors/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Data Designer Demo Processors

Demo processor plugins demonstrating PRE_GENERATION and POST_GENERATION stages.

## Installation

```bash
uv pip install -e demo/data_designer_demo_processors
```

## Processors

### RegexFilterProcessor (PRE_GENERATION)

Filters seed data rows based on regex pattern matching.

```python
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig

builder = DataDesignerConfigBuilder(model_configs=[...])
builder.add_processor(RegexFilterProcessorConfig(
name="filter_emails",
column="email",
pattern=r"@company\.com$",
invert=False, # Keep only matching rows
))
```

### SemanticDedupProcessor (POST_GENERATION)

Removes semantically similar rows using sentence embeddings.

```python
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig

builder.add_processor(SemanticDedupProcessorConfig(
name="dedup_responses",
column="response",
similarity_threshold=0.9, # Remove rows with >90% similarity
model_name="all-MiniLM-L6-v2",
))
```

## Pre-downloading the Embedding Model

The semantic dedup processor downloads the embedding model on first use. To pre-download:

```bash
download-semantic-dedup-model
```

## Entry Points

The package registers plugins via entry points:

```toml
[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
```
120 changes: 120 additions & 0 deletions demo/data_designer_demo_processors/notebooks/demo_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Demo: Processor Plugins with PRE_GENERATION and POST_GENERATION stages.

This notebook demonstrates:
1. RegexFilterProcessor (PRE_GENERATION) - filters seed data before generation
2. SemanticDedupProcessor (POST_GENERATION) - deduplicates final dataset

Run cells with `#%%` markers in VS Code or PyCharm.

Requires the demo package to be installed (see the package README) and a
configured "openai-text" model alias, since generation calls out to an LLM.
"""

# %% Imports
import tempfile
from pathlib import Path

import pandas as pd
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig

import data_designer.config as dd
from data_designer.interface import DataDesigner

# %% Create seed data with some rows we want to filter out
# Two "SPAM:" rows are planted deliberately so the PRE_GENERATION regex
# filter has something to remove (7 rows in, ~5 expected out).
seed_data = pd.DataFrame(
    {
        "topic": [
            "Python programming",
            "Machine learning",
            "SPAM: Buy now!",  # Will be filtered by regex
            "Data science",
            "SPAM: Click here",  # Will be filtered by regex
            "Natural language processing",
            "Computer vision",
        ],
        "difficulty": ["beginner", "advanced", "N/A", "intermediate", "N/A", "advanced", "advanced"],
    }
)

print("Seed data before PRE_GENERATION filtering:")
print(seed_data)
print(f"Total rows: {len(seed_data)}")

# %% Setup temporary directory and save seed data
# The seed dataset is passed to Data Designer as a parquet file on disk;
# mkdtemp() is used so repeated runs don't collide.
output_dir = Path(tempfile.mkdtemp())
seed_path = output_dir / "seed.parquet"
seed_data.to_parquet(seed_path, index=False)

# %% Build the Data Designer configuration (uses default openai-text model)
config_builder = dd.DataDesignerConfigBuilder()

# Add seed dataset
config_builder.with_seed_dataset(dd.LocalFileSeedSource(path=str(seed_path)))

# Add LLM column to generate explanations
# The prompt template references seed columns via Jinja-style placeholders.
config_builder.add_column(
    dd.LLMTextColumnConfig(
        name="explanation",
        prompt="""Write a brief one-sentence explanation of the topic: {{ topic }}
Difficulty level: {{ difficulty }}

Keep it concise and educational.""",
        model_alias="openai-text",
    )
)

# Add PRE_GENERATION processor to filter out spam rows
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="filter_spam",
        column="topic",
        pattern=r"^SPAM:",
        invert=True,  # Keep rows that do NOT match (i.e., filter out spam)
    )
)

# Add POST_GENERATION processor to deduplicate similar explanations
# NOTE: first use downloads the sentence-transformers embedding model unless
# it was pre-fetched with the download-semantic-dedup-model script.
config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup_explanations",
        column="explanation",
        similarity_threshold=0.85,
    )
)

print("Configuration created successfully!")
processor_configs = config_builder.get_processor_configs()
print(f"Processors configured: {[p.name for p in processor_configs]}")

# %% Run preview to test with a few records
data_designer = DataDesigner()

print("\nRunning preview (3 records)...")
preview = data_designer.preview(config_builder, num_records=3)

print("\nPreview dataset:")
print(preview.dataset)

# %% Run full generation
print("\nRunning full generation...")
results = data_designer.create(
    config_builder,
    num_records=5,
    dataset_name="processor-demo",
)

# Load the final dataset
final_dataset = results.load_dataset()

print("\nFinal dataset after all processors:")
print(final_dataset)
print(f"\nTotal rows in final dataset: {len(final_dataset)}")

# %% Summary
print("\n" + "=" * 60)
print("DEMO SUMMARY")
print("=" * 60)
print(f"Original seed rows: {len(seed_data)}")
print("After PRE_GENERATION (regex filter): Expected ~5 rows (SPAM removed)")
print(f"After POST_GENERATION (semantic dedup): {len(final_dataset)} rows")
25 changes: 25 additions & 0 deletions demo/data_designer_demo_processors/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[project]
name = "data-designer-demo-processors"
version = "0.1.0"
description = "Demo processor plugins for Data Designer showing PRE_GENERATION and POST_GENERATION stages"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"data-designer-config",
"data-designer-engine",
"sentence-transformers>=2.2.0",
]

[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"

[project.scripts]
download-semantic-dedup-model = "data_designer_demo_processors.download_model:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/data_designer_demo_processors"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Demo processor plugins for Data Designer."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Pre-download the semantic dedup embedding model."""

# Model fetched when no CLI argument is given; matches the default in
# SemanticDedupProcessorConfig so pre-downloading covers the common case.
DEFAULT_MODEL = "all-MiniLM-L6-v2"


def main() -> None:
    """Download the embedding model into the local sentence-transformers cache.

    An optional model name may be supplied as the first CLI argument, e.g.
    ``download-semantic-dedup-model all-mpnet-base-v2``; otherwise
    DEFAULT_MODEL is used. Instantiating SentenceTransformer triggers the
    download (a no-op if the model is already cached).
    """
    import sys

    # Imported lazily so importing this module never requires the heavy
    # sentence-transformers dependency; only running the script does.
    from sentence_transformers import SentenceTransformer

    model_name = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MODEL
    print(f"Downloading model: {model_name}")
    SentenceTransformer(model_name)
    print("Model downloaded successfully!")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
from data_designer_demo_processors.regex_filter.impl import RegexFilterProcessor

__all__ = ["RegexFilterProcessorConfig", "RegexFilterProcessor"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class RegexFilterProcessorConfig(ProcessorConfig):
    """Filter rows based on regex matching on a column.

    This processor filters seed data during the preprocess stage.
    """

    # NOTE(review): presumably the discriminator that routes this config to
    # RegexFilterProcessor through the plugin registry — confirm.
    processor_type: Literal["regex-filter"] = "regex-filter"
    # Name of the seed-data column the pattern is applied to.
    column: str = Field(description="Column to apply regex filter on")
    # Uncompiled regular expression; search (not full-match) semantics.
    pattern: str = Field(description="Regex pattern to match")
    # False (default): keep matching rows. True: keep non-matching rows.
    invert: bool = Field(default=False, description="If True, keep rows that do NOT match")
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


class RegexFilterProcessor(Processor[RegexFilterProcessorConfig]):
    """Filters rows based on regex matching on a specified column.

    Runs during preprocess to filter seed data before generation. With
    ``invert=False`` only rows whose column value matches the pattern
    (search semantics) are kept; with ``invert=True`` matching rows are
    dropped instead.
    """

    def preprocess(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return ``data`` filtered by the configured regex.

        If the configured column is absent the dataset is returned unchanged
        (with a warning) so a misconfigured filter does not abort the run.
        """
        column = self.config.column
        pattern = self.config.pattern
        invert = self.config.invert

        if column not in data.columns:
            logger.warning(f"⚠️ Column '{column}' not found in dataset. Skipping regex filter.")
            return data

        # Vectorized regex search instead of a per-row Python lambda; values
        # are stringified first so non-string columns are handled too.
        mask = data[column].astype(str).str.contains(pattern, regex=True)

        if invert:
            mask = ~mask

        original_count = len(data)
        data = data[mask].reset_index(drop=True)
        filtered_count = original_count - len(data)

        # BUG FIX: the old message said rows were "kept" when invert=True,
        # but filtered_count always counts rows that were *removed*.
        removed_kind = "matching" if invert else "non-matching"
        logger.info(
            f"🔍 Regex filter: removed {filtered_count} {removed_kind} rows "
            f"(pattern: {pattern!r} on column '{column}')"
        )

        return data
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer.plugins.plugin import Plugin, PluginType

# Plugin descriptor exported through the "data_designer.plugins" entry point
# (registered in this package's pyproject.toml). The config and implementation
# are referenced by dotted-path strings — presumably imported by the registry
# at load time — pairing RegexFilterProcessorConfig with RegexFilterProcessor.
regex_filter_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.regex_filter.config.RegexFilterProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.regex_filter.impl.RegexFilterProcessor",
    plugin_type=PluginType.PROCESSOR,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig
from data_designer_demo_processors.semantic_dedup.impl import SemanticDedupProcessor

__all__ = ["SemanticDedupProcessorConfig", "SemanticDedupProcessor"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class SemanticDedupProcessorConfig(ProcessorConfig):
    """Remove semantically similar rows using embeddings.

    This processor deduplicates the final dataset during the postprocess stage.
    """

    # NOTE(review): presumably the discriminator that routes this config to
    # SemanticDedupProcessor through the plugin registry — confirm.
    processor_type: Literal["semantic-dedup"] = "semantic-dedup"
    # Text column whose values are embedded and compared for duplicates.
    column: str = Field(description="Column to compute embeddings on for deduplication")
    # Pairs scoring above this cosine similarity are treated as duplicates;
    # validated to lie in [0.0, 1.0].
    similarity_threshold: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Cosine similarity threshold above which rows are considered duplicates",
    )
    # Default matches download_model.DEFAULT_MODEL so the pre-download
    # console script warms the cache for the out-of-the-box configuration.
    model_name: str = Field(
        default="all-MiniLM-L6-v2",
        description="Sentence-transformers model name for computing embeddings",
    )
Loading