Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions demo/data_designer_demo_processors/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Data Designer Demo Processors

Demo processor plugins demonstrating PRE_GENERATION and POST_GENERATION stages.

## Installation

```bash
uv pip install -e demo/data_designer_demo_processors
```

## Processors

### RegexFilterProcessor (PRE_GENERATION)

Filters seed data rows based on regex pattern matching.

```python
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig

builder = DataDesignerConfigBuilder(model_configs=[...])
builder.add_processor(RegexFilterProcessorConfig(
name="filter_emails",
column="email",
pattern=r"@company\.com$",
invert=False, # Keep only matching rows
))
```

### SemanticDedupProcessor (POST_GENERATION)

Removes semantically similar rows using sentence embeddings.

```python
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig

builder.add_processor(SemanticDedupProcessorConfig(
name="dedup_responses",
column="response",
similarity_threshold=0.9, # Remove rows with >90% similarity
model_name="all-MiniLM-L6-v2",
))
```

## Pre-downloading the Embedding Model

The semantic dedup processor downloads the embedding model on first use. To pre-download:

```bash
download-semantic-dedup-model
```

## Entry Points

The package registers plugins via entry points:

```toml
[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
```
120 changes: 120 additions & 0 deletions demo/data_designer_demo_processors/notebooks/demo_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Demo: Processor Plugins with PRE_GENERATION and POST_GENERATION stages.

This notebook demonstrates:
1. RegexFilterProcessor (PRE_GENERATION) - filters seed data before generation
2. SemanticDedupProcessor (POST_GENERATION) - deduplicates final dataset

Run cells with `#%%` markers in VS Code or PyCharm.

Requires the demo package to be installed (see the package README) and a
configured "openai-text" model alias, since generation calls out to an LLM.
"""

# %% Imports
import tempfile
from pathlib import Path

import pandas as pd
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig

import data_designer.config as dd
from data_designer.interface import DataDesigner

# %% Create seed data with some rows we want to filter out
# Two "SPAM:" rows are planted deliberately so the PRE_GENERATION regex
# filter has something to remove (7 rows in, ~5 expected out).
seed_data = pd.DataFrame(
    {
        "topic": [
            "Python programming",
            "Machine learning",
            "SPAM: Buy now!",  # Will be filtered by regex
            "Data science",
            "SPAM: Click here",  # Will be filtered by regex
            "Natural language processing",
            "Computer vision",
        ],
        "difficulty": ["beginner", "advanced", "N/A", "intermediate", "N/A", "advanced", "advanced"],
    }
)

print("Seed data before PRE_GENERATION filtering:")
print(seed_data)
print(f"Total rows: {len(seed_data)}")

# %% Setup temporary directory and save seed data
# The seed dataset is passed to Data Designer as a parquet file on disk;
# mkdtemp() is used so repeated runs don't collide.
output_dir = Path(tempfile.mkdtemp())
seed_path = output_dir / "seed.parquet"
seed_data.to_parquet(seed_path, index=False)

# %% Build the Data Designer configuration (uses default openai-text model)
config_builder = dd.DataDesignerConfigBuilder()

# Add seed dataset
config_builder.with_seed_dataset(dd.LocalFileSeedSource(path=str(seed_path)))

# Add LLM column to generate explanations
# The prompt template references seed columns via Jinja-style placeholders.
config_builder.add_column(
    dd.LLMTextColumnConfig(
        name="explanation",
        prompt="""Write a brief one-sentence explanation of the topic: {{ topic }}
Difficulty level: {{ difficulty }}

Keep it concise and educational.""",
        model_alias="openai-text",
    )
)

# Add PRE_GENERATION processor to filter out spam rows
config_builder.add_processor(
    RegexFilterProcessorConfig(
        name="filter_spam",
        column="topic",
        pattern=r"^SPAM:",
        invert=True,  # Keep rows that do NOT match (i.e., filter out spam)
    )
)

# Add POST_GENERATION processor to deduplicate similar explanations
# NOTE: first use downloads the sentence-transformers embedding model unless
# it was pre-fetched with the download-semantic-dedup-model script.
config_builder.add_processor(
    SemanticDedupProcessorConfig(
        name="dedup_explanations",
        column="explanation",
        similarity_threshold=0.85,
    )
)

print("Configuration created successfully!")
processor_configs = config_builder.get_processor_configs()
print(f"Processors configured: {[p.name for p in processor_configs]}")

# %% Run preview to test with a few records
data_designer = DataDesigner()

print("\nRunning preview (3 records)...")
preview = data_designer.preview(config_builder, num_records=3)

print("\nPreview dataset:")
print(preview.dataset)

# %% Run full generation
print("\nRunning full generation...")
results = data_designer.create(
    config_builder,
    num_records=5,
    dataset_name="processor-demo",
)

# Load the final dataset
final_dataset = results.load_dataset()

print("\nFinal dataset after all processors:")
print(final_dataset)
print(f"\nTotal rows in final dataset: {len(final_dataset)}")

# %% Summary
print("\n" + "=" * 60)
print("DEMO SUMMARY")
print("=" * 60)
print(f"Original seed rows: {len(seed_data)}")
print("After PRE_GENERATION (regex filter): Expected ~5 rows (SPAM removed)")
print(f"After POST_GENERATION (semantic dedup): {len(final_dataset)} rows")
25 changes: 25 additions & 0 deletions demo/data_designer_demo_processors/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[project]
name = "data-designer-demo-processors"
version = "0.1.0"
description = "Demo processor plugins for Data Designer showing PRE_GENERATION and POST_GENERATION stages"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"data-designer-config",
"data-designer-engine",
"sentence-transformers>=2.2.0",
]

[project.entry-points."data_designer.plugins"]
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"

[project.scripts]
download-semantic-dedup-model = "data_designer_demo_processors.download_model:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/data_designer_demo_processors"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Demo processor plugins for Data Designer."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Pre-download the semantic dedup embedding model."""

# Model fetched when no CLI argument is given; matches the default in
# SemanticDedupProcessorConfig so pre-downloading covers the common case.
DEFAULT_MODEL = "all-MiniLM-L6-v2"


def main() -> None:
    """Download the embedding model into the local sentence-transformers cache.

    An optional model name may be supplied as the first CLI argument, e.g.
    ``download-semantic-dedup-model all-mpnet-base-v2``; otherwise
    DEFAULT_MODEL is used. Instantiating SentenceTransformer triggers the
    download (a no-op if the model is already cached).
    """
    import sys

    # Imported lazily so importing this module never requires the heavy
    # sentence-transformers dependency; only running the script does.
    from sentence_transformers import SentenceTransformer

    model_name = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MODEL
    print(f"Downloading model: {model_name}")
    SentenceTransformer(model_name)
    print("Model downloaded successfully!")


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
from data_designer_demo_processors.regex_filter.impl import RegexFilterProcessor

__all__ = ["RegexFilterProcessorConfig", "RegexFilterProcessor"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class RegexFilterProcessorConfig(ProcessorConfig):
    """Filter rows based on regex matching on a column.

    This processor filters seed data during the preprocess stage.
    """

    # NOTE(review): presumably the discriminator that routes this config to
    # RegexFilterProcessor through the plugin registry — confirm.
    processor_type: Literal["regex-filter"] = "regex-filter"
    # Name of the seed-data column the pattern is applied to.
    column: str = Field(description="Column to apply regex filter on")
    # Uncompiled regular expression; search (not full-match) semantics.
    pattern: str = Field(description="Regex pattern to match")
    # False (default): keep matching rows. True: keep non-matching rows.
    invert: bool = Field(default=False, description="If True, keep rows that do NOT match")
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

from data_designer.engine.processing.processors.base import Processor
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig

if TYPE_CHECKING:
import pandas as pd

logger = logging.getLogger(__name__)


class RegexFilterProcessor(Processor[RegexFilterProcessorConfig]):
    """Filters rows based on regex matching on a specified column.

    Runs during preprocess to filter seed data before generation. With
    ``invert=False`` only rows whose column value matches the pattern
    (search semantics) are kept; with ``invert=True`` matching rows are
    dropped instead.
    """

    def preprocess(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return ``data`` filtered by the configured regex.

        If the configured column is absent the dataset is returned unchanged
        (with a warning) so a misconfigured filter does not abort the run.
        """
        column = self.config.column
        pattern = self.config.pattern
        invert = self.config.invert

        if column not in data.columns:
            logger.warning(f"⚠️ Column '{column}' not found in dataset. Skipping regex filter.")
            return data

        # Vectorized regex search instead of a per-row Python lambda; values
        # are stringified first so non-string columns are handled too.
        mask = data[column].astype(str).str.contains(pattern, regex=True)

        if invert:
            mask = ~mask

        original_count = len(data)
        data = data[mask].reset_index(drop=True)
        filtered_count = original_count - len(data)

        # BUG FIX: the old message said rows were "kept" when invert=True,
        # but filtered_count always counts rows that were *removed*.
        removed_kind = "matching" if invert else "non-matching"
        logger.info(
            f"🔍 Regex filter: removed {filtered_count} {removed_kind} rows "
            f"(pattern: {pattern!r} on column '{column}')"
        )

        return data
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer.plugins.plugin import Plugin, PluginType

# Plugin descriptor exported through the "data_designer.plugins" entry point
# (registered in this package's pyproject.toml). The config and implementation
# are referenced by dotted-path strings — presumably imported by the registry
# at load time — pairing RegexFilterProcessorConfig with RegexFilterProcessor.
regex_filter_plugin = Plugin(
    config_qualified_name="data_designer_demo_processors.regex_filter.config.RegexFilterProcessorConfig",
    impl_qualified_name="data_designer_demo_processors.regex_filter.impl.RegexFilterProcessor",
    plugin_type=PluginType.PROCESSOR,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig
from data_designer_demo_processors.semantic_dedup.impl import SemanticDedupProcessor

__all__ = ["SemanticDedupProcessorConfig", "SemanticDedupProcessor"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Literal

from pydantic import Field

from data_designer.config.processors import ProcessorConfig


class SemanticDedupProcessorConfig(ProcessorConfig):
    """Remove semantically similar rows using embeddings.

    This processor deduplicates the final dataset during the postprocess stage.
    """

    # NOTE(review): presumably the discriminator that routes this config to
    # SemanticDedupProcessor through the plugin registry — confirm.
    processor_type: Literal["semantic-dedup"] = "semantic-dedup"
    # Text column whose values are embedded and compared for duplicates.
    column: str = Field(description="Column to compute embeddings on for deduplication")
    # Pairs scoring above this cosine similarity are treated as duplicates;
    # validated to lie in [0.0, 1.0].
    similarity_threshold: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Cosine similarity threshold above which rows are considered duplicates",
    )
    # Default matches download_model.DEFAULT_MODEL so the pre-download
    # console script warms the cache for the out-of-the-box configuration.
    model_name: str = Field(
        default="all-MiniLM-L6-v2",
        description="Sentence-transformers model name for computing embeddings",
    )
Loading