Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
22ab0e9
chore: rename ColumnWiseDatasetBuilder, wire async preview, unify row…
andreatgretel Mar 24, 2026
709f53d
feat: add consolidated async progress reporting with row group context
andreatgretel Mar 24, 2026
5a7ba4c
fix async preview and progress reporting
andreatgretel Mar 24, 2026
80013eb
feat: add opt-in sticky ANSI progress bars for generation
andreatgretel Mar 25, 2026
16fa889
docs: add missing progress_interval docstring in RunConfig
andreatgretel Mar 25, 2026
86883db
fix: update progress bar on every completion in async path
andreatgretel Mar 25, 2026
324fecf
fix: resolve row-group semaphore deadlock when all tasks are deferred
andreatgretel Mar 25, 2026
86ef54e
fix: eagerly salvage stalled row groups to avoid wasting semaphore slots
andreatgretel Mar 25, 2026
1edacef
fix: address review findings from greptile and codex
andreatgretel Mar 25, 2026
4890fa3
fix: stable progress bar width and accurate failure counts
andreatgretel Mar 25, 2026
f3a522f
fix: address Nabin's review - exclude_columns, dead code, docstring
andreatgretel Mar 25, 2026
d82c3b1
Merge branch 'main' into andreatgretel/chore/async-engine-followup-v2
andreatgretel Mar 25, 2026
b547216
fix: _drain_frontier exits before dispatching ready salvage tasks
andreatgretel Mar 25, 2026
cde30d9
fix: salvage edge cases found by code review
andreatgretel Mar 25, 2026
ae73275
fix: skip record_failure for already-dropped rows in salvage
andreatgretel Mar 25, 2026
193f80d
Merge branch 'main' into andreatgretel/chore/async-engine-followup-v2
andreatgretel Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ make coverage # Run tests with coverage report
- [packages/data-designer/src/data_designer/interface/data_designer.py](packages/data-designer/src/data_designer/interface/data_designer.py) - Main entry point (`DataDesigner` class)
- [packages/data-designer-config/src/data_designer/config/config_builder.py](packages/data-designer-config/src/data_designer/config/config_builder.py) - Configuration API (`DataDesignerConfigBuilder`)
- [packages/data-designer-config/src/data_designer/config/__init__.py](packages/data-designer-config/src/data_designer/config/__init__.py) - User-facing config API exports
- [packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py](packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py) - Generation orchestrator
- [packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py](packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py) - Generation orchestrator
- [pyproject.toml](pyproject.toml) - Project dependencies and tool configurations
- [Makefile](Makefile) - Common development commands

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ class RunConfig(ConfigBase):
Default is 0.
async_trace: If True, collect per-task tracing data when using the async engine
(DATA_DESIGNER_ASYNC_ENGINE=1). Has no effect on the sync path. Default is False.
progress_bar: If True, display sticky ANSI progress bars instead of periodic log lines
during generation. Requires a TTY; falls back to log lines in non-TTY environments.
Default is False.
progress_interval: How often (in seconds) the async progress reporter emits a
consolidated log block. Must be > 0. Default is 5.0.
throttle: AIMD throttle tuning parameters. See ``ThrottleConfig`` for details.
"""

Expand All @@ -105,6 +110,8 @@ class RunConfig(ConfigBase):
max_conversation_restarts: int = Field(default=5, ge=0)
max_conversation_correction_steps: int = Field(default=0, ge=0)
async_trace: bool = False
progress_bar: bool = False
progress_interval: float = Field(default=5.0, gt=0.0)
throttle: ThrottleConfig = Field(default_factory=ThrottleConfig)

@model_validator(mode="after")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from data_designer.config.column_configs import ExpressionColumnConfig
from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
from data_designer.engine.column_generators.utils.errors import ExpressionTemplateRenderError
from data_designer.engine.context import format_row_group_tag
from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
from data_designer.engine.processing.utils import deserialize_json_values

Expand All @@ -21,7 +22,7 @@

class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorFullColumn[ExpressionColumnConfig]):
def generate(self, data: pd.DataFrame) -> pd.DataFrame:
logger.info(f"🧩 Generating column `{self.config.name}` from expression")
logger.info(f"🧩 {format_row_group_tag()}Generating column `{self.config.name}` from expression")

missing_columns = list(set(self.config.required_columns) - set(data.columns))
if len(missing_columns) > 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
from data_designer.engine.context import format_row_group_tag
from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
from data_designer.engine.errors import DataDesignerRuntimeError
from data_designer.engine.processing.utils import concat_datasets
Expand Down Expand Up @@ -68,7 +69,8 @@ def _log_person_generation_if_needed(self) -> None:

def _prepare_for_generation(self, num_records: int) -> SamplingDatasetGenerator:
logger.info(
f"🎲 Preparing samplers to generate {num_records} records across {len(self.config.columns)} columns"
f"🎲 {format_row_group_tag()}Preparing samplers to generate"
f" {num_records} records across {len(self.config.columns)} columns"
)
self._log_person_generation_if_needed()
return self._create_sampling_dataset_generator()
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
from data_designer.engine.column_generators.utils.errors import SeedDatasetError
from data_designer.engine.context import format_row_group_tag
from data_designer.engine.dataset_builders.multi_column_configs import SeedDatasetMultiColumnConfig
from data_designer.engine.processing.utils import concat_datasets
from data_designer.logging import LOG_INDENT
Expand Down Expand Up @@ -89,7 +90,7 @@ def _reset_batch_reader(self, num_records: int) -> None:
)

def _sample_records(self, num_records: int) -> pd.DataFrame:
logger.info(f"🌱 Sampling {num_records} records from seed dataset")
logger.info(f"🌱 {format_row_group_tag()}Sampling {num_records} records from seed dataset")
logger.info(f"{LOG_INDENT}seed dataset size: {self._seed_dataset_size} records")
logger.info(f"{LOG_INDENT}sampling strategy: {self.config.sampling_strategy}")
if self._index_range is not None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from contextvars import ContextVar

# Row-group position of the task currently executing; the async scheduler
# sets this before running each task.
# Holds (current_rg_index, total_rg_count), or None when no row group is active.
current_row_group: ContextVar[tuple[int, int] | None] = ContextVar("current_row_group", default=None)


def format_row_group_tag() -> str:
    """Build the '(x/X) ' log prefix for the active row group.

    Returns:
        An empty string when no row group context is set. Otherwise the
        1-based row group position, zero-padded to the digit width of the
        total, rendered as '(position/total) ' with a trailing space.
    """
    active = current_row_group.get()
    if active is not None:
        # Stored index is 0-based; the displayed position is 1-based.
        position = active[0] + 1
        total = active[1]
        # Pad to the total's digit count so tags keep a constant width.
        padded = format(position, f"0{len(str(total))}d")
        return f"({padded}/{total}) "
    return ""
Loading
Loading