Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions src/kaggle/api/kaggle_api_extended.py
Original file line number Diff line number Diff line change
Expand Up @@ -7562,9 +7562,9 @@ def benchmarks_tasks_download_cli(self, task, model=None, output=None, include_s
print(f"Downloading output runs for {task}")
print(f"Target directory: {target_dir}/\n")

display_files = [
f"{self._normalize_model_slug(r.model_version_slug)}/{r.id}/{r.id}.zip" for r in downloadable
]
# Show the extracted output directory (what is actually left on disk after
# extraction), not the intermediate .zip that gets removed.
display_files = [f"{self._normalize_model_slug(r.model_version_slug)}/{r.id}/" for r in downloadable]
model_col = max((len(self._normalize_model_slug(r.model_version_slug)) for r in downloadable), default=20)
model_col = max(model_col, 20)
file_col = max((len(f) for f in display_files), default=40)
Expand All @@ -7575,7 +7575,7 @@ def benchmarks_tasks_download_cli(self, task, model=None, output=None, include_s
print(f"{'Model':<{model_col}} {'File':<{file_col}} {'Size':<{size_col}} {'Progress':<{prog_col}}")
print(f"{'─' * model_col} {'─' * file_col} {'─' * size_col} {'─' * prog_col}")

downloaded, skipped = 0, 0
downloaded, cached, cached_without_source = 0, 0, 0
for r, display_file in zip(downloadable, display_files):
slug = self._normalize_model_slug(r.model_version_slug)
# Hierarchical layout: {output}/{task}/{version}/{model}/{run_id}/
Expand All @@ -7584,8 +7584,16 @@ def benchmarks_tasks_download_cli(self, task, model=None, output=None, include_s

if os.path.isdir(outdir) and not force:
size_str = self._format_size(self._dir_size(outdir))
print(f"{row_prefix} {size_str:<{size_col}} {'Skipped':<{prog_col}}")
skipped += 1
print(f"{row_prefix} {size_str:<{size_col}} {'Cached':<{prog_col}}")
cached += 1
# If the caller asked for source notebooks with -s but the cached dir
# was built without them, count it so we can emit a tip at the end.
# Skip detection requires --force to re-download.
if include_source and not any(
os.path.exists(os.path.join(outdir, n))
for n in ("__notebook__.ipynb", "__notebook_source__.ipynb")
):
cached_without_source += 1
continue

dl_request = ApiDownloadBenchmarkTaskRunOutputRequest()
Expand Down Expand Up @@ -7630,10 +7638,20 @@ def benchmarks_tasks_download_cli(self, task, model=None, output=None, include_s
downloaded += 1
print(f"{row_prefix} {size_str:<{size_col}} {'Done':<{prog_col}}")

# Summary
parts = [f"{n} run(s) {label}" for n, label in ((downloaded, "downloaded"), (skipped, "skipped")) if n]
# Summary — counts are runs (one run may include multiple files, e.g. with --include-source).
# "Cached" = already on disk from a previous download; the data is available without a fetch.
parts = [f"{n} run(s) {label}" for n, label in ((downloaded, "downloaded"), (cached, "cached")) if n]
print(f"\nDone: {', '.join(parts) or '0 runs downloaded'}.")

# Tip: -s alone won't backfill source notebooks into already-cached dirs.
# The check that gates re-download (os.path.isdir(outdir)) doesn't peek inside,
# so the cached row stays untouched even though it lacks the requested files.
if cached_without_source:
print(
f"\nTip: {cached_without_source} cached run(s) lack source notebooks. "
f"Re-run with -f -s to fetch them."
)

@staticmethod
def _format_size(n) -> str:
"""Render a byte count as ``1.06KB`` / ``2.34MB`` / etc."""
Expand Down
51 changes: 46 additions & 5 deletions src/kaggle/test/test_benchmarks_cli.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as #1038 (comment), please update the doc and skill file

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this PR on top of download-fix?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, this is based on that branch. This PR is not ready yet :)

Original file line number Diff line number Diff line change
Expand Up @@ -1180,7 +1180,7 @@ def test_download_all_pending_shows_message(self, api, capsys):
assert "kaggle b t status my-task" in output

def test_download_skips_existing_output(self, api, capsys, tmp_path):
"""Already-downloaded runs are skipped without making API calls."""
"""Already-downloaded runs render as Cached without making API calls."""
_setup_runs_response(api, [_make_run(run_id=42)])
self._mock_download(api)
outdir = str(tmp_path / "out")
Expand All @@ -1192,13 +1192,54 @@ def test_download_skips_existing_output(self, api, capsys, tmp_path):

output = capsys.readouterr().out
assert "gemini-pro" in output
assert "Skipped" in output
assert "1 run(s) skipped" in output
assert "Cached" in output
assert "1 run(s) cached" in output
# The File column shows the output directory, not a .zip path
assert "gemini-pro/42/" in output
assert "/42/42.zip" not in output
# No download API call should have been made
api._mock_benchmarks.download_benchmark_task_run_output.assert_not_called()

def test_download_cached_dir_without_source_prints_tip_when_s_passed(self, api, capsys, tmp_path):
"""When -s is passed but the cached dir lacks source notebooks, hint that -f -s is needed."""
_setup_runs_response(api, [_make_run(run_id=42)])
self._mock_download(api)
outdir = str(tmp_path / "out")
# Cached dir exists but has no __notebook__.ipynb / __notebook_source__.ipynb
existing = os.path.join(outdir, "my-task", "1", "gemini-pro", "42")
os.makedirs(existing)
# Drop a placeholder file so the dir isn't empty, but not a source notebook
with open(os.path.join(existing, "result.run.json"), "w") as f:
f.write("{}")

api.benchmarks_tasks_download_cli("my-task", output=outdir, include_source=True)
output = capsys.readouterr().out

assert "Cached" in output
assert "1 cached run(s) lack source notebooks" in output or "lack source notebooks" in output
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this redundant?

assert "-f -s" in output
# Without -f, the cached dir was not touched: no download API call
api._mock_benchmarks.download_benchmark_task_run_output.assert_not_called()

def test_download_cached_dir_with_source_does_not_print_tip(self, api, capsys, tmp_path):
"""When the cached dir already contains source notebooks, no tip is shown."""
_setup_runs_response(api, [_make_run(run_id=42)])
self._mock_download(api)
outdir = str(tmp_path / "out")
existing = os.path.join(outdir, "my-task", "1", "gemini-pro", "42")
os.makedirs(existing)
# Simulate a previous -s download by dropping the source notebook
with open(os.path.join(existing, "__notebook__.ipynb"), "w") as f:
f.write("{}")

api.benchmarks_tasks_download_cli("my-task", output=outdir, include_source=True)
output = capsys.readouterr().out

assert "Cached" in output
assert "lack source notebooks" not in output

def test_download_summary_counts(self, api, capsys, tmp_path):
"""Download summary shows correct downloaded and skipped counts."""
"""Download summary shows correct downloaded and cached counts."""
_setup_runs_response(
api,
[_make_run(model="new-model", run_id=1), _make_run(model="old-model", run_id=2)],
Expand All @@ -1214,7 +1255,7 @@ def test_download_summary_counts(self, api, capsys, tmp_path):

output = capsys.readouterr().out
assert "1 run(s) downloaded" in output
assert "1 run(s) skipped" in output
assert "1 run(s) cached" in output

def test_download_force_overwrites_existing_output(self, api, capsys, tmp_path):
"""Using force=True re-downloads and overwrites existing output."""
Expand Down
Loading