Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions dandi/cli/tests/test_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@
import zarr

from ..cmd_digest import digest
from ...tests.test_helpers import zarr_format_of

# The on-disk Zarr serialisation format changed between V2 (``.zgroup`` /
# ``.zarray``) and V3 (``zarr.json``, ``c/<chunk>`` layout, different default
# compressor), so the digest of "the same" Zarr data differs. Key expected
# values on the format that was *actually* produced rather than on
# ``zarr.__version__`` — zarr-python 3.x can still write V2 via
# ``zarr_format=2``.
_EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT = {
"2": "4313ab36412db2981c3ed391b38604d6-5--1516",
"3": "00157f091c9a6295e89eb3c4c2efaeff-5--3935",
}


def test_digest_default():
Expand Down Expand Up @@ -44,16 +56,20 @@ def test_digest(alg, filehash):


def test_digest_zarr():
# This test assumes that the Zarr serialization format never changes
# Expected digest is selected by the Zarr serialisation format that
# ``zarr.save`` actually produced (V2 vs V3 layouts have different digests).
runner = CliRunner()
with runner.isolated_filesystem():
dt = np.dtype("<i8")
zarr.save(
"sample.zarr", np.arange(1000, dtype=dt), np.arange(1000, 0, -1, dtype=dt)
)
expected = _EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT[
zarr_format_of(Path("sample.zarr"))
]
r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
assert r.exit_code == 0
assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
assert r.output == f"sample.zarr: {expected}\n"


def test_digest_empty_zarr(tmp_path: Path) -> None:
Expand All @@ -66,13 +82,16 @@ def test_digest_empty_zarr(tmp_path: Path) -> None:


def test_digest_zarr_with_excluded_dotfiles():
# This test assumes that the Zarr serialization format never changes
# See comment in `test_digest_zarr` regarding V2 vs V3 serialisation.
runner = CliRunner()
with runner.isolated_filesystem():
dt = np.dtype("<i8")
zarr.save(
"sample.zarr", np.arange(1000, dtype=dt), np.arange(1000, 0, -1, dtype=dt)
)
expected = _EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT[
zarr_format_of(Path("sample.zarr"))
]
subprocess.run(["git", "init"], cwd="sample.zarr", check=True)
os.mkdir("sample.zarr/.dandi")
Path("sample.zarr", ".dandi", "somefile.txt").touch()
Expand All @@ -82,4 +101,4 @@ def test_digest_zarr_with_excluded_dotfiles():
Path("sample.zarr", "arr_1", ".datalad", "config").touch()
r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
assert r.exit_code == 0
assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
assert r.output == f"sample.zarr: {expected}\n"
94 changes: 45 additions & 49 deletions dandi/files/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,25 @@ def _ts_validate_zarr3_array(
return results


def _is_empty_group(group: Any) -> bool:
"""
Return whether a `zarr.Group` has no child arrays or groups.

In zarr-python 2.x ``bool(group)`` / ``len(group)`` is a cheap check, but in
zarr-python 3.x both eagerly iterate the group's members, which raises if
the underlying directory contains non-Zarr files (e.g. an arbitrary
``a/b/c/...`` tree) or arrays with metadata that the new library can't
parse (e.g. legacy ``>u1`` dtypes). Treat any such failure as "not empty"
— there is *some* content there, even if it's not a valid Zarr member —
so we don't spuriously report ``dandi_zarr.empty_group``.
"""
try:
return len(group) == 0
except Exception as exc:
lgr.debug("Could not determine emptiness of Zarr group %r: %s", group, exc)
return False


@dataclass
class LocalZarrEntry(BasePath):
"""A file or directory within a `ZarrAsset`"""
Expand Down Expand Up @@ -455,67 +474,41 @@ def get_validation_errors(
standard=Standard.ZARR,
)

try:
data = zarr.open(str(self.filepath), mode="r")
except zarr.errors.PathNotFoundError as e:
# The asset is potentially in Zarr V3 format, which is not support by
# zarr-python 2.x. Before upgrade to zarr-python 3.x, use tensorstore to
# open it.

format_version = get_zarr_format_version(self.filepath)

if format_version is None:
# === The Zarr format can't be determined ===
# Zarr V3 stores are validated by `_ts_validate_zarr3` regardless of
# zarr-python version. With zarr-python 2.x, `zarr.open()` cannot
# handle V3 at all (raises `PathNotFoundError`); with zarr-python 3.x
# it can sometimes partially open malformed V3 stores (e.g. valid
# root metadata, bad child metadata), which would cause us to miss
# the `_ts_validate_zarr3` path. Detecting the format up front keeps
# the validation behaviour consistent across versions.
format_version = get_zarr_format_version(self.filepath)
if format_version == "3":
errors.extend(_ts_validate_zarr3(self.filepath, devel_debug))
data = None
Comment thread
yarikoptic marked this conversation as resolved.
Dismissed
else:
try:
data = zarr.open(str(self.filepath), mode="r")
except Exception as e:
if devel_debug:
raise
errors.append(
ValidationResult(
id="zarr.cannot_open",
origin=origin_internal_zarr,
scope=Scope.FILE,
origin_result=e,
severity=Severity.ERROR,
message="Error opening file and Zarr format cannot be determined",
path=self.filepath,
)
message = (
"Error opening file and Zarr format cannot be determined"
if format_version is None
else "Error opening file."
)
elif format_version == "3":
# === The Zarr format is V3 ===
errors.extend(_ts_validate_zarr3(self.filepath, devel_debug))
else:
# === A Zarr format should be supported by `zarr.open()` ===
if devel_debug:
raise
errors.append(
ValidationResult(
id="zarr.cannot_open",
origin=origin_internal_zarr,
scope=Scope.FILE,
origin_result=e,
severity=Severity.ERROR,
message="Error opening file.",
message=message,
path=self.filepath,
)
)

# code for temporary workaround for Zarr format V3 with tensorstore ends

except Exception as e:
if devel_debug:
raise
errors.append(
ValidationResult(
origin=origin_internal_zarr,
severity=Severity.ERROR,
id="zarr.cannot_open",
scope=Scope.FILE,
origin_result=e,
path=self.filepath,
message="Error opening file.",
)
)
else:
if isinstance(data, zarr.Group) and not data:
data = None
if isinstance(data, zarr.Group) and _is_empty_group(data):
errors.append(
ValidationResult(
origin=ORIGIN_VALIDATION_DANDI_ZARR,
Expand Down Expand Up @@ -1048,7 +1041,10 @@ class UploadItem:

@classmethod
def from_entry(cls, e: LocalZarrEntry, digest: str) -> UploadItem:
if e.name in {".zarray", ".zattrs", ".zgroup", ".zmetadata"}:
# JSON metadata files. ``.zarray`` / ``.zattrs`` / ``.zgroup`` /
# ``.zmetadata`` are the V2 names; ``zarr.json`` is the V3 name (a
# single file per group/array containing all metadata).
if e.name in {".zarray", ".zattrs", ".zgroup", ".zmetadata", "zarr.json"}:
try:
with e.filepath.open("rb") as fp:
json.load(fp)
Expand Down
48 changes: 25 additions & 23 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

from .fixtures import SampleDandiset, SampleDandisetFactory
from .skip import mark
from .test_helpers import assert_dirtrees_eq
from .test_helpers import TWO_ARRAY_ZARR_LAYOUT, assert_dirtrees_eq, zarr_format_of
from ..consts import DRAFT, dandiset_metadata_file
from ..dandiarchive import DandisetURL
from ..download import (
Expand Down Expand Up @@ -460,6 +460,10 @@ def test_download_different_zarr_onto_excluded_dotfiles(
dd.mkdir()
zarr_path = dd / "sample.zarr"
zarr.save(zarr_path, np.eye(5))
# `zarr_dandiset` was uploaded with default ``zarr.save`` — its on-disk
# layout (and therefore the layout we expect to see after download) is
# V2 for zarr-python 2.x, V3 for zarr-python 3.x.
layout = TWO_ARRAY_ZARR_LAYOUT[zarr_format_of(zarr_dandiset.dspath / "sample.zarr")]
(zarr_path / ".git").touch()
(zarr_path / ".dandi").mkdir()
(zarr_path / ".dandi" / "somefile.txt").touch()
Expand All @@ -472,21 +476,18 @@ def test_download_different_zarr_onto_excluded_dotfiles(
tmp_path,
existing=DownloadExisting.OVERWRITE_DIFFERENT,
)
assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == [
dotfile_paths = [
zarr_path / ".dandi",
zarr_path / ".dandi" / "somefile.txt",
zarr_path / ".datalad",
zarr_path / ".git",
zarr_path / ".gitattributes",
zarr_path / ".zgroup",
zarr_path / "arr_0",
zarr_path / "arr_0" / ".gitmodules",
zarr_path / "arr_0" / ".zarray",
zarr_path / "arr_0" / "0",
zarr_path / "arr_1",
zarr_path / "arr_1" / ".zarray",
zarr_path / "arr_1" / "0",
]
zarr_paths = [zarr_path / p for p in layout["files_and_dirs"]]
assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == sorted(
set(dotfile_paths + zarr_paths)
)


def test_download_different_zarr_delete_dir(
Expand All @@ -495,7 +496,16 @@ def test_download_different_zarr_delete_dir(
d = new_dandiset.dandiset
dspath = new_dandiset.dspath
zarr.save(dspath / "sample.zarr", np.eye(5))
assert not any(p.is_dir() for p in (dspath / "sample.zarr").iterdir())
# The single-array ``np.eye(5)`` Zarr has a flatter layout than the
# two-array Zarr below: in V2 no subdirs at all; in V3 only ``c/``
# (the chunk container) rather than ``arr_0/`` and ``arr_1/``. This
# is the contrast the test exercises — downloading the smaller Zarr
# must delete the extra ``arr_*`` subdirs of the bigger one.
initial_subdirs = {p.name for p in (dspath / "sample.zarr").iterdir() if p.is_dir()}
if zarr_format_of(dspath / "sample.zarr") == "2":
assert initial_subdirs == set()
else:
assert initial_subdirs == {"c"}
new_dandiset.upload()
dd = tmp_path / d.identifier
dd.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -922,29 +932,21 @@ def test_download_multiple_urls(
# end up using the same `new_dandiset`. Hence, we need to fall back to
# using `sample_dandiset_factory` here.
zarr_dandiset = sample_dandiset_factory.mkdandiset("Sample Zarr Dandiset")
zarr.save(
zarr_dandiset.dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)
)
zarr_local = zarr_dandiset.dspath / "sample.zarr"
zarr.save(zarr_local, np.arange(1000), np.arange(1000, 0, -1))
layout = TWO_ARRAY_ZARR_LAYOUT[zarr_format_of(zarr_local)]
zarr_dandiset.upload()

download([text_dandiset.dandiset.api_url, zarr_dandiset.dandiset.api_url], tmp_path)
zarr_root = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr"
assert list_paths(tmp_path) == [
tmp_path / text_dandiset.dandiset_id / "dandiset.yaml",
tmp_path / text_dandiset.dandiset_id / "file.txt",
tmp_path / text_dandiset.dandiset_id / "subdir1" / "apple.txt",
tmp_path / text_dandiset.dandiset_id / "subdir2" / "banana.txt",
tmp_path / text_dandiset.dandiset_id / "subdir2" / "coconut.txt",
tmp_path / zarr_dandiset.dandiset_id / "dandiset.yaml",
tmp_path
/ zarr_dandiset.dandiset_id
/ tmp_path
/ zarr_dandiset.dandiset_id
/ "sample.zarr"
/ ".zgroup",
tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_0" / ".zarray",
tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_0" / "0",
tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_1" / ".zarray",
tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_1" / "0",
*[zarr_root / p for p in layout["files"]],
]


Expand Down
Loading
Loading