dandi · yarikoptic-gitmate · May 21, 2026 · May 19, 2026 · May 20, 2026 · May 20, 2026
diff --git a/dandi/cli/tests/test_digest.py b/dandi/cli/tests/test_digest.py
@@ -8,6 +8,18 @@
 import zarr
 
 from ..cmd_digest import digest
+from ...tests.test_helpers import zarr_format_of
+
+# The on-disk Zarr serialisation format changed between V2 (``.zgroup`` /
+# ``.zarray``) and V3 (``zarr.json``, ``c/<chunk>`` layout, different default
+# compressor), so the digest of "the same" Zarr data differs. Key expected
+# values on the format that was *actually* produced rather than on
+# ``zarr.__version__`` — zarr-python 3.x can still write V2 via
+# ``zarr_format=2``.
+_EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT = {
+    "2": "4313ab36412db2981c3ed391b38604d6-5--1516",
+    "3": "00157f091c9a6295e89eb3c4c2efaeff-5--3935",
+}
 
 
 def test_digest_default():
@@ -44,16 +56,20 @@ def test_digest(alg, filehash):
 
 
 def test_digest_zarr():
-    # This test assumes that the Zarr serialization format never changes
+    # Expected digest is selected by the Zarr serialisation format that
+    # ``zarr.save`` actually produced (V2 vs V3 layouts have different digests).
     runner = CliRunner()
     with runner.isolated_filesystem():
         dt = np.dtype("<i8")
         zarr.save(
             "sample.zarr", np.arange(1000, dtype=dt), np.arange(1000, 0, -1, dtype=dt)
         )
+        expected = _EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT[
+            zarr_format_of(Path("sample.zarr"))
+        ]
         r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
         assert r.exit_code == 0
-        assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
+        assert r.output == f"sample.zarr: {expected}\n"
 
 
 def test_digest_empty_zarr(tmp_path: Path) -> None:
@@ -66,13 +82,16 @@ def test_digest_empty_zarr(tmp_path: Path) -> None:
 
 
 def test_digest_zarr_with_excluded_dotfiles():
-    # This test assumes that the Zarr serialization format never changes
+    # See comment in `test_digest_zarr` regarding V2 vs V3 serialisation.
     runner = CliRunner()
     with runner.isolated_filesystem():
         dt = np.dtype("<i8")
         zarr.save(
             "sample.zarr", np.arange(1000, dtype=dt), np.arange(1000, 0, -1, dtype=dt)
         )
+        expected = _EXPECTED_SAMPLE_ZARR_DIGEST_BY_FORMAT[
+            zarr_format_of(Path("sample.zarr"))
+        ]
         subprocess.run(["git", "init"], cwd="sample.zarr", check=True)
         os.mkdir("sample.zarr/.dandi")
         Path("sample.zarr", ".dandi", "somefile.txt").touch()
@@ -82,4 +101,4 @@ def test_digest_zarr_with_excluded_dotfiles():
         Path("sample.zarr", "arr_1", ".datalad", "config").touch()
         r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
         assert r.exit_code == 0
-        assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
+        assert r.output == f"sample.zarr: {expected}\n"
diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py
@@ -271,6 +271,25 @@ def _ts_validate_zarr3_array(
     return results
 
 
+def _is_empty_group(group: Any) -> bool:
+    """
+    Return whether a `zarr.Group` has no child arrays or groups.
+
+    In zarr-python 2.x ``bool(group)`` / ``len(group)`` is a cheap check, but in
+    zarr-python 3.x both eagerly iterate the group's members, which raises if
+    the underlying directory contains non-Zarr files (e.g. an arbitrary
+    ``a/b/c/...`` tree) or arrays with metadata that the new library can't
+    parse (e.g. legacy ``>u1`` dtypes). Treat any such failure as "not empty"
+    — there is *some* content there, even if it's not a valid Zarr member —
+    so we don't spuriously report ``dandi_zarr.empty_group``.
+    """
+    try:
+        return len(group) == 0
+    except Exception as exc:
+        lgr.debug("Could not determine emptiness of Zarr group %r: %s", group, exc)
+        return False
+
+
 @dataclass
 class LocalZarrEntry(BasePath):
     """A file or directory within a `ZarrAsset`"""
@@ -455,67 +474,41 @@ def get_validation_errors(
             standard=Standard.ZARR,
         )
 
-        try:
-            data = zarr.open(str(self.filepath), mode="r")
-        except zarr.errors.PathNotFoundError as e:
-            # The asset is potentially in Zarr V3 format, which is not support by
-            # zarr-python 2.x. Before upgrade to zarr-python 3.x, use tensorstore to
-            # open it.
-
-            format_version = get_zarr_format_version(self.filepath)
-
-            if format_version is None:
-                # === The Zarr format can't be determined ===
+        # Zarr V3 stores are validated by `_ts_validate_zarr3` regardless of
+        # zarr-python version. With zarr-python 2.x, `zarr.open()` cannot
+        # handle V3 at all (raises `PathNotFoundError`); with zarr-python 3.x
+        # it can sometimes partially open malformed V3 stores (e.g. valid
+        # root metadata, bad child metadata), which would cause us to miss
+        # the `_ts_validate_zarr3` path. Detecting the format up front keeps
+        # the validation behaviour consistent across versions.
+        format_version = get_zarr_format_version(self.filepath)
+        if format_version == "3":
+            errors.extend(_ts_validate_zarr3(self.filepath, devel_debug))
+            data = None
+        else:
+            try:
+                data = zarr.open(str(self.filepath), mode="r")
+            except Exception as e:
                 if devel_debug:
                     raise
-                errors.append(
-                    ValidationResult(
-                        id="zarr.cannot_open",
-                        origin=origin_internal_zarr,
-                        scope=Scope.FILE,
-                        origin_result=e,
-                        severity=Severity.ERROR,
-                        message="Error opening file and Zarr format cannot be determined",
-                        path=self.filepath,
-                    )
+                message = (
+                    "Error opening file and Zarr format cannot be determined"
+                    if format_version is None
+                    else "Error opening file."
                 )
-            elif format_version == "3":
-                # === The Zarr format is V3 ===
-                errors.extend(_ts_validate_zarr3(self.filepath, devel_debug))
-            else:
-                # === A Zarr format should be supported by `zarr.open()` ===
-                if devel_debug:
-                    raise
                 errors.append(
                     ValidationResult(
                         id="zarr.cannot_open",
                         origin=origin_internal_zarr,
                         scope=Scope.FILE,
                         origin_result=e,
                         severity=Severity.ERROR,
-                        message="Error opening file.",
+                        message=message,
                         path=self.filepath,
                     )
                 )
-
-            # code for temporary workaround for Zarr format V3 with tensorstore ends
-
-        except Exception as e:
-            if devel_debug:
-                raise
-            errors.append(
-                ValidationResult(
-                    origin=origin_internal_zarr,
-                    severity=Severity.ERROR,
-                    id="zarr.cannot_open",
-                    scope=Scope.FILE,
-                    origin_result=e,
-                    path=self.filepath,
-                    message="Error opening file.",
-                )
-            )
-        else:
-            if isinstance(data, zarr.Group) and not data:
+                data = None
+            if isinstance(data, zarr.Group) and _is_empty_group(data):
                 errors.append(
                     ValidationResult(
                         origin=ORIGIN_VALIDATION_DANDI_ZARR,
@@ -1048,7 +1041,10 @@ class UploadItem:
 
     @classmethod
     def from_entry(cls, e: LocalZarrEntry, digest: str) -> UploadItem:
-        if e.name in {".zarray", ".zattrs", ".zgroup", ".zmetadata"}:
+        # JSON metadata files. ``.zarray`` / ``.zattrs`` / ``.zgroup`` /
+        # ``.zmetadata`` are the V2 names; ``zarr.json`` is the V3 name (a
+        # single file per group/array containing all metadata).
+        if e.name in {".zarray", ".zattrs", ".zgroup", ".zmetadata", "zarr.json"}:
             try:
                 with e.filepath.open("rb") as fp:
                     json.load(fp)

diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py
@@ -27,7 +27,7 @@
 
 from .fixtures import SampleDandiset, SampleDandisetFactory
 from .skip import mark
-from .test_helpers import assert_dirtrees_eq
+from .test_helpers import TWO_ARRAY_ZARR_LAYOUT, assert_dirtrees_eq, zarr_format_of
 from ..consts import DRAFT, dandiset_metadata_file
 from ..dandiarchive import DandisetURL
 from ..download import (
@@ -460,6 +460,10 @@ def test_download_different_zarr_onto_excluded_dotfiles(
     dd.mkdir()
     zarr_path = dd / "sample.zarr"
     zarr.save(zarr_path, np.eye(5))
+    # `zarr_dandiset` was uploaded with default ``zarr.save`` — its on-disk
+    # layout (and therefore the layout we expect to see after download) is
+    # V2 for zarr-python 2.x, V3 for zarr-python 3.x.
+    layout = TWO_ARRAY_ZARR_LAYOUT[zarr_format_of(zarr_dandiset.dspath / "sample.zarr")]
     (zarr_path / ".git").touch()
     (zarr_path / ".dandi").mkdir()
     (zarr_path / ".dandi" / "somefile.txt").touch()
@@ -472,21 +476,18 @@ def test_download_different_zarr_onto_excluded_dotfiles(
         tmp_path,
         existing=DownloadExisting.OVERWRITE_DIFFERENT,
     )
-    assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == [
+    dotfile_paths = [
         zarr_path / ".dandi",
         zarr_path / ".dandi" / "somefile.txt",
         zarr_path / ".datalad",
         zarr_path / ".git",
         zarr_path / ".gitattributes",
-        zarr_path / ".zgroup",
-        zarr_path / "arr_0",
         zarr_path / "arr_0" / ".gitmodules",
-        zarr_path / "arr_0" / ".zarray",
-        zarr_path / "arr_0" / "0",
-        zarr_path / "arr_1",
-        zarr_path / "arr_1" / ".zarray",
-        zarr_path / "arr_1" / "0",
     ]
+    zarr_paths = [zarr_path / p for p in layout["files_and_dirs"]]
+    assert list_paths(zarr_path, dirs=True, exclude_vcs=False) == sorted(
+        set(dotfile_paths + zarr_paths)
+    )
 
 
 def test_download_different_zarr_delete_dir(
@@ -495,7 +496,16 @@ def test_download_different_zarr_delete_dir(
     d = new_dandiset.dandiset
     dspath = new_dandiset.dspath
     zarr.save(dspath / "sample.zarr", np.eye(5))
-    assert not any(p.is_dir() for p in (dspath / "sample.zarr").iterdir())
+    # The single-array ``np.eye(5)`` Zarr has a flatter layout than the
+    # two-array Zarr below: in V2 no subdirs at all; in V3 only ``c/``
+    # (the chunk container) rather than ``arr_0/`` and ``arr_1/``. This
+    # is the contrast the test exercises — downloading the smaller Zarr
+    # must delete the extra ``arr_*`` subdirs of the bigger one.
+    initial_subdirs = {p.name for p in (dspath / "sample.zarr").iterdir() if p.is_dir()}
+    if zarr_format_of(dspath / "sample.zarr") == "2":
+        assert initial_subdirs == set()
+    else:
+        assert initial_subdirs == {"c"}
     new_dandiset.upload()
     dd = tmp_path / d.identifier
     dd.mkdir(parents=True, exist_ok=True)
@@ -922,29 +932,21 @@ def test_download_multiple_urls(
     # end up using the same `new_dandiset`.  Hence, we need to fall back to
     # using `sample_dandiset_factory` here.
     zarr_dandiset = sample_dandiset_factory.mkdandiset("Sample Zarr Dandiset")
-    zarr.save(
-        zarr_dandiset.dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)
-    )
+    zarr_local = zarr_dandiset.dspath / "sample.zarr"
+    zarr.save(zarr_local, np.arange(1000), np.arange(1000, 0, -1))
+    layout = TWO_ARRAY_ZARR_LAYOUT[zarr_format_of(zarr_local)]
     zarr_dandiset.upload()
 
     download([text_dandiset.dandiset.api_url, zarr_dandiset.dandiset.api_url], tmp_path)
+    zarr_root = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr"
     assert list_paths(tmp_path) == [
         tmp_path / text_dandiset.dandiset_id / "dandiset.yaml",
         tmp_path / text_dandiset.dandiset_id / "file.txt",
         tmp_path / text_dandiset.dandiset_id / "subdir1" / "apple.txt",
         tmp_path / text_dandiset.dandiset_id / "subdir2" / "banana.txt",
         tmp_path / text_dandiset.dandiset_id / "subdir2" / "coconut.txt",
         tmp_path / zarr_dandiset.dandiset_id / "dandiset.yaml",
-        tmp_path
-        / zarr_dandiset.dandiset_id
-        / tmp_path
-        / zarr_dandiset.dandiset_id
-        / "sample.zarr"
-        / ".zgroup",
-        tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_0" / ".zarray",
-        tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_0" / "0",
-        tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_1" / ".zarray",
-        tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" / "arr_1" / "0",
+        *[zarr_root / p for p in layout["files"]],
     ]