ESMValGroup · bouweandela · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025
diff --git a/doc/api/esmvalcore.io.rst b/doc/api/esmvalcore.io.rst
@@ -21,3 +21,4 @@ Submodules
    esmvalcore.io.intake_esgf
    esmvalcore.io.local
    esmvalcore.io.protocol
+   esmvalcore.io.xcube
diff --git a/doc/api/esmvalcore.io.xcube.rst b/doc/api/esmvalcore.io.xcube.rst
@@ -0,0 +1,5 @@
+esmvalcore.io.xcube
+===================
+
+.. automodule:: esmvalcore.io.xcube
+    :no-inherited-members:
diff --git a/environment.yml b/environment.yml
@@ -48,8 +48,10 @@ dependencies:
   - scipy >=1.6
   - shapely >=2.0.0
   - xarray
+  - xcube
+  - xcube-cci
   - yamale
-  - zarr >3
+  - zarr >2
   # Python packages needed for building docs
   - autodocsumm >=0.2.2
   - ipython

diff --git a/esmvalcore/cmor/table.py b/esmvalcore/cmor/table.py
@@ -40,6 +40,9 @@
 def _update_cmor_facets(facets):
     """Update `facets` with information from CMOR table."""
     project = facets["project"]
+    if project == "external":
+        facets["original_short_name"] = facets["short_name"]
+        return
     mip = facets["mip"]
     short_name = facets["short_name"]
     derive = facets.get("derive", False)

diff --git a/esmvalcore/config/configurations/data-xcube-ccizarr.yml b/esmvalcore/config/configurations/data-xcube-ccizarr.yml
@@ -0,0 +1,9 @@
+# Read data from the ESA Climate Data Centre (ESA CCI) using xcube.
+# More information available at
+# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc.
+projects:
+  external:
+    data:
+      ccizarr:
+        type: "esmvalcore.io.xcube.XCubeDataSource"
+        data_store_id: "ccizarr"
diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
@@ -840,40 +840,44 @@ def _load(self) -> Cube:
         )
 
         settings: dict[str, dict[str, Any]] = {}
-        settings["fix_file"] = {
-            "output_dir": fix_dir_prefix,
-            "add_unique_suffix": True,
-            "session": self.session,
-            **self.facets,
-        }
+        if self.facets["project"] != "external":
+            settings["fix_file"] = {
+                "output_dir": fix_dir_prefix,
+                "add_unique_suffix": True,
+                "session": self.session,
+                **self.facets,
+            }
         settings["load"] = {}
-        settings["fix_metadata"] = {
-            "session": self.session,
-            **self.facets,
-        }
+        if self.facets["project"] != "external":
+            settings["fix_metadata"] = {
+                "session": self.session,
+                **self.facets,
+            }
         settings["concatenate"] = {"check_level": self.session["check_level"]}
-        settings["cmor_check_metadata"] = {
-            "check_level": self.session["check_level"],
-            "cmor_table": self.facets["project"],
-            "mip": self.facets["mip"],
-            "frequency": self.facets["frequency"],
-            "short_name": self.facets["short_name"],
-        }
+        if self.facets["project"] != "external":
+            settings["cmor_check_metadata"] = {
+                "check_level": self.session["check_level"],
+                "cmor_table": self.facets["project"],
+                "mip": self.facets["mip"],
+                "frequency": self.facets["frequency"],
+                "short_name": self.facets["short_name"],
+            }
         if "timerange" in self.facets:
             settings["clip_timerange"] = {
                 "timerange": self.facets["timerange"],
             }
-        settings["fix_data"] = {
-            "session": self.session,
-            **self.facets,
-        }
-        settings["cmor_check_data"] = {
-            "check_level": self.session["check_level"],
-            "cmor_table": self.facets["project"],
-            "mip": self.facets["mip"],
-            "frequency": self.facets["frequency"],
-            "short_name": self.facets["short_name"],
-        }
+        if self.facets["project"] != "external":
+            settings["fix_data"] = {
+                "session": self.session,
+                **self.facets,
+            }
+            settings["cmor_check_data"] = {
+                "check_level": self.session["check_level"],
+                "cmor_table": self.facets["project"],
+                "mip": self.facets["mip"],
+                "frequency": self.facets["frequency"],
+                "short_name": self.facets["short_name"],
+            }
 
         result: Sequence[PreprocessorItem] = self.files
         for step, kwargs in settings.items():

diff --git a/esmvalcore/io/xcube.py b/esmvalcore/io/xcube.py
@@ -0,0 +1,224 @@
+"""Access data using `xcube <https://xcube.readthedocs.io>`_.
+
+Run the command ``esmvaltool config copy data-xcube-ccizarr.yml`` to update
+your :ref:`configuration <config-data-sources>` to use this module. This will
+create a file with the following content in your configuration directory:
+
+.. literalinclude:: ../configurations/data-xcube-ccizarr.yml
+   :language: yaml
+   :caption: Contents of ``data-xcube-ccizarr.yml``
+
+"""
+
+from __future__ import annotations
+
+import copy
+import fnmatch
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+import iris.cube
+import iris.std_names
+import xcube.core.store
+
+import esmvalcore.io.protocol
+from esmvalcore.iris_helpers import dataset_to_iris
+
+if TYPE_CHECKING:
+    from esmvalcore.typing import Facets, FacetValue
+
+
+@dataclass
+class XCubeDataset(esmvalcore.io.protocol.DataElement):
+    """A dataset that can be used to load data found using xcube_."""
+
+    name: str
+    """A unique name identifying the data."""
+
+    facets: Facets = field(repr=False)
+    """Facets are key-value pairs that were used to find this data."""
+
+    store: xcube.core.store.store.DataStore = field(repr=False)
+    """The store containing the data."""
+
+    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
+    """Parameters to use when opening the data."""
+
+    _attributes: dict[str, Any] | None = field(
+        init=False,
+        repr=False,
+        default=None,
+    )
+
+    def __hash__(self) -> int:
+        """Return a number uniquely representing the data element."""
+        return hash((self.name, self.facets.get("version")))
+
+    def prepare(self) -> None:
+        """Prepare the data for access."""
+        self.store.preload_data(self.name)
+
+    @property
+    def attributes(self) -> dict[str, Any]:
+        """Attributes are key-value pairs describing the data."""
+        if self._attributes is None:
+            msg = (
+                "Attributes have not been read yet. Call the `to_iris` method "
+                "first to read the attributes from the file."
+            )
+            raise ValueError(msg)
+        return self._attributes
+
+    @attributes.setter
+    def attributes(self, value: dict[str, Any]) -> None:
+        self._attributes = value
+
+    def to_iris(self) -> iris.cube.CubeList:
+        """Load the data as Iris cubes.
+
+        Returns
+        -------
+        :
+            The loaded data.
+        """
+        dataset = self.store.open_data(self.name, **self.open_params)
+        # Keep only variables matching the "short_name" facet.
+        short_names = self.facets.get("short_name", [])
+        if isinstance(short_names, str | int):
+            short_names = [str(short_names)]
+        if short_names:
+            dataset = dataset[short_names]
+
+        # Drop invalid standard_names.
+        # TODO: move this to a standalone fixes package.
+        for data_var in dataset.data_vars.values():
+            if (
+                "standard_name" in data_var.attrs
+                and data_var.attrs["standard_name"]
+                not in iris.std_names.STD_NAMES
+            ):
+                data_var.attrs.pop("standard_name")
+
+        # Cache the attributes.
+        self.attributes = copy.deepcopy(dataset.attrs)
+        return dataset_to_iris(dataset)
+
+
+@dataclass
+class XCubeDataSource(esmvalcore.io.protocol.DataSource):
+    """Data source for finding files on a local filesystem."""
+
+    name: str
+    """A name identifying the data source."""
+
+    project: str
+    """The project that the data source provides data for."""
+
+    priority: int
+    """The priority of the data source. Lower values have priority."""
+
+    debug_info: str = field(init=False, repr=False, default="")
+    """A string containing debug information when no data is found."""
+
+    data_store_id: str
+    """Name of the data store.
+
+    A list of available data stores can be found in the `xcube documentation
+    <https://xcube.readthedocs.io/en/latest/dataaccess.html#available-data-stores>`__.
+    """
+
+    data_store_params: dict[str, Any] = field(default_factory=dict, repr=False)
+    """Parameters to use when creating the data store."""
+
+    open_params: dict[str, Any] = field(default_factory=dict, repr=False)
+    """Parameters to use when opening the data."""
+
+    def find_data(self, **facets: FacetValue) -> list[XCubeDataset]:  # noqa: C901
+        # TODO: fix complexity
+        """Find data.
+
+        Parameters
+        ----------
+        **facets :
+            Find data matching these facets.
+
+        Returns
+        -------
+        :
+            A list of data elements that have been found.
+        """
+        store = xcube.core.store.new_data_store(
+            self.data_store_id,
+            **self.data_store_params,
+        )
+        result = []
+        requested_short_names = facets.get("short_name", "*")
+        if isinstance(requested_short_names, str | int):
+            requested_short_names = [str(requested_short_names)]
+        requested_datasets = facets.get("dataset", "*")
+        if isinstance(requested_datasets, str | int):
+            requested_datasets = [str(requested_datasets)]
+        available_datasets = store.list_data_ids()
+        for data_id in available_datasets:
+            for dataset_pattern in requested_datasets:
+                if fnmatch.fnmatchcase(data_id, dataset_pattern):
+                    description = store.describe_data(data_id)
+                    available_short_names = list(description.data_vars)
+                    short_names = [
+                        short_name
+                        for short_name in available_short_names
+                        for short_name_pattern in requested_short_names
+                        if fnmatch.fnmatchcase(short_name, short_name_pattern)
+                    ]
+                    # TODO: Maybe this is too complicated and we should only
+                    # decide which variables to keep/drop after load and conversion
+                    # to iris cube.
+                    open_params = copy.deepcopy(self.open_params)
+                    open_params_schema = store.get_open_data_params_schema()
+                    if "variable_names" in open_params_schema.properties:
+                        open_params["variable_names"] = short_names
+                    elif "drop_variables" in open_params_schema.properties:
+                        drop_variables = {
+                            short_name
+                            for short_name in available_short_names
+                            if short_name not in short_names
+                        }
+                        for coord in description.coords.values():
+                            if bound_var := coord.attrs.get("bounds"):
+                                drop_variables.remove(bound_var)
+                        for data_var in description.data_vars.values():
+                            # TODO: keep cell measures
+                            for ancillary_var in data_var.attrs.get(
+                                "ancillary_variables",
+                                "",
+                            ).split():
+                                drop_variables.remove(ancillary_var)
+                        open_params["drop_variables"] = sorted(drop_variables)
+                    timerange = f"{description.time_range[0]}/{description.time_range[1]}".replace(
+                        "-",
+                        "",
+                    )
+                    frequencies = {
+                        "P1M": "mon",
+                    }
+                    frequency = frequencies[
+                        description.attrs["time_coverage_resolution"]
+                    ]
+                    dataset = XCubeDataset(
+                        name=data_id,
+                        facets={
+                            "dataset": data_id,
+                            "short_name": short_names
+                            if len(short_names) > 1
+                            else short_names[0],
+                            "frequency": frequency,
+                            "timerange": timerange,
+                        },
+                        store=store,
+                        open_params=open_params,
+                    )
+                    dataset.attributes = description.attrs
+
+                    result.append(dataset)
+
+        return result
diff --git a/pyproject.toml b/pyproject.toml
@@ -69,7 +69,7 @@ dependencies = [
     "stratify>=0.3",
     "xarray",
     "yamale",
-    "zarr>3",
+    "zarr>2",
 ]
 description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
 license = {text = "Apache License, Version 2.0"}