Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api/esmvalcore.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ Submodules
esmvalcore.io.intake_esgf
esmvalcore.io.local
esmvalcore.io.protocol
esmvalcore.io.xcube
5 changes: 5 additions & 0 deletions doc/api/esmvalcore.io.xcube.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
esmvalcore.io.xcube
===================

.. automodule:: esmvalcore.io.xcube
:no-inherited-members:
4 changes: 3 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ dependencies:
- scipy >=1.6
- shapely >=2.0.0
- xarray
- xcube
- xcube-cci
- yamale
- zarr >3
- zarr >2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

zarr3 is perfectly able to read zarr2 datasets, bud

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xcube requires zarr==2

Copy link
Contributor

@valeriupredoi valeriupredoi Dec 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well that's a bummer - that means it can't read Zarr3 spec?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also Zarr2 is borderline archaic - good luck to us trying to maintain such an evironment

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it's on the TODO list: xcube-dev/xcube#1182

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good, otherwise without a pixi env this would be diabolically hard to maintain 😁

# Python packages needed for building docs
- autodocsumm >=0.2.2
- ipython
Expand Down
3 changes: 3 additions & 0 deletions esmvalcore/cmor/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
def _update_cmor_facets(facets):
"""Update `facets` with information from CMOR table."""
project = facets["project"]
if project == "external":
facets["original_short_name"] = facets["short_name"]
return
mip = facets["mip"]
short_name = facets["short_name"]
derive = facets.get("derive", False)
Expand Down
9 changes: 9 additions & 0 deletions esmvalcore/config/configurations/data-xcube-ccizarr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Read data from the ESA Climate Data Centre (ESA CCI) using xcube.
# More information available at
# https://xcube.readthedocs.io/en/latest/dataaccess.html#esa-climate-data-centre-esa-cci-cciodp-ccizarr-esa-cci-kc.
projects:
external:
data:
ccizarr:
type: "esmvalcore.io.xcube.XCubeDataSource"
data_store_id: "ccizarr"
60 changes: 32 additions & 28 deletions esmvalcore/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,40 +840,44 @@ def _load(self) -> Cube:
)

settings: dict[str, dict[str, Any]] = {}
settings["fix_file"] = {
"output_dir": fix_dir_prefix,
"add_unique_suffix": True,
"session": self.session,
**self.facets,
}
if self.facets["project"] != "external":
settings["fix_file"] = {
"output_dir": fix_dir_prefix,
"add_unique_suffix": True,
"session": self.session,
**self.facets,
}
settings["load"] = {}
settings["fix_metadata"] = {
"session": self.session,
**self.facets,
}
if self.facets["project"] != "external":
settings["fix_metadata"] = {
"session": self.session,
**self.facets,
}
settings["concatenate"] = {"check_level": self.session["check_level"]}
settings["cmor_check_metadata"] = {
"check_level": self.session["check_level"],
"cmor_table": self.facets["project"],
"mip": self.facets["mip"],
"frequency": self.facets["frequency"],
"short_name": self.facets["short_name"],
}
if self.facets["project"] != "external":
settings["cmor_check_metadata"] = {
"check_level": self.session["check_level"],
"cmor_table": self.facets["project"],
"mip": self.facets["mip"],
"frequency": self.facets["frequency"],
"short_name": self.facets["short_name"],
}
if "timerange" in self.facets:
settings["clip_timerange"] = {
"timerange": self.facets["timerange"],
}
settings["fix_data"] = {
"session": self.session,
**self.facets,
}
settings["cmor_check_data"] = {
"check_level": self.session["check_level"],
"cmor_table": self.facets["project"],
"mip": self.facets["mip"],
"frequency": self.facets["frequency"],
"short_name": self.facets["short_name"],
}
if self.facets["project"] != "external":
settings["fix_data"] = {
"session": self.session,
**self.facets,
}
settings["cmor_check_data"] = {
"check_level": self.session["check_level"],
"cmor_table": self.facets["project"],
"mip": self.facets["mip"],
"frequency": self.facets["frequency"],
"short_name": self.facets["short_name"],
}

result: Sequence[PreprocessorItem] = self.files
for step, kwargs in settings.items():
Expand Down
224 changes: 224 additions & 0 deletions esmvalcore/io/xcube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""Access data using `xcube <https://xcube.readthedocs.io>`_.

Run the command ``esmvaltool config copy data-xcube-ccizarr.yml`` to update
your :ref:`configuration <config-data-sources>` to use this module. This will
create a file with the following content in your configuration directory:

.. literalinclude:: ../configurations/data-xcube-ccizarr.yml
:language: yaml
:caption: Contents of ``data-xcube-ccizarr.yml``

"""

from __future__ import annotations

import copy
import fnmatch
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import iris.cube
import iris.std_names
import xcube.core.store

import esmvalcore.io.protocol
from esmvalcore.iris_helpers import dataset_to_iris

if TYPE_CHECKING:
from esmvalcore.typing import Facets, FacetValue


@dataclass
class XCubeDataset(esmvalcore.io.protocol.DataElement):
"""A dataset that can be used to load data found using xcube_."""

name: str
"""A unique name identifying the data."""

facets: Facets = field(repr=False)
"""Facets are key-value pairs that were used to find this data."""

store: xcube.core.store.store.DataStore = field(repr=False)
"""The store containing the data."""

open_params: dict[str, Any] = field(default_factory=dict, repr=False)
"""Parameters to use when opening the data."""

_attributes: dict[str, Any] | None = field(
init=False,
repr=False,
default=None,
)

def __hash__(self) -> int:
"""Return a number uniquely representing the data element."""
return hash((self.name, self.facets.get("version")))

def prepare(self) -> None:
"""Prepare the data for access."""
self.store.preload_data(self.name)

@property
def attributes(self) -> dict[str, Any]:
"""Attributes are key-value pairs describing the data."""
if self._attributes is None:
msg = (
"Attributes have not been read yet. Call the `to_iris` method "
"first to read the attributes from the file."
)
raise ValueError(msg)
return self._attributes

@attributes.setter
def attributes(self, value: dict[str, Any]) -> None:
self._attributes = value

def to_iris(self) -> iris.cube.CubeList:
"""Load the data as Iris cubes.

Returns
-------
:
The loaded data.
"""
dataset = self.store.open_data(self.name, **self.open_params)
# Keep only variables matching the "short_name" facet.
short_names = self.facets.get("short_name", [])
if isinstance(short_names, str | int):
short_names = [str(short_names)]
if short_names:
dataset = dataset[short_names]

# Drop invalid standard_names.
# TODO: move this to a standalone fixes package.
for data_var in dataset.data_vars.values():
if (
"standard_name" in data_var.attrs
and data_var.attrs["standard_name"]
not in iris.std_names.STD_NAMES
):
data_var.attrs.pop("standard_name")

# Cache the attributes.
self.attributes = copy.deepcopy(dataset.attrs)
return dataset_to_iris(dataset)


@dataclass
class XCubeDataSource(esmvalcore.io.protocol.DataSource):
"""Data source for finding files on a local filesystem."""

name: str
"""A name identifying the data source."""

project: str
"""The project that the data source provides data for."""

priority: int
"""The priority of the data source. Lower values have priority."""

debug_info: str = field(init=False, repr=False, default="")
"""A string containing debug information when no data is found."""

data_store_id: str
"""Name of the data store.

A list of available data stores can be found in the `xcube documentation
<https://xcube.readthedocs.io/en/latest/dataaccess.html#available-data-stores>`__.
"""

data_store_params: dict[str, Any] = field(default_factory=dict, repr=False)
"""Parameters to use when creating the data store."""

open_params: dict[str, Any] = field(default_factory=dict, repr=False)
"""Parameters to use when opening the data."""

def find_data(self, **facets: FacetValue) -> list[XCubeDataset]: # noqa: C901
# TODO: fix complexity
"""Find data.

Parameters
----------
**facets :
Find data matching these facets.

Returns
-------
:
A list of data elements that have been found.
"""
store = xcube.core.store.new_data_store(
self.data_store_id,
**self.data_store_params,
)
result = []
requested_short_names = facets.get("short_name", "*")
if isinstance(requested_short_names, str | int):
requested_short_names = [str(requested_short_names)]
requested_datasets = facets.get("dataset", "*")
if isinstance(requested_datasets, str | int):
requested_datasets = [str(requested_datasets)]
available_datasets = store.list_data_ids()
for data_id in available_datasets:
for dataset_pattern in requested_datasets:
if fnmatch.fnmatchcase(data_id, dataset_pattern):
description = store.describe_data(data_id)
available_short_names = list(description.data_vars)
short_names = [
short_name
for short_name in available_short_names
for short_name_pattern in requested_short_names
if fnmatch.fnmatchcase(short_name, short_name_pattern)
]
# TODO: Maybe this is too complicated and we should only
# decide which variables to keep/drop after load and conversion
# to iris cube.
open_params = copy.deepcopy(self.open_params)
open_params_schema = store.get_open_data_params_schema()
if "variable_names" in open_params_schema.properties:
open_params["variable_names"] = short_names
elif "drop_variables" in open_params_schema.properties:
drop_variables = {
short_name
for short_name in available_short_names
if short_name not in short_names
}
for coord in description.coords.values():
if bound_var := coord.attrs.get("bounds"):
drop_variables.remove(bound_var)
for data_var in description.data_vars.values():
# TODO: keep cell measures
for ancillary_var in data_var.attrs.get(
"ancillary_variables",
"",
).split():
drop_variables.remove(ancillary_var)
open_params["drop_variables"] = sorted(drop_variables)
timerange = f"{description.time_range[0]}/{description.time_range[1]}".replace(
"-",
"",
)
frequencies = {
"P1M": "mon",
}
frequency = frequencies[
description.attrs["time_coverage_resolution"]
]
dataset = XCubeDataset(
name=data_id,
facets={
"dataset": data_id,
"short_name": short_names
if len(short_names) > 1
else short_names[0],
"frequency": frequency,
"timerange": timerange,
},
store=store,
open_params=open_params,
)
dataset.attributes = description.attrs

result.append(dataset)

return result
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ dependencies = [
"stratify>=0.3",
"xarray",
"yamale",
"zarr>3",
"zarr>2",
]
description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
license = {text = "Apache License, Version 2.0"}
Expand Down