Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ jobs:
- uses: actions/checkout@v6.0.2
with:
submodules: recursive
- uses: prefix-dev/setup-pixi@v0.9.5
- uses: prefix-dev/setup-pixi@v0.9.6
with:
pixi-version: v0.39.5
pixi-version: latest
- name: Run pre-commit
run: pixi run -e dev pre-commit run --all-files

Expand All @@ -41,9 +41,9 @@ jobs:
submodules: recursive

- name: Setup pixi
uses: prefix-dev/setup-pixi@v0.9.5
uses: prefix-dev/setup-pixi@v0.9.6
with:
pixi-version: v0.39.5
pixi-version: latest
environments: test

- name: Create default config file
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/full-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ jobs:
submodules: recursive

- name: Setup pixi
uses: prefix-dev/setup-pixi@b92f01076d41831104bdb4786cfe1f0a5462a1ce # v0.9.0
uses: prefix-dev/setup-pixi@v0.9.6
with:
pixi-version: v0.39.5
pixi-version: latest

- name: Create default config file
run: |
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ quote-style = "double" # Use double quotes
"test_mop.py" = ["I001"]

[tool.pixi.workspace]
requires-pixi = ">=0.49"
channels = ["conda-forge", { channel = "accessnri", exclude-newer = "0d" }]
exclude-newer = "7d"
platforms = ["linux-64"]
Expand Down
16 changes: 16 additions & 0 deletions src/access_moppy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
FrequencyMismatchError,
IncompatibleFrequencyError,
ResamplingRequiredWarning,
normalize_cf_time_units,
type_mapping,
validate_and_resample_if_needed,
validate_cmip6_frequency_compatibility,
Expand Down Expand Up @@ -801,6 +802,21 @@ def write(self):

Automatically handles character/string coordinates with proper NetCDF encoding.
"""
# ========== Normalize CF Time-Coordinate Units ==========
# Source models differ in how they write the reference datetime: the UM
# atmosphere and CABLE land models omit the seconds field (e.g.
# "days since 0001-01-01 00:00"), which fails the WCRP units pattern
# check, whereas the ocean/sea-ice models write the full HH:MM:SS form.
# Re-emit any CF time units in canonical form here — the sole write path
# for every realm — so all variables are normalized uniformly. This is
# meaning-preserving, so the numeric time values are unaffected.
for var in self.ds.variables:
units = self.ds[var].attrs.get("units")
normalized = normalize_cf_time_units(units)
if normalized != units:
self.ds[var].attrs["units"] = normalized
logger.debug("Normalized '%s' units: %r -> %r", var, units, normalized)

# ========== Prepare String Coordinates ==========
# Detect and prepare all string/character coordinates before writing
string_coords_info = self._prepare_string_coordinates()
Expand Down
46 changes: 44 additions & 2 deletions src/access_moppy/derivations/calc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from importlib.resources import as_file

import cftime
import numpy as np
import pandas as pd
import xarray as xr

from access_moppy.utilities import get_bundled_resource_path
Expand Down Expand Up @@ -160,6 +162,36 @@ def rename_coord(var1, var2, ndim, override=False):
return var2, override


def _monthly_midpoint_coord(time_da: xr.DataArray) -> xr.DataArray:
"""Relabel a resampled monthly time coordinate to each month's midpoint.

``resample(..., "ME")`` labels every monthly bin at the month-end
timestamp. CF/CMIP6 require the time coordinate of a time-mean quantity
to be the midpoint of its cell bounds, which for monthly data is the
centre of the ``[month start, next month start)`` interval. This recentres
each label so ``time`` matches ``midpoint(time_bnds)``. Handles both cftime
and datetime64 coordinates.
"""
values = time_da.values
is_cftime = isinstance(values.flat[0], cftime.datetime)
midpoints = np.empty(len(values), dtype=object)
for i, t in enumerate(values):
if is_cftime:
start = cftime.datetime(t.year, t.month, 1, calendar=t.calendar)
if t.month == 12:
nxt = cftime.datetime(t.year + 1, 1, 1, calendar=t.calendar)
else:
nxt = cftime.datetime(t.year, t.month + 1, 1, calendar=t.calendar)
else:
ts = pd.Timestamp(t)
start = pd.Timestamp(year=ts.year, month=ts.month, day=1)
nxt = start + pd.offsets.MonthBegin(1)
midpoints[i] = start + (nxt - start) / 2
if not is_cftime:
midpoints = pd.DatetimeIndex(midpoints).values
return time_da.copy(data=midpoints)


def calculate_monthly_minimum(
da: xr.DataArray, time_dim: str = "time", preserve_attrs: bool = True
) -> xr.DataArray:
Expand Down Expand Up @@ -202,7 +234,7 @@ def calculate_monthly_minimum(
- Input data should have temporal frequency higher than monthly (daily, 3hr, 6hr, etc.)
- The function uses xarray's resample method with 'M' frequency (end of month)
- Cell methods attribute is updated to reflect the temporal aggregation
- Time coordinate is preserved with monthly timestamps
- Time coordinate is set to each month's midpoint (centre of time_bnds)
"""
if time_dim not in da.dims:
raise ValueError(
Expand All @@ -225,6 +257,11 @@ def calculate_monthly_minimum(

try:
monthly_min = da.resample({time_dim: "ME"}).min(keep_attrs=preserve_attrs)
# "ME" labels each bin at month-end; recentre to the cell midpoint
# so the time coordinate matches midpoint(time_bnds) (CF/CMIP6).
monthly_min = monthly_min.assign_coords(
{time_dim: _monthly_midpoint_coord(monthly_min[time_dim])}
)

if preserve_attrs:
# Update cell_methods to reflect the temporal aggregation
Expand Down Expand Up @@ -284,7 +321,7 @@ def calculate_monthly_maximum(
- Input data should have temporal frequency higher than monthly (daily, 3hr, 6hr, etc.)
- The function uses xarray's resample method with 'M' frequency (end of month)
- Cell methods attribute is updated to reflect the temporal aggregation
- Time coordinate is preserved with monthly timestamps
- Time coordinate is set to each month's midpoint (centre of time_bnds)
"""
if time_dim not in da.dims:
raise ValueError(
Expand All @@ -307,6 +344,11 @@ def calculate_monthly_maximum(

try:
monthly_max = da.resample({time_dim: "ME"}).max(keep_attrs=preserve_attrs)
# "ME" labels each bin at month-end; recentre to the cell midpoint
# so the time coordinate matches midpoint(time_bnds) (CF/CMIP6).
monthly_max = monthly_max.assign_coords(
{time_dim: _monthly_midpoint_coord(monthly_max[time_dim])}
)

if preserve_attrs:
# Update cell_methods to reflect the temporal aggregation
Expand Down
42 changes: 42 additions & 0 deletions src/access_moppy/utilities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib.metadata as importlib_metadata
import json
import logging
import re
import warnings
from datetime import timedelta
from importlib.resources import files
Expand Down Expand Up @@ -2054,6 +2055,47 @@ def validate_and_resample_if_needed(
return ds_resampled, True


# CF time-coordinate units take the form "<interval> since <reference datetime>".
# The WCRP ATTR004 check requires the reference datetime to be either a bare date
# or a full "YYYY-MM-DD HH:MM:SS"; partial forms such as
# "days since 0001-01-01 00:00" (no seconds — written by the UM atmosphere and
# CABLE land models) fail the check. This pattern captures the components so the
# reference instant can be re-emitted in canonical form.
_CF_TIME_UNITS_RE = re.compile(
r"^(?P<interval>\w+)\s+since\s+"
r"(?P<year>\d{1,4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})"
r"(?:[ T](?P<hour>\d{1,2}):(?P<minute>\d{1,2})(?::(?P<second>\d{1,2}(?:\.\d+)?))?)?"
r"\s*$",
re.IGNORECASE,
)


def normalize_cf_time_units(units: Optional[str]) -> Optional[str]:
"""Return *units* with its reference datetime padded to canonical CF form.

Converts partial or date-only reference datetimes to a full
``"<interval> since YYYY-MM-DD HH:MM:SS"`` string, e.g.
``"days since 0001-01-01 00:00"`` -> ``"days since 0001-01-01 00:00:00"``.

The reference *instant* is preserved (an absent or partial time-of-day is
midnight either way), so any numeric time values measured against these units
remain correct. Strings that are not recognisable CF time units — including
those carrying a timezone offset — are returned unchanged.
"""
if not isinstance(units, str):
return units
match = _CF_TIME_UNITS_RE.match(units.strip())
if match is None:
return units
g = match.groupdict()
return (
f"{g['interval']} since "
f"{int(g['year']):04d}-{int(g['month']):02d}-{int(g['day']):02d} "
f"{int(g['hour'] or 0):02d}:{int(g['minute'] or 0):02d}:"
f"{int(float(g['second'] or 0)):02d}"
)


def calculate_time_bounds(
ds: xr.Dataset, time_coord: str = "time", bnds_name: str = "nv"
) -> xr.DataArray:
Expand Down
66 changes: 66 additions & 0 deletions tests/unit/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,72 @@ def test_write_preserves_data_values(self, cmoriser_with_dataset, temp_dir):
finally:
ds_out.close()

@pytest.mark.unit
def test_write_normalizes_partial_time_units(
self, mock_vocab, mock_mapping, temp_dir
):
"""write() pads a partial CF time-units string to the full canonical form.

Regression test for the WCRP ATTR004 failure: UM atmosphere/land source
files write "days since YYYY-MM-DD 00:00" (no seconds), which must be
re-emitted as "...00:00:00". Non-time units must be left untouched, and
the numeric time values must not change (meaning-preserving).
"""
original_time = np.arange(3).astype(float)
ds = xr.Dataset(
{
"tas": (
["time", "lat", "lon"],
np.random.rand(3, 4, 5).astype(np.float32),
{"_FillValue": 1e20, "units": "K"},
),
},
coords={
"time": (
"time",
original_time,
{"units": "days since 2000-01-01 00:00", "calendar": "standard"},
),
"lat": ("lat", np.arange(4)),
"lon": ("lon", np.arange(5)),
},
attrs={
"variable_id": "tas",
"table_id": "Amon",
"source_id": "ACCESS-ESM1-5",
"experiment_id": "historical",
"variant_label": "r1i1p1f1",
"grid_label": "gn",
},
)
cmoriser = CMORiser(
input_paths=["test.nc"],
output_path=str(temp_dir),
vocab=mock_vocab,
variable_mapping=mock_mapping,
compound_name="Amon.tas",
)
cmoriser.ds = ds
cmoriser.cmor_name = "tas"

with patch("access_moppy.base.psutil.virtual_memory") as mock_mem:
mock_mem.return_value = MagicMock(
total=32 * 1024**3, available=16 * 1024**3
)
cmoriser.write()

output_files = list(Path(temp_dir).glob("*.nc"))
ds_out = xr.open_dataset(output_files[0], decode_cf=False)
try:
# time units padded to full HH:MM:SS (the normalized branch)
assert ds_out["time"].attrs["units"] == "days since 2000-01-01 00:00:00"
# non-time units left untouched (the unchanged branch)
assert ds_out["tas"].attrs["units"] == "K"
# numeric time values preserved
np.testing.assert_array_equal(ds_out["time"].values, original_time)
finally:
ds_out.close()

@pytest.mark.unit
def test_write_bounds_attribute_handling(self, cmoriser_with_dataset, temp_dir):
"""
Expand Down
83 changes: 83 additions & 0 deletions tests/unit/test_derivations_calc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,89 @@ def test_resample_failure_raises_runtime_error(self):
calculate_monthly_maximum(da)


# ---------------------------------------------------------------------------
# Monthly time-coordinate midpoint (CF/CMIP6 "time squareness", TIME001)
# ---------------------------------------------------------------------------


class TestMonthlyTimeCoordinateMidpoint:
"""The monthly time coordinate must sit at the midpoint of its cell bounds.

Regression for tasmax/tasmin being stamped at month-end instead of the
midpoint of [month start, next month start) after resampling daily data.
"""

@pytest.mark.unit
@pytest.mark.parametrize(
"func", [calculate_monthly_minimum, calculate_monthly_maximum]
)
def test_time_coord_is_month_midpoint(self, func):
# Daily data for Jan-Mar 1850, matching the failing ACCESS-ESM files.
times = xr.date_range(
"1850-01-01", "1850-03-31", freq="D", calendar="proleptic_gregorian"
)
da = xr.DataArray(
np.random.default_rng(0).random(len(times)),
dims=["time"],
coords={"time": times},
name="tas",
)
result = func(da)
got = result["time"].values.astype("datetime64[s]")
expected = np.array(
["1850-01-16T12:00:00", "1850-02-15T00:00:00", "1850-03-16T12:00:00"],
dtype="datetime64[s]",
)
assert (got == expected).all()

@pytest.mark.unit
def test_time_equals_midpoint_of_time_bnds(self):
from access_moppy.utilities import calculate_time_bounds

times = xr.date_range(
"1850-01-01", "1850-03-31", freq="D", calendar="proleptic_gregorian"
)
da = xr.DataArray(
np.random.default_rng(0).random(len(times)),
dims=["time"],
coords={"time": times},
name="tasmax",
)
result = calculate_monthly_maximum(da)
ds = result.to_dataset(name="tasmax")
ds["time"].attrs.update(
{"units": "days since 1970-01-01 00:00", "calendar": "proleptic_gregorian"}
)
tb = calculate_time_bounds(ds, time_coord="time", bnds_name="bnds")

epoch = np.datetime64("1970-01-01T00:00:00")
day = np.timedelta64(1, "D")
tnum = (result["time"].values - epoch) / day
midpoint = ((tb.values - epoch) / day).mean(axis=1)
assert np.allclose(tnum, midpoint)
# First stored value is the one flagged by the compliance checker.
assert tnum[0] == pytest.approx(-43813.5)

@pytest.mark.unit
def test_cftime_calendar_midpoint(self):
import cftime

times = xr.date_range(
"1850-01-01", periods=90, freq="D", calendar="360_day", use_cftime=True
)
da = xr.DataArray(
np.random.default_rng(1).random(len(times)),
dims=["time"],
coords={"time": times},
name="tasmin",
)
result = calculate_monthly_minimum(da)
first = result["time"].values[0]
assert isinstance(first, cftime.datetime)
# 360_day month is 30 days -> midpoint is day 16 at 00:00.
assert (first.month, first.day, first.hour) == (1, 16, 0)


# ---------------------------------------------------------------------------
# Helpers for load_ressource_data
# ---------------------------------------------------------------------------
Expand Down
Loading