Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d732fcb
make healpix importable
edoyango Dec 3, 2025
2b33bb6
add no healpy import test
edoyango Dec 4, 2025
a53a6ff
ensure fallback HEALPix works with args
edoyango Dec 4, 2025
6ad4f7b
test numpy dropvalue
edoyango Dec 4, 2025
a9cc8ad
add missing brackets
edoyango Dec 4, 2025
81ced15
cover numpy filters
edoyango Dec 4, 2025
8d7954c
_check -> filter
edoyango Dec 4, 2025
19e45d0
remove not
edoyango Dec 4, 2025
89b9f2e
add dataarray capability to DropAnyNan
edoyango Dec 4, 2025
8ccbb12
add test for xarray dropanynan filter
edoyango Dec 4, 2025
a9e9ada
add check for invalid types
edoyango Dec 4, 2025
52d6645
add dataarray capability for dropallnan
edoyango Dec 4, 2025
5c796b1
add tests for dropallnan
edoyango Dec 4, 2025
2e49922
add dataset functionality for dropvalue
edoyango Dec 4, 2025
eae59f7
add tests for dropvalue
edoyango Dec 4, 2025
e95439c
add coverage for Shape
edoyango Dec 4, 2025
01d9ade
remove not in DropAnyNan and DropAllNan
edoyango Dec 5, 2025
21b5ac1
check_shape -> filter
edoyango Dec 5, 2025
7813538
fix mismatched tuple length error
edoyango Dec 5, 2025
0b43a9f
add dask filter tests
edoyango Dec 5, 2025
e8113ff
Remove unused import
tennlee Dec 17, 2025
ad2e679
Simplify CI/CD install requirements to bring install under disk space…
tennlee Dec 17, 2025
e37f4e2
Test reduced dependencies
tennlee Dec 17, 2025
08c84df
Test reduced-complexity requirements installation for CI/CD needs
tennlee Dec 17, 2025
147e3c8
Test further reduction of dependencies for CI/CD
tennlee Dec 17, 2025
1c57155
Test tweak
tennlee Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,18 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements_cicd.txt
- name: Test with pytest
run: |
# editable is necessary as pytest will run against the installed
# package rather than the local files creating a coverage report of 0%
pip install -e packages/utils
pip install -e packages/data[all]
pip install -e packages/training[all]
pip install -e packages/pipeline[all]
pip install -e packages/data
pip install -e packages/training
pip install -e packages/pipeline
pip install -e packages/zoo
pip install -e packages/bundled_models/fourcastnext
pip install -e packages/tutorial
pip install -e .[test,docs]
pip install -e .[test]

pytest -m="not noci" --cov=packages/data --cov=packages/utils --cov=packages/pipeline --cov=packages/training --cov=packages/zoo --cov=packages/bundled_models/fourcastnext --ignore=packages/nci_site_archive
- name: Coveralls GitHub Action
Expand Down
2 changes: 1 addition & 1 deletion packages/data/tests/transform/test_derive.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pytest
import math

from numpy import nan, isnan
from numpy import isnan
from pyearthtools.data.transforms.derive import evaluate, EquationException


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def filter(self, sample: da.Array):
(bool):
If sample contains nan's
"""
if not bool(da.array(list(da.isnan(sample))).any()):
if da.array(list(da.isnan(sample))).any():
raise PipelineFilterException(sample, "Data contained nan's.")


Expand All @@ -85,7 +85,7 @@ def filter(self, sample: da.Array):
(bool):
If sample contains nan's
"""
if not bool(da.array(list(da.isnan(sample))).all()):
if da.array(list(da.isnan(sample))).all():
raise PipelineFilterException(sample, "Data contained all nan's.")


Expand Down Expand Up @@ -164,9 +164,9 @@ def _find_shape(self, data: Union[tuple[da.Array, ...], da.Array]) -> tuple[Unio
return tuple(map(self._find_shape, data))
return data.shape

def check_shape(self, sample: Union[tuple[da.Array, ...], da.Array]):
def filter(self, sample: Union[tuple[da.Array, ...], da.Array]):
if isinstance(sample, (list, tuple)):
if not isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample):
if not (isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample)):
raise RuntimeError(
f"If sample is tuple, shape must also be, and of the same length. {self._shape} != {tuple(self._find_shape(i) for i in sample)}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,9 @@ def _find_shape(self, data: Union[tuple[np.ndarray, ...], np.ndarray]) -> tuple[
return tuple(map(self._find_shape, data))
return data.shape

def check_shape(self, sample: Union[tuple[np.ndarray, ...], np.ndarray]):
def filter(self, sample: Union[tuple[np.ndarray, ...], np.ndarray]):
if isinstance(sample, (list, tuple)):
if not isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample):
if not (isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample)):
raise RuntimeError(
f"If sample is tuple, shape must also be, and of the same length. {self._shape} != {tuple(self._find_shape(i) for i in sample)}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import numpy as np
import xarray as xr

import warnings
import math

from pyearthtools.pipeline.filters import Filter, PipelineFilterException
Expand Down Expand Up @@ -58,7 +58,7 @@ def __init__(self, variables: Optional[list] = None) -> None:

self.variables = variables

def _check(self, sample: xr.Dataset):
def filter(self, sample: xr.Dataset):
"""Check if any of the sample is nan

Args:
Expand All @@ -68,10 +68,21 @@ def _check(self, sample: xr.Dataset):
(bool):
If sample contains nan's
"""

if self.variables:
sample = sample[self.variables]
if isinstance(sample, xr.DataArray):
warnings.warn("input sample is xr.DataArray - ignoring filter variables.")
else:
sample = sample[self.variables]

if isinstance(sample, xr.DataArray):
has_nan = np.isnan(sample).any()
elif isinstance(sample, xr.Dataset):
has_nan = np.array(list(np.isnan(sample).values())).any()
else:
raise TypeError("This filter only accepts xr.DataArray or xr.Dataset")

if not bool(np.array(list(np.isnan(sample).values())).any()):
if has_nan:
raise PipelineFilterException(sample, "Data contained nan's.")


Expand All @@ -95,7 +106,7 @@ def __init__(self, variables: Optional[list] = None) -> None:

self.variables = variables

def _check(self, sample: xr.Dataset):
def filter(self, sample: xr.Dataset):
"""Check if all of the sample is nan

Args:
Expand All @@ -106,9 +117,19 @@ def _check(self, sample: xr.Dataset):
If sample contains nan's
"""
if self.variables:
sample = sample[self.variables]
if isinstance(sample, xr.DataArray):
warnings.warn("input sample is xr.DataArray - ignoring filter variables.")
else:
sample = sample[self.variables]

if isinstance(sample, xr.DataArray):
all_nan = np.isnan(sample).all()
elif isinstance(sample, xr.Dataset):
all_nan = np.array(list(np.isnan(sample).values())).all()
else:
raise TypeError("This filter only accepts xr.DataArray or xr.Dataset")

if not bool(np.array(list(np.isnan(sample).values())).all()):
if all_nan:
raise PipelineFilterException(sample, "Data contained all nan's.")


Expand Down Expand Up @@ -147,16 +168,24 @@ def filter(self, sample: T):
(bool):
If sample contains nan's
"""
if np.isnan(self._value):
function = ( # noqa
lambda x: ((np.count_nonzero(np.isnan(x)) / math.prod(x.shape)) * 100) >= self._percentage
) # noqa
if isinstance(sample, xr.DataArray):
if np.isnan(self._value):
drop = ((np.count_nonzero(np.isnan(sample)) / math.prod(sample.shape)) * 100) >= self._percentage
else:
drop = ((np.count_nonzero(sample == self._value) / math.prod(sample.shape)) * 100) >= self._percentage
elif isinstance(sample, xr.Dataset):
if np.isnan(self._value):
nmatches = np.sum(list(np.isnan(sample).sum().values()))
nvalues = np.sum([math.prod(v.shape) for v in sample.values()])
drop = nmatches / nvalues * 100 >= self._percentage
else:
nmatches = np.sum(list((sample == 1).sum().values()))
nvalues = np.sum([math.prod(v.shape) for v in sample.values()])
drop = nmatches / nvalues * 100 >= self._percentage
else:
function = ( # noqa
lambda x: ((np.count_nonzero(x == self._value) / math.prod(x.shape)) * 100) >= self._percentage
) # noqa
raise TypeError("This filter only accepts xr.DataArray or xr.Dataset")

if not function(sample):
if not drop:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the doc string for DropValue:

Filter to drop data containing more than a given percentage of a value.

all the DropValue filters calculate whether % of elemns matching value >= the percentage, and then not's them here, which does the opposite of what the doc string says. @tennlee should I follow what the doc string says or leave the logic as is?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The top-level filter class says "filter should sairse a PipelineFilterException if invalid. So I think the not is unwanted? The test for me is that a pipelinefilterexception should be raised if data with a high proportion of nans (or whatever query value) is supplied.

raise PipelineFilterException(sample, f"Data contained more than {self._percentage}% of {self._value}.")


Expand Down Expand Up @@ -198,7 +227,7 @@ def _find_shape(self, data: T) -> tuple[int, ...]:

def filter(self, sample: Union[tuple[T, ...], T]):
if isinstance(sample, (list, tuple)):
if not isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample):
if not (isinstance(self._shape, (list, tuple)) and len(self._shape) == len(sample)):
raise RuntimeError(
f"If sample is tuple, shape must also be, and of the same length. {self._shape} != {tuple(self._find_shape(i) for i in sample)}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
except ImportError:

class HEALPix:
def __init__(self):
def __init__(self, *args, **kwargs):
warnings.warn(
"Could not import the healpix projection, please install the 'healpy' and 'reproject' optional dependencies"
)
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from typing import Type, TypeVar
import xarray as xr

from pyearthtools.pipeline import Operation
from pyearthtools.pipeline.operation import Operation

XR_TYPE = TypeVar("XR_TYPE", xr.Dataset, xr.DataArray)

Expand Down
111 changes: 111 additions & 0 deletions packages/pipeline/tests/operations/dask/test_dask_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright Commonwealth of Australia, Bureau of Meteorology 2025.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyearthtools.pipeline.operations.dask import filters
from pyearthtools.pipeline.exceptions import PipelineFilterException

import numpy as np
import dask.array as da
import pytest


def test_DropAnyNan():
"""Tests DropAnyNan dask filter."""

original = da.ones((2, 2))

# no nans - should succeed quietly
drop = filters.DropAnyNan()
drop.filter(original)

# one nan - should raise exception
original[0, 0] = np.nan
drop = filters.DropAnyNan()
with pytest.raises(PipelineFilterException):
drop.filter(original)


# xfailed since the result seems to be inverted to documented requirements
@pytest.mark.xfail
def test_DropAllNan():
"""Tests DropAllNan dask filter."""

original = da.empty((2, 2))

# no nans - should succeed quietly
drop = filters.DropAllNan()
drop.filter(original)

# one nan - should succeed quietly
original[0, 0] = np.nan
drop.filter(original)

# all nans - should raise exception
original[:, :] = np.nan
with pytest.raises(PipelineFilterException):
drop.filter(original)


def test_DropValue():
"""Tests DropValue dask filter."""

original = da.from_array([[0, 0], [1, 2]])

# drop case (num zeros < threshold)
drop = filters.DropValue(0, 75)
with pytest.raises(PipelineFilterException):
drop.filter(original)

# non-drop case (num zeros >= threshold)
drop = filters.DropValue(0, 50)
drop.filter(original)

# drop case (num nans < threshold)
original = da.from_array([[np.nan, np.nan], [1, 2]])
drop = filters.DropValue("nan", 75)
with pytest.raises(PipelineFilterException):
drop.filter(original)

# non-drop case (num nans >= threshold)
drop = filters.DropValue("nan", 50)
drop.filter(original)


def test_Shape():
"""Tests Shape dask filter."""

originals = (da.empty((2, 2)), da.empty((2, 3)))

# check drop case
drop = filters.Shape((2, 3))
with pytest.raises(PipelineFilterException):
drop.filter(originals[0])

# check non-drop case
drop = filters.Shape((2, 2))
drop.filter(originals[0])

# check tuple inputs drop cases
drop = filters.Shape(((2, 3), (2, 3)))
with pytest.raises(PipelineFilterException):
drop.filter(originals)

# check tuple inputs non-drop cases
drop = filters.Shape(((2, 2), (2, 3)))
drop.filter(originals)

# invalid mismatched shape and input
drop = filters.Shape(((2, 2),))
with pytest.raises(RuntimeError):
drop.filter(originals)
59 changes: 57 additions & 2 deletions packages/pipeline/tests/operations/numpy/test_numpy_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_DropAnyNan_true():
drop = filters.DropAnyNan()

with pytest.raises(PipelineFilterException):
result = drop.filter(original)
drop.filter(original)


def test_DropAllNan_false():
Expand All @@ -54,4 +54,59 @@ def test_DropAllNan_true():
drop = filters.DropAllNan()

with pytest.raises(PipelineFilterException):
result = drop.filter(original)
drop.filter(original)


def test_DropValue():

# test drop case
original = np.array([[1, 1], [np.nan, np.nan]])

drop = filters.DropValue(value=1, percentage=75)

with pytest.raises(PipelineFilterException):
drop.filter(original)

# test no drop case
drop = filters.DropValue(value=1, percentage=50)
drop.filter(original)

# test with nan - drop case
drop = filters.DropValue(value="nan", percentage=75)

with pytest.raises(PipelineFilterException):
drop.filter(original)

# no drop case
drop = filters.DropValue(value="nan", percentage=50)
drop.filter(original)


def test_Shape():

# test drop case
original = np.empty((2, 3))
drop = filters.Shape((2, 2))

with pytest.raises(PipelineFilterException):
drop.filter(original)

# test non-drop case
original = np.empty((2, 2))
drop.filter(original)

# test with multiple inputs
originals = (np.empty((2, 3)), np.empty((2, 2)))
drop = filters.Shape(((2, 2), (2, 3)))

with pytest.raises(PipelineFilterException):
drop.filter(originals)

# test non drop case
drop = filters.Shape(((2, 3), (2, 2)))
drop.filter(originals)

# test mismatched number of input shapes
drop = filters.Shape(((1, 2), (3, 4), (5, 6)))
with pytest.raises(RuntimeError):
drop.filter(originals)
Loading