eWaterCycle · MarkMelotto · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 9, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -139,6 +139,15 @@ files = ["src"]
 target-version = "py310"
 extend-include = ["*.ipynb"]
 
+# Ignore shapefiles. NOTE(review): `exclude` replaces ruff's default excludes; consider `extend-exclude`.
+exclude = [
+    "shapefiles",
+    "*.shp",
+    "*.shx",
+    "*.dbf",
+    "*.prj"
+]
+
 [tool.ruff.lint]
 select = [
     "A",

diff --git a/src/ewatercycle/_forcings/caravan.py b/src/ewatercycle/_forcings/caravan.py
@@ -1,3 +1,4 @@
+import os
 import shutil
 import zipfile
 from pathlib import Path
@@ -12,7 +13,7 @@
 from ewatercycle.util import get_time
 
 COMMON_URL = "ca13056c-c347-4a27-b320-930c2a4dd207"
-OPENDAP_URL = f"https://opendap.4tu.nl/thredds/dodsC/data2/djht/{COMMON_URL}/1/"
+OPENDAP_URL = f"https://opendap.4tu.nl/thredds/dodsC/data2/djht/{COMMON_URL}/2/"
 SHAPEFILE_URL = (
     f"https://data.4tu.nl/file/{COMMON_URL}/bbe94526-cf1a-4b96-8155-244f20094719"
 )
@@ -106,7 +107,12 @@ class CaravanForcing(DefaultForcing):
 
     @classmethod
     def get_dataset(cls: type["CaravanForcing"], dataset: str) -> xr.Dataset:
-        """Opens specified dataset from data.4tu.nl OPeNDAP server.
+        """Opens dataset from data.4tu.nl OPeNDAP server, or cache if available.
+
+        By default, the dataset is opened from the data.4tu.nl OPeNDAP server.
+        This can be overridden by setting the environment variable CARAVAN_CACHE
+        to the directory containing the netCDF files (one ``<dataset>.nc`` each).
+
 
         Args:
             dataset (str): name of dataset, choose from:
@@ -118,6 +124,11 @@ def get_dataset(cls: type["CaravanForcing"], dataset: str) -> xr.Dataset:
                 'hysets',
                 'lamah'
         """
+        cache_dir = os.environ.get("CARAVAN_CACHE")
+        # Check if we want to load from 4TU or dCache
+        if cache_dir:
+            cache_dir = cache_dir.rstrip("/")  # ensure no trailing slash issues
+            return xr.open_dataset(Path(cache_dir) / f"{dataset}.nc")
         return xr.open_dataset(f"{OPENDAP_URL}{dataset}.nc")
 
     @classmethod
@@ -246,7 +257,18 @@ def generate(  # type: ignore[override]
 
 
 def get_shapefiles(directory: Path, basin_id: str) -> Path:
-    """Retrieve shapefiles from data 4TU.nl ."""
+    """Retrieve shapefiles from data 4TU.nl or cache."""
+    cache_dir = os.environ.get("CARAVAN_CACHE")
+    # Check if we want to load from 4TU or dCache
+    if cache_dir:
+        shape_path = directory / f"{basin_id}.shp"
+        combined_shapefile_path = Path(cache_dir) / "shapefiles" / "combined.shp"
+
+        if not shape_path.is_file():
+            extract_basin_shapefile(basin_id, combined_shapefile_path, shape_path)
+
+        return shape_path
+
     zip_path = directory / "shapefiles.zip"
     output_path = directory / "shapefiles"
     shape_path = directory / f"{basin_id}.shp"

diff --git a/tests/src/base/forcing_files/README.md b/tests/src/base/forcing_files/README.md
@@ -4,6 +4,6 @@ The data only includes a year of forcing for one catchment.
 
 For own use, please download from the original source and cite correctly. The Caravan dataset itself is also a combination of data from separate sources.
 
-The Carvan dataset is originanly obtained from https://zenodo.org/records/7944025 and is explained in a paper by Kratzert, F. :'Caravan - A global community dataset for large-sample hydrology' found here: https://doi-org.tudelft.idm.oclc.org/10.1038/s41597-023-01975-w
+The Caravan dataset was originally obtained from https://zenodo.org/records/7944025 and is explained in a paper by Kratzert, F.: 'Caravan - A global community dataset for large-sample hydrology', found here: https://doi.org/10.1038/s41597-023-01975-w
 
 Distributed under Creative Commons Attribution 4.0 International.
diff --git a/tests/src/base/forcing_files/camels_03439000.cpg b/tests/src/base/forcing_files/camels_03439000.cpg
@@ -1 +1 @@
-ISO-8859-1
+ISO-8859-1
diff --git a/tests/src/base/forcing_files/camels_03439000.prj b/tests/src/base/forcing_files/camels_03439000.prj
@@ -1 +1 @@
-GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
diff --git a/tests/src/base/forcing_files/camels_03439000.shx b/tests/src/base/forcing_files/camels_03439000.shx
diff --git a/tests/src/base/forcing_files/shapefiles/combined.cpg b/tests/src/base/forcing_files/shapefiles/combined.cpg
@@ -0,0 +1 @@
+UTF-8
diff --git a/tests/src/base/forcing_files/shapefiles/combined.dbf b/tests/src/base/forcing_files/shapefiles/combined.dbf
diff --git a/tests/src/base/forcing_files/shapefiles/combined.prj b/tests/src/base/forcing_files/shapefiles/combined.prj
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
diff --git a/tests/src/base/forcing_files/shapefiles/combined.shp b/tests/src/base/forcing_files/shapefiles/combined.shp
diff --git a/tests/src/base/forcing_files/shapefiles/combined.shx b/tests/src/base/forcing_files/shapefiles/combined.shx
diff --git a/tests/src/base/test_forcing.py b/tests/src/base/test_forcing.py
@@ -427,3 +427,41 @@ def test_extract_basin_shapefile(tmp_path: Path):
 
     assert len(records) == 1
     assert records[0].attributes["gauge_id"] == basin_id
+
+
+def test_get_dataset_using_cache(tmp_path, monkeypatch):
+    # Prepare cache directory
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+
+    basin_id = "camels_01022500"
+    # Use the existing fake Caravan dataset
+    test_files_dir = Path(__file__).parent / "forcing_files"
+    test_file = test_files_dir / "test_caravan_file.nc"
+    cache_target = cache_dir / "camels.nc"
+    cache_target.write_bytes(test_file.read_bytes())
+
+    # Copy shapefiles into the cache so Fiona can find them
+    shapefiles_dir = test_files_dir / "shapefiles"
+    cache_shapefiles_dir = cache_dir / "shapefiles"
+    copytree(shapefiles_dir, cache_shapefiles_dir)
+
+    # Point CARAVAN_CACHE to this directory
+    monkeypatch.setenv("CARAVAN_CACHE", str(cache_dir))
+
+    # Copy other forcing files to tmp_camels_dir
+    tmp_camels_dir = tmp_path / "camels"
+    copytree(test_files_dir, tmp_camels_dir)
+
+    # Call the method
+    ds = CaravanForcing.generate(
+        start_time="1981-01-01T00:00:00Z",
+        end_time="1981-03-01T00:00:00Z",
+        directory=str(tmp_camels_dir),
+        basin_id=basin_id,
+    ).to_xarray()
+
+    # Assert that the generated forcing contains the expected variables
+    content = list(ds.data_vars.keys())
+    expected = ["Q", "evspsblpot", "pr", "tas", "tasmax", "tasmin"]
+    assert content == expected
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
		GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]