Skip to content

Commit d7c28a0

Browse files
jason-raitzawilfox
andauthored
AP-687 Only download files if local file is older (#13)
* AP-687 - given a modified datetime (`meta_mtime`) from the metadata string for the requested file, we now check if the file exists locally and has a newer mtime Co-authored-by: Anna Wilcox <AWilcox@Wilcox-Tech.com>
1 parent 5c27a31 commit d7c28a0

4 files changed

Lines changed: 94 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.2.5]
9+
10+
### Added
11+
- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API.
12+
`fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target
13+
location and has a modified time that is newer than the requested file.
14+
- Added tests for this new behavior.
15+
16+
### Changed
17+
- Raises a file not downloaded error if `tind_download()` fails to return a written file path.
18+
19+
### Fixed
20+
- Found a bug in the `fetch_file()` method where the "backup" filename parsing logic would always return
21+
`download` as the filename.
22+
23+
824
## [0.2.4]
925

1026
### Added

tests/test_fetch.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import json
66
import xml.etree.ElementTree as E
77

8+
from datetime import datetime, timezone
89
from pathlib import Path
10+
from unittest.mock import MagicMock, patch
911

1012
import pytest
1113
import requests_mock as req_mock # noqa: F401 — activates the requests_mock fixture
@@ -87,6 +89,59 @@ def test_fetch_file_not_found(
8789
client.fetch_file(url, output_dir=str(tmp_path))
8890

8991

92+
def test_fetch_file_skips_download_when_local_file_is_newer(
93+
tmp_path: Path,
94+
client: TINDClient,
95+
) -> None:
96+
"""fetch_file returns the cached path without downloading when local mtime >= meta_mtime."""
97+
98+
url = f"{BASE_URL}/api/v1/record/12345/files/some-image.jpg/download/?version=1"
99+
cached = tmp_path / "some-image.jpg"
100+
cached.write_bytes(b"cached content")
101+
102+
meta_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
103+
local_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) # newer
104+
105+
mock_stat = MagicMock()
106+
mock_stat.st_mtime = local_mtime.timestamp()
107+
108+
with patch.object(Path, "stat", return_value=mock_stat):
109+
result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime)
110+
111+
assert result == str(cached)
112+
113+
114+
def test_fetch_file_redownloads_when_local_file_is_older(
115+
requests_mock: req_mock.Mocker,
116+
tmp_path: Path,
117+
client: TINDClient,
118+
) -> None:
119+
"""fetch_file re-downloads when local mtime < meta_mtime."""
120+
121+
url = f"{BASE_URL}/api/v1/record/12345/files/some-other-image.jpg/download/?version=1"
122+
cached = tmp_path / "some-other-image.jpg"
123+
cached.write_bytes(b"stale content")
124+
125+
meta_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc)
126+
local_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) # older
127+
128+
mock_stat = MagicMock()
129+
mock_stat.st_mtime = local_mtime.timestamp()
130+
131+
requests_mock.get(
132+
url,
133+
content=b"fresh content",
134+
status_code=200,
135+
headers={"Content-Disposition": 'attachment; filename="some-other-image.jpg"'},
136+
)
137+
138+
with patch.object(Path, "stat", return_value=mock_stat):
139+
result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime)
140+
141+
assert result.endswith("some-other-image.jpg")
142+
assert Path(result).read_bytes() == b"fresh content"
143+
144+
90145
# ---------------------------------------------------------------------------
91146
# fetch_file_metadata
92147
# ---------------------------------------------------------------------------

tind_client/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def tind_download(url: str, output_dir: str, api_key: str) -> Tuple[int, str]:
8787
return status, ""
8888

8989
# Fall-back to the file name in the URL if it isn't included in the response.
90-
output_filename = url.rstrip("/").split("/")[-2]
90+
output_filename = url.split("?")[0].rstrip("/").split("/")[-2]
9191

9292
# See if we can extract the filename from the response headers.
9393
if "Content-Disposition" in resp.headers:

tind_client/client.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
"""
44

55
import json
6+
import logging
67
import os
78
import re
9+
from datetime import datetime, timezone
810
from io import StringIO
911
from pathlib import Path
1012
from typing import Any, Iterator
@@ -16,6 +18,7 @@
1618
from .api import tind_get, tind_download
1719
from .errors import RecordNotFoundError, TINDError
1820

21+
logger = logging.getLogger(__name__)
1922

2023
NS = "http://www.loc.gov/MARC21/slim"
2124
E.register_namespace("", NS)
@@ -69,12 +72,19 @@ def fetch_metadata(self, record: str) -> Record:
6972

7073
return records[0]
7174

72-
def fetch_file(self, file_url: str, output_dir: str = "") -> str:
75+
def fetch_file(
76+
self, file_url: str, output_dir: str = "", meta_mtime: datetime | None = None
77+
) -> str:
7378
"""Download a file from TIND and save it locally.
7479
80+
If the file already exists in the output directory and was modified at or after a supplied
81+
``meta_mtime`` timestamp, the file will not be re-downloaded.
82+
7583
:param str file_url: The TIND file download URL.
7684
:param str output_dir: Directory in which to save the file.
7785
Falls back to ``default_storage_dir`` when empty.
86+
:param datetime meta_mtime: Optional modified timestamp from the file metadata returned by
87+
TIND. If not specified, the file will always be downloaded.
7888
:raises AuthorizationError: When the TIND API key is invalid or the file is restricted.
7989
:raises ValueError: When ``file_url`` is not a valid TIND file download URL.
8090
:raises RecordNotFoundError: When the file is invalid or not found.
@@ -84,9 +94,19 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str:
8494
raise ValueError("URL is not a valid TIND file download URL.")
8595

8696
output_target = output_dir or self.default_storage_dir
97+
98+
expected_filename = file_url.split("?")[0].rstrip("/").split("/")[-2]
99+
expected_path = Path(output_target) / expected_filename
100+
101+
if meta_mtime and expected_path.exists():
102+
local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc)
103+
if local_mtime >= meta_mtime:
104+
logger.debug("Cached file at (%s) is newer; skipping download.", expected_path)
105+
return str(expected_path)
106+
87107
(status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key)
88108

89-
if status != 200:
109+
if status != 200 or not saved_to:
90110
raise RecordNotFoundError("Referenced file could not be downloaded.")
91111

92112
return saved_to

0 commit comments

Comments
 (0)