Commit 292d2fa

feat: python 3.11 migration
1 parent c56da44, commit 292d2fa

11 files changed: +126 -145 lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 WORKDIR /data
 
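The base-image bump pairs with the README change below, which raises the documented requirement to Python 3.11+. As a sketch (not part of this commit), a fail-fast guard makes the same floor explicit at runtime:

```python
# Sketch only, not part of this commit: fail fast when the interpreter is
# older than the 3.11 floor this migration establishes.
import sys

if sys.version_info < (3, 11):
    raise RuntimeError(f"databusclient requires Python 3.11+, got {sys.version}")
```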

README.md

Lines changed: 23 additions & 4 deletions

@@ -22,6 +22,7 @@ Command-line and Python client for downloading and deploying datasets on DBpedia
 - [Deploy](#module-deploy)
 - [Contributing](#contributing)
 - [Linting](#linting)
+- [Testing](#testing)
 
 
 ## Quickstart
@@ -32,7 +33,7 @@ You can use either **Python** or **Docker**. Both methods support all client fea
 
 ### Python
 
-Requirements: [Python](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/)
+Requirements: [Python 3.11+](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/)
 
 Before using the client, install it via pip:
 
@@ -186,8 +187,8 @@ Options:
                             e.g. https://databus.dbpedia.org/sparql)
   --vault-token TEXT        Path to Vault refresh token file
   --databus-key TEXT        Databus API key to download from protected databus
-  --latest-only             When downloading artifacts, only download the latest
-                            version
+  --all-versions            When downloading artifacts, download all versions
+                            instead of only the latest
   --authurl TEXT            Keycloak token endpoint URL [default:
                             https://auth.dbpedia.org/realms/dbpedia/protocol/openid-
                             connect/token]
@@ -571,12 +572,30 @@ The used linter is [Ruff](https://ruff.rs/). Ruff is configured in `pyproject.to
 
 For development, you can run linting locally with `ruff check . ` and optionally auto-format with `ruff format .`.
 
-To ensuere compatibility with the `pyproject.toml` configured dependencies, run Ruff via Poetry:
+To ensure compatibility with the `pyproject.toml` configured dependencies, run Ruff via Poetry:
 
 ```bash
 # To check for linting issues:
 poetry run ruff check .
 
 # To auto-format code:
 poetry run ruff format .
+```
+
+### Testing
+
+When developing new features please make sure to add appropriate tests and ensure that all tests pass. Tests are under `tests/` and use [pytest](https://docs.pytest.org/en/7.4.x/) as test framework.
+
+When fixing bugs or refactoring existing code, please make sure to add tests that cover the affected functionality. The current test coverage is very low, so any additional tests are highly appreciated.
+
+To run tests locally, use:
+
+```bash
+pytest tests/
+```
+
+Or to ensure compatibility with the `pyproject.toml` configured dependencies, run pytest via Poetry:
+
+```bash
+poetry run pytest tests/
 ```
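To make the new Testing section concrete, here is a minimal pytest sketch for the URI parser this commit renames (see `databusclient/api/utils.py` below). The file name is hypothetical, and the None-padding for missing trailing parts is inferred from how `delete.py` and `download.py` use the return value, not stated in this diff:

```python
# tests/test_utils_parts.py -- hypothetical file name, sketch only.
# Assumes get_databus_id_parts_from_file_url pads missing trailing parts with
# None, which the group/version handling in delete.py and download.py relies on.
from databusclient.api.utils import get_databus_id_parts_from_file_url


def test_artifact_uri_has_no_version_or_file():
    host, account, group, artifact, version, file = (
        get_databus_id_parts_from_file_url(
            "https://databus.dbpedia.org/alice/mygroup/myartifact"
        )
    )
    assert host == "databus.dbpedia.org"
    assert (account, group, artifact) == ("alice", "mygroup", "myartifact")
    assert version is None
    assert file is None
```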

databusclient/api/delete.py

Lines changed: 7 additions & 4 deletions

@@ -3,7 +3,10 @@
 
 import requests
 
-from databusclient.api.utils import fetch_databus_jsonld, get_databus_id_parts_from_uri
+from databusclient.api.utils import (
+    fetch_databus_jsonld,
+    get_databus_id_parts_from_file_url,
+)
 
 
 def _confirm_delete(databusURI: str) -> str:
@@ -161,7 +164,7 @@ def _delete_group(
         uri = item.get("@id")
         if not uri:
             continue
-        _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri)
+        _, _, _, _, version, _ = get_databus_id_parts_from_file_url(uri)
        if version is None:
            artifact_uris.append(uri)
 
@@ -188,8 +191,8 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool)
     """
 
     for databusURI in databusURIs:
-        _host, _account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            databusURI
+        _host, _account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(databusURI)
         )
 
         if group == "collections" and artifact is not None:

databusclient/api/deploy.py

Lines changed: 29 additions & 10 deletions

@@ -5,7 +5,7 @@
 
 import requests
 
-__debug = False
+_debug = False
 
 
 class DeployError(Exception):
@@ -36,6 +36,11 @@ def _get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]:
 
     cvs = {}
     for kv in cv_str.split("_"):
+        if "=" not in kv:
+            raise BadArgumentException(
+                f"Invalid content variant format: '{kv}'. Expected 'key=value' format."
+            )
+
         key, value = kv.split("=")
         cvs[key] = value
 
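Without this guard, a segment lacking `=` made `key, value = kv.split("=")` raise a bare ValueError; the new check reports the offending segment via the client's own BadArgumentException. A standalone sketch of the rule (the sample string is illustrative, not from this commit):

```python
# Standalone sketch of the content-variant rule hardened above;
# "lang=en_type=parsed" is an illustrative string, not from this commit.
def parse_content_variants(cv_str: str) -> dict[str, str]:
    cvs: dict[str, str] = {}
    for kv in cv_str.split("_"):
        if "=" not in kv:
            # without this guard, the unpacking below raised a bare ValueError
            raise ValueError(f"Invalid content variant format: '{kv}'.")
        key, value = kv.split("=")
        cvs[key] = value
    return cvs


print(parse_content_variants("lang=en_type=parsed"))  # {'lang': 'en', 'type': 'parsed'}
```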

@@ -141,8 +146,8 @@ def _get_file_stats(distribution_str: str) -> Tuple[Optional[str], Optional[int]
 
 
 def _load_file_stats(url: str) -> Tuple[str, int]:
-    resp = requests.get(url)
-    if resp.status_code > 400:
+    resp = requests.get(url, timeout=30)
+    if resp.status_code >= 400:
         raise requests.exceptions.RequestException(response=resp)
 
     sha256sum = hashlib.sha256(bytes(resp.content)).hexdigest()
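Two fixes in one hunk: the request gains a timeout, and the status guard moves from `> 400` to `>= 400`. The comparison matters for exactly one code, 400 Bad Request, which previously slipped through and had its error body hashed as if it were file content:

```python
# The single status code affected by the comparison fix above.
status = 400
print(status > 400)   # False -- old guard: no exception, error body got hashed
print(status >= 400)  # True  -- new guard: RequestException is raised
```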
@@ -156,7 +161,7 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
 
     # content_variant_part = "_".join([f"{key}={value}" for key, value in cvs.items()])
 
-    if __debug:
+    if _debug:
         print("DEBUG", distribution_str, extension_part)
 
     sha256sum, content_length = _get_file_stats(distribution_str)
@@ -306,7 +311,13 @@ def create_dataset(
     """
 
     _versionId = str(version_id).strip("/")
-    _, account_name, group_name, artifact_name, version = _versionId.rsplit("/", 4)
+    parts = _versionId.rsplit("/", 4)
+    if len(parts) < 5:
+        raise BadArgumentException(
+            f"Invalid version_id format: '{version_id}'. "
+            f"Expected format: <BASE>/<ACCOUNT>/<GROUP>/<ARTIFACT>/<VERSION>"
+        )
+    _, _account_name, _group_name, _artifact_name, version = parts
 
     # could be build from stuff above,
     # was not sure if there are edge cases BASE=http://databus.example.org/"base"/...
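Why `len(parts) < 5` is the right guard: `str.rsplit("/", 4)` returns five parts only when the version ID really contains BASE/ACCOUNT/GROUP/ARTIFACT/VERSION segments. A worked example with illustrative IDs:

```python
# Worked example for the rsplit guard above; the IDs are illustrative.
good = "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01"
print(good.rsplit("/", 4))
# ['https://databus.dbpedia.org', 'alice', 'mygroup', 'myartifact', '2024.01.01']

bad = "https://databus.dbpedia.org/alice"
print(bad.rsplit("/", 4))
# ['https:', '', 'databus.dbpedia.org', 'alice'] -- only four parts, so the
# old tuple unpacking died with a bare ValueError; the guard now raises a
# descriptive BadArgumentException instead.
```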
@@ -428,22 +439,30 @@ def deploy(
 
     headers = {"X-API-KEY": f"{api_key}", "Content-Type": "application/json"}
     data = json.dumps(dataid)
-    base = "/".join(dataid["@graph"][0]["@id"].split("/")[0:3])
+
+    try:
+        base = "/".join(dataid["@graph"][0]["@id"].split("/")[0:3])
+    except (KeyError, IndexError, TypeError) as e:
+        raise DeployError(f"Invalid dataid structure: {e}")
+
     api_uri = (
         base
         + f"/api/publish?verify-parts={str(verify_parts).lower()}&log-level={log_level.name}"
     )
-    resp = requests.post(api_uri, data=data, headers=headers)
+    resp = requests.post(api_uri, data=data, headers=headers, timeout=30)
 
-    if debug or __debug:
-        dataset_uri = dataid["@graph"][0]["@id"]
+    if debug or _debug:
+        try:
+            dataset_uri = dataid["@graph"][0]["@id"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise DeployError(f"Invalid dataid structure: {e}")
         print(f"Trying submitting data to {dataset_uri}:")
         print(data)
 
     if resp.status_code != 200:
         raise DeployError(f"Could not deploy dataset to databus. Reason: '{resp.text}'")
 
-    if debug or __debug:
+    if debug or _debug:
         print("---------")
         print(resp.text)
 
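How the `base` extraction wrapped in try/except above works: splitting an absolute URI on `/` puts the scheme in part 0, an empty string in part 1 (from the `//`), and the host in part 2, so rejoining the first three pieces recovers `scheme://host`. The `@id` value here is illustrative:

```python
# Sketch of the base-URL derivation guarded above; the @id is illustrative.
dataset_id = "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01"
print(dataset_id.split("/")[0:3])            # ['https:', '', 'databus.dbpedia.org']
print("/".join(dataset_id.split("/")[0:3]))  # https://databus.dbpedia.org
```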

databusclient/api/download.py

Lines changed: 25 additions & 15 deletions

@@ -6,7 +6,10 @@
 from SPARQLWrapper import JSON, SPARQLWrapper
 from tqdm import tqdm
 
-from databusclient.api.utils import fetch_databus_jsonld, get_databus_id_parts_from_uri
+from databusclient.api.utils import (
+    fetch_databus_jsonld,
+    get_databus_id_parts_from_file_url,
+)
 
 
 def _download_file(
@@ -32,8 +35,8 @@ def _download_file(
     2. If server responds with WWW-Authenticate: Bearer, 401 Unauthorized), then fetch Vault access token and retry with Authorization header.
     """
     if localDir is None:
-        _host, account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            url
+        _host, account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(url)
         )
         localDir = os.path.join(
             os.getcwd(),
@@ -51,7 +54,7 @@ def _download_file(
     if dirpath:
         os.makedirs(dirpath, exist_ok=True)  # Create the necessary directories
     # --- 1. Get redirect URL by requesting HEAD ---
-    response = requests.head(url, stream=True)
+    response = requests.head(url, stream=True, timeout=30)
     # Check for redirect and update URL if necessary
     if response.headers.get("Location") and response.status_code in [
         301,
@@ -112,8 +115,8 @@ def _download_file(
     progress_bar.close()
 
     # TODO: could be a problem of github raw / openflaas
-    # if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
-    #     raise IOError("Downloaded size does not match Content-Length header")
+    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+        raise IOError("Downloaded size does not match Content-Length header")
 
 
 def _download_files(
@@ -161,7 +164,9 @@ def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) ->
     if databus_key is not None:
         headers["X-API-KEY"] = databus_key
 
-    return requests.get(uri, headers=headers, timeout=30).text
+    response = requests.get(uri, headers=headers, timeout=30)
+    response.raise_for_status()
+    return response.text
 
 
 def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict:
@@ -247,6 +252,7 @@ def __get_vault_access__(
             "grant_type": "refresh_token",
             "refresh_token": refresh_token,
         },
+        timeout=30,
     )
     resp.raise_for_status()
     access_token = resp.json()["access_token"]
@@ -270,6 +276,7 @@ def __get_vault_access__(
             "subject_token": access_token,
             "audience": audience,
         },
+        timeout=30,
     )
     resp.raise_for_status()
     vault_token = resp.json()["access_token"]
@@ -488,7 +495,7 @@ def _get_databus_artifacts_of_group(json_str: str) -> List[str]:
         uri = item.get("@id")
         if not uri:
             continue
-        _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri)
+        _, _, _, _, version, _ = get_databus_id_parts_from_file_url(uri)
         if version is None:
             result.append(uri)
     return result
@@ -519,22 +526,25 @@ def download(
     - client_id: Client ID for token exchange. Default is "vault-token-exchange".
     """
     for databusURI in databusURIs:
-        host, account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            databusURI
+        host, account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(databusURI)
         )
 
+        # Determine endpoint per-URI if not explicitly provided
+        uri_endpoint = endpoint
+
         # dataID or databus collection
         if databusURI.startswith("http://") or databusURI.startswith("https://"):
             # Auto-detect sparql endpoint from host if not given
-            if endpoint is None:
-                endpoint = f"https://{host}/sparql"
+            if uri_endpoint is None:
+                uri_endpoint = f"https://{host}/sparql"
             print(f"SPARQL endpoint {endpoint}")
 
             if group == "collections" and artifact is not None:
                 print(f"Downloading collection: {databusURI}")
                 _download_collection(
                     databusURI,
-                    endpoint,
+                    uri_endpoint,
                     localDir,
                     token,
                     databus_key,
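The new `uri_endpoint` local fixes a cross-iteration state leak: previously, the endpoint auto-detected from the first URI's host was written back into `endpoint` and silently reused for every subsequent URI, even those on other hosts. A minimal sketch of the corrected pattern (hosts are illustrative):

```python
# Minimal sketch of the per-URI endpoint fix above; the hosts are illustrative.
def pick_endpoints(hosts: list[str], endpoint: str | None = None) -> list[str]:
    chosen = []
    for host in hosts:
        uri_endpoint = endpoint  # reset per URI instead of mutating `endpoint`
        if uri_endpoint is None:
            uri_endpoint = f"https://{host}/sparql"
        chosen.append(uri_endpoint)
    return chosen


print(pick_endpoints(["databus.dbpedia.org", "databus.example.org"]))
# ['https://databus.dbpedia.org/sparql', 'https://databus.example.org/sparql']
```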
@@ -599,10 +609,10 @@ def download(
         # query as argument
         else:
             print("QUERY {}", databusURI.replace("\n", " "))
-            if endpoint is None:  # endpoint is required for queries (--databus)
+            if uri_endpoint is None:  # endpoint is required for queries (--databus)
                 raise ValueError("No endpoint given for query")
             res = _get_file_download_urls_from_sparql_query(
-                endpoint, databusURI, databus_key=databus_key
+                uri_endpoint, databusURI, databus_key=databus_key
             )
             _download_files(
                 res,

databusclient/api/utils.py

Lines changed: 3 additions & 2 deletions

@@ -3,7 +3,7 @@
 import requests
 
 
-def get_databus_id_parts_from_uri(
+def get_databus_id_parts_from_file_url(
     uri: str,
 ) -> Tuple[
     Optional[str],
@@ -17,7 +17,8 @@ def get_databus_id_parts_from_uri(
     Extract databus ID parts from a given databus URI.
 
     Parameters:
-    - uri: The full databus URI
+    - uri: The full databus URI of the form
+      "http(s)://host/accountId/groupId/artifactId/versionId/fileId"
 
     Returns:
     A tuple containing (host, accountId, groupId, artifactId, versionId, fileId).
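Putting the updated docstring to work, a usage sketch for the renamed helper; the URL is illustrative, and the outputs follow the documented tuple order:

```python
# Usage sketch for the renamed helper, following its updated docstring;
# the URL is illustrative.
from databusclient.api.utils import get_databus_id_parts_from_file_url

host, account, group, artifact, version, file = get_databus_id_parts_from_file_url(
    "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01/data.ttl"
)
print(host, account, group)     # databus.dbpedia.org alice mygroup
print(artifact, version, file)  # myartifact 2024.01.01 data.ttl
```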

databusclient/cli.py

Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ def deploy(
             "Please provide files to upload when using WebDAV/Nextcloud mode."
         )
 
-    # Check that all given paths exist and are files or directories.#
+    # Check that all given paths exist and are files or directories.
     invalid = [f for f in distributions if not os.path.exists(f)]
     if invalid:
         raise click.UsageError(
