Commit 292d2fa

feat: python 3.11 migration
1 parent c56da44, commit 292d2fa

11 files changed: +126 -145 lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 WORKDIR /data
 
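The base-image bump pairs with the README change below, which raises the documented requirement to Python 3.11+. As a sketch (not part of this commit), a fail-fast guard makes the same floor explicit at runtime:

```python
# Sketch only, not part of this commit: fail fast when the interpreter is
# older than the 3.11 floor this migration establishes.
import sys

if sys.version_info < (3, 11):
    raise RuntimeError(f"databusclient requires Python 3.11+, got {sys.version}")
```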

README.md

Lines changed: 23 additions & 4 deletions

@@ -22,6 +22,7 @@ Command-line and Python client for downloading and deploying datasets on DBpedia
 - [Deploy](#module-deploy)
 - [Contributing](#contributing)
 - [Linting](#linting)
+- [Testing](#testing)
 
 
 ## Quickstart
@@ -32,7 +33,7 @@ You can use either **Python** or **Docker**. Both methods support all client fea
 
 ### Python
 
-Requirements: [Python](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/)
+Requirements: [Python 3.11+](https://www.python.org/downloads/) and [pip](https://pip.pypa.io/en/stable/installation/)
 
 Before using the client, install it via pip:
 
@@ -186,8 +187,8 @@ Options:
                             e.g. https://databus.dbpedia.org/sparql)
   --vault-token TEXT        Path to Vault refresh token file
   --databus-key TEXT        Databus API key to download from protected databus
-  --latest-only             When downloading artifacts, only download the latest
-                            version
+  --all-versions            When downloading artifacts, download all versions
+                            instead of only the latest
   --authurl TEXT            Keycloak token endpoint URL [default:
                             https://auth.dbpedia.org/realms/dbpedia/protocol/openid-
                             connect/token]
@@ -571,12 +572,30 @@ The used linter is [Ruff](https://ruff.rs/). Ruff is configured in `pyproject.to
 
 For development, you can run linting locally with `ruff check . ` and optionally auto-format with `ruff format .`.
 
-To ensuere compatibility with the `pyproject.toml` configured dependencies, run Ruff via Poetry:
+To ensure compatibility with the `pyproject.toml` configured dependencies, run Ruff via Poetry:
 
 ```bash
 # To check for linting issues:
 poetry run ruff check .
 
 # To auto-format code:
 poetry run ruff format .
+```
+
+### Testing
+
+When developing new features please make sure to add appropriate tests and ensure that all tests pass. Tests are under `tests/` and use [pytest](https://docs.pytest.org/en/7.4.x/) as test framework.
+
+When fixing bugs or refactoring existing code, please make sure to add tests that cover the affected functionality. The current test coverage is very low, so any additional tests are highly appreciated.
+
+To run tests locally, use:
+
+```bash
+pytest tests/
+```
+
+Or to ensure compatibility with the `pyproject.toml` configured dependencies, run pytest via Poetry:
+
+```bash
+poetry run pytest tests/
 ```
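To make the new Testing section concrete, here is a minimal pytest sketch for the URI parser this commit renames (see `databusclient/api/utils.py` below). The file name is hypothetical, and the None-padding for missing trailing parts is inferred from how `delete.py` and `download.py` use the return value, not stated in this diff:

```python
# tests/test_utils_parts.py -- hypothetical file name, sketch only.
# Assumes get_databus_id_parts_from_file_url pads missing trailing parts with
# None, which the group/version handling in delete.py and download.py relies on.
from databusclient.api.utils import get_databus_id_parts_from_file_url


def test_artifact_uri_has_no_version_or_file():
    host, account, group, artifact, version, file = (
        get_databus_id_parts_from_file_url(
            "https://databus.dbpedia.org/alice/mygroup/myartifact"
        )
    )
    assert host == "databus.dbpedia.org"
    assert (account, group, artifact) == ("alice", "mygroup", "myartifact")
    assert version is None
    assert file is None
```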

databusclient/api/delete.py

Lines changed: 7 additions & 4 deletions

@@ -3,7 +3,10 @@
 
 import requests
 
-from databusclient.api.utils import fetch_databus_jsonld, get_databus_id_parts_from_uri
+from databusclient.api.utils import (
+    fetch_databus_jsonld,
+    get_databus_id_parts_from_file_url,
+)
 
 
 def _confirm_delete(databusURI: str) -> str:
@@ -161,7 +164,7 @@ def _delete_group(
         uri = item.get("@id")
         if not uri:
             continue
-        _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri)
+        _, _, _, _, version, _ = get_databus_id_parts_from_file_url(uri)
        if version is None:
            artifact_uris.append(uri)
 
@@ -188,8 +191,8 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool)
     """
 
     for databusURI in databusURIs:
-        _host, _account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            databusURI
+        _host, _account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(databusURI)
         )
 
         if group == "collections" and artifact is not None:

databusclient/api/deploy.py

Lines changed: 29 additions & 10 deletions

@@ -5,7 +5,7 @@
 
 import requests
 
-__debug = False
+_debug = False
 
 
 class DeployError(Exception):
@@ -36,6 +36,11 @@ def _get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]:
 
     cvs = {}
     for kv in cv_str.split("_"):
+        if "=" not in kv:
+            raise BadArgumentException(
+                f"Invalid content variant format: '{kv}'. Expected 'key=value' format."
+            )
+
         key, value = kv.split("=")
         cvs[key] = value
 
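Without this guard, a segment lacking `=` made `key, value = kv.split("=")` raise a bare ValueError; the new check reports the offending segment via the client's own BadArgumentException. A standalone sketch of the rule (the sample string is illustrative, not from this commit):

```python
# Standalone sketch of the content-variant rule hardened above;
# "lang=en_type=parsed" is an illustrative string, not from this commit.
def parse_content_variants(cv_str: str) -> dict[str, str]:
    cvs: dict[str, str] = {}
    for kv in cv_str.split("_"):
        if "=" not in kv:
            # without this guard, the unpacking below raised a bare ValueError
            raise ValueError(f"Invalid content variant format: '{kv}'.")
        key, value = kv.split("=")
        cvs[key] = value
    return cvs


print(parse_content_variants("lang=en_type=parsed"))  # {'lang': 'en', 'type': 'parsed'}
```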

@@ -141,8 +146,8 @@ def _get_file_stats(distribution_str: str) -> Tuple[Optional[str], Optional[int]
 
 
 def _load_file_stats(url: str) -> Tuple[str, int]:
-    resp = requests.get(url)
-    if resp.status_code > 400:
+    resp = requests.get(url, timeout=30)
+    if resp.status_code >= 400:
         raise requests.exceptions.RequestException(response=resp)
 
     sha256sum = hashlib.sha256(bytes(resp.content)).hexdigest()
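Two fixes in one hunk: the request gains a timeout, and the status guard moves from `> 400` to `>= 400`. The comparison matters for exactly one code, 400 Bad Request, which previously slipped through and had its error body hashed as if it were file content:

```python
# The single status code affected by the comparison fix above.
status = 400
print(status > 400)   # False -- old guard: no exception, error body got hashed
print(status >= 400)  # True  -- new guard: RequestException is raised
```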
@@ -156,7 +161,7 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
 
     # content_variant_part = "_".join([f"{key}={value}" for key, value in cvs.items()])
 
-    if __debug:
+    if _debug:
         print("DEBUG", distribution_str, extension_part)
 
     sha256sum, content_length = _get_file_stats(distribution_str)
@@ -306,7 +311,13 @@ def create_dataset(
     """
 
     _versionId = str(version_id).strip("/")
-    _, account_name, group_name, artifact_name, version = _versionId.rsplit("/", 4)
+    parts = _versionId.rsplit("/", 4)
+    if len(parts) < 5:
+        raise BadArgumentException(
+            f"Invalid version_id format: '{version_id}'. "
+            f"Expected format: <BASE>/<ACCOUNT>/<GROUP>/<ARTIFACT>/<VERSION>"
+        )
+    _, _account_name, _group_name, _artifact_name, version = parts
 
     # could be build from stuff above,
     # was not sure if there are edge cases BASE=http://databus.example.org/"base"/...
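Why `len(parts) < 5` is the right guard: `str.rsplit("/", 4)` returns five parts only when the version ID really contains BASE/ACCOUNT/GROUP/ARTIFACT/VERSION segments. A worked example with illustrative IDs:

```python
# Worked example for the rsplit guard above; the IDs are illustrative.
good = "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01"
print(good.rsplit("/", 4))
# ['https://databus.dbpedia.org', 'alice', 'mygroup', 'myartifact', '2024.01.01']

bad = "https://databus.dbpedia.org/alice"
print(bad.rsplit("/", 4))
# ['https:', '', 'databus.dbpedia.org', 'alice'] -- only four parts, so the
# old tuple unpacking died with a bare ValueError; the guard now raises a
# descriptive BadArgumentException instead.
```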
@@ -428,22 +439,30 @@ def deploy(
 
     headers = {"X-API-KEY": f"{api_key}", "Content-Type": "application/json"}
     data = json.dumps(dataid)
-    base = "/".join(dataid["@graph"][0]["@id"].split("/")[0:3])
+
+    try:
+        base = "/".join(dataid["@graph"][0]["@id"].split("/")[0:3])
+    except (KeyError, IndexError, TypeError) as e:
+        raise DeployError(f"Invalid dataid structure: {e}")
+
     api_uri = (
         base
         + f"/api/publish?verify-parts={str(verify_parts).lower()}&log-level={log_level.name}"
     )
-    resp = requests.post(api_uri, data=data, headers=headers)
+    resp = requests.post(api_uri, data=data, headers=headers, timeout=30)
 
-    if debug or __debug:
-        dataset_uri = dataid["@graph"][0]["@id"]
+    if debug or _debug:
+        try:
+            dataset_uri = dataid["@graph"][0]["@id"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise DeployError(f"Invalid dataid structure: {e}")
         print(f"Trying submitting data to {dataset_uri}:")
         print(data)
 
     if resp.status_code != 200:
         raise DeployError(f"Could not deploy dataset to databus. Reason: '{resp.text}'")
 
-    if debug or __debug:
+    if debug or _debug:
         print("---------")
         print(resp.text)
 
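How the `base` extraction wrapped in try/except above works: splitting an absolute URI on `/` puts the scheme in part 0, an empty string in part 1 (from the `//`), and the host in part 2, so rejoining the first three pieces recovers `scheme://host`. The `@id` value here is illustrative:

```python
# Sketch of the base-URL derivation guarded above; the @id is illustrative.
dataset_id = "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01"
print(dataset_id.split("/")[0:3])            # ['https:', '', 'databus.dbpedia.org']
print("/".join(dataset_id.split("/")[0:3]))  # https://databus.dbpedia.org
```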

databusclient/api/download.py

Lines changed: 25 additions & 15 deletions

@@ -6,7 +6,10 @@
 from SPARQLWrapper import JSON, SPARQLWrapper
 from tqdm import tqdm
 
-from databusclient.api.utils import fetch_databus_jsonld, get_databus_id_parts_from_uri
+from databusclient.api.utils import (
+    fetch_databus_jsonld,
+    get_databus_id_parts_from_file_url,
+)
 
 
 def _download_file(
@@ -32,8 +35,8 @@ def _download_file(
     2. If server responds with WWW-Authenticate: Bearer, 401 Unauthorized), then fetch Vault access token and retry with Authorization header.
     """
     if localDir is None:
-        _host, account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            url
+        _host, account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(url)
         )
         localDir = os.path.join(
             os.getcwd(),
@@ -51,7 +54,7 @@ def _download_file(
     if dirpath:
         os.makedirs(dirpath, exist_ok=True)  # Create the necessary directories
     # --- 1. Get redirect URL by requesting HEAD ---
-    response = requests.head(url, stream=True)
+    response = requests.head(url, stream=True, timeout=30)
     # Check for redirect and update URL if necessary
     if response.headers.get("Location") and response.status_code in [
         301,
@@ -112,8 +115,8 @@ def _download_file(
     progress_bar.close()
 
     # TODO: could be a problem of github raw / openflaas
-    # if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
-    #     raise IOError("Downloaded size does not match Content-Length header")
+    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+        raise IOError("Downloaded size does not match Content-Length header")
 
 
 def _download_files(
@@ -161,7 +164,9 @@ def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) ->
     if databus_key is not None:
         headers["X-API-KEY"] = databus_key
 
-    return requests.get(uri, headers=headers, timeout=30).text
+    response = requests.get(uri, headers=headers, timeout=30)
+    response.raise_for_status()
+    return response.text
 
 
 def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict:
@@ -247,6 +252,7 @@ def __get_vault_access__(
             "grant_type": "refresh_token",
             "refresh_token": refresh_token,
         },
+        timeout=30,
     )
     resp.raise_for_status()
     access_token = resp.json()["access_token"]
@@ -270,6 +276,7 @@ def __get_vault_access__(
             "subject_token": access_token,
             "audience": audience,
         },
+        timeout=30,
     )
     resp.raise_for_status()
     vault_token = resp.json()["access_token"]
@@ -488,7 +495,7 @@ def _get_databus_artifacts_of_group(json_str: str) -> List[str]:
         uri = item.get("@id")
         if not uri:
             continue
-        _, _, _, _, version, _ = get_databus_id_parts_from_uri(uri)
+        _, _, _, _, version, _ = get_databus_id_parts_from_file_url(uri)
         if version is None:
             result.append(uri)
     return result
@@ -519,22 +526,25 @@ def download(
     - client_id: Client ID for token exchange. Default is "vault-token-exchange".
     """
     for databusURI in databusURIs:
-        host, account, group, artifact, version, file = get_databus_id_parts_from_uri(
-            databusURI
+        host, account, group, artifact, version, file = (
+            get_databus_id_parts_from_file_url(databusURI)
         )
 
+        # Determine endpoint per-URI if not explicitly provided
+        uri_endpoint = endpoint
+
         # dataID or databus collection
         if databusURI.startswith("http://") or databusURI.startswith("https://"):
             # Auto-detect sparql endpoint from host if not given
-            if endpoint is None:
-                endpoint = f"https://{host}/sparql"
+            if uri_endpoint is None:
+                uri_endpoint = f"https://{host}/sparql"
             print(f"SPARQL endpoint {endpoint}")
 
             if group == "collections" and artifact is not None:
                 print(f"Downloading collection: {databusURI}")
                 _download_collection(
                     databusURI,
-                    endpoint,
+                    uri_endpoint,
                     localDir,
                     token,
                     databus_key,
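The new `uri_endpoint` local fixes a cross-iteration state leak: previously, the endpoint auto-detected from the first URI's host was written back into `endpoint` and silently reused for every subsequent URI, even those on other hosts. A minimal sketch of the corrected pattern (hosts are illustrative):

```python
# Minimal sketch of the per-URI endpoint fix above; the hosts are illustrative.
def pick_endpoints(hosts: list[str], endpoint: str | None = None) -> list[str]:
    chosen = []
    for host in hosts:
        uri_endpoint = endpoint  # reset per URI instead of mutating `endpoint`
        if uri_endpoint is None:
            uri_endpoint = f"https://{host}/sparql"
        chosen.append(uri_endpoint)
    return chosen


print(pick_endpoints(["databus.dbpedia.org", "databus.example.org"]))
# ['https://databus.dbpedia.org/sparql', 'https://databus.example.org/sparql']
```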
@@ -599,10 +609,10 @@ def download(
         # query as argument
         else:
             print("QUERY {}", databusURI.replace("\n", " "))
-            if endpoint is None:  # endpoint is required for queries (--databus)
+            if uri_endpoint is None:  # endpoint is required for queries (--databus)
                 raise ValueError("No endpoint given for query")
             res = _get_file_download_urls_from_sparql_query(
-                endpoint, databusURI, databus_key=databus_key
+                uri_endpoint, databusURI, databus_key=databus_key
             )
             _download_files(
                 res,

databusclient/api/utils.py

Lines changed: 3 additions & 2 deletions

@@ -3,7 +3,7 @@
 import requests
 
 
-def get_databus_id_parts_from_uri(
+def get_databus_id_parts_from_file_url(
     uri: str,
 ) -> Tuple[
     Optional[str],
@@ -17,7 +17,8 @@ def get_databus_id_parts_from_uri(
     Extract databus ID parts from a given databus URI.
 
     Parameters:
-    - uri: The full databus URI
+    - uri: The full databus URI of the form
+      "http(s)://host/accountId/groupId/artifactId/versionId/fileId"
 
     Returns:
     A tuple containing (host, accountId, groupId, artifactId, versionId, fileId).
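Putting the updated docstring to work, a usage sketch for the renamed helper; the URL is illustrative, and the outputs follow the documented tuple order:

```python
# Usage sketch for the renamed helper, following its updated docstring;
# the URL is illustrative.
from databusclient.api.utils import get_databus_id_parts_from_file_url

host, account, group, artifact, version, file = get_databus_id_parts_from_file_url(
    "https://databus.dbpedia.org/alice/mygroup/myartifact/2024.01.01/data.ttl"
)
print(host, account, group)     # databus.dbpedia.org alice mygroup
print(artifact, version, file)  # myartifact 2024.01.01 data.ttl
```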

databusclient/cli.py

Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ def deploy(
             "Please provide files to upload when using WebDAV/Nextcloud mode."
         )
 
-    # Check that all given paths exist and are files or directories.#
+    # Check that all given paths exist and are files or directories.
     invalid = [f for f in distributions if not os.path.exists(f)]
     if invalid:
         raise click.UsageError(
