fix: Canonicalize the URls returned by repositories

nnobelis · nnobelis · commit bdf7ea5690c8 · 2026-03-23T10:17:45.000+01:00
Some artifactories (e.g. JFrog), return relative URLs in the package
metadata. This commit canonicalizes them to compare them.

Signed-off-by: Nicolas Nobelis &lt;nicolas.nobelis@bosch.com&gt;
diff --git a/src/python_inspector/package_data.py b/src/python_inspector/package_data.py
@@ -9,6 +9,9 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+import os
+from urllib.parse import urlparse, urlunparse
+
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -81,10 +84,27 @@ async def get_pypi_data_from_purl(
     sdist_url = await get_sdist_download_url(
         purl=parsed_purl, repos=repos, python_version=python_version
     )
+
+    def canonicalize_url(url: str):
+        # Parse the URL into its components
+        parsed = urlparse(url)
+
+        # Canonicalize the path component to resolve ".."
+        # os.path.normpath will handle segments like '.' and '..'
+        canonical_path = os.path.normpath(parsed.path)
+
+        # Rebuild the URL with the canonicalized path
+        # We replace the original path with the new one
+        parsed = parsed._replace(path=canonical_path)
+        canonical_url = urlunparse(parsed)
+
+        return canonical_url
+
     if sdist_url:
         valid_distribution_urls.append(sdist_url)
 
     valid_distribution_urls = [url for url in valid_distribution_urls if url]
+    valid_distribution_urls = list(map(canonicalize_url, valid_distribution_urls))
 
     # if prefer_source is True then only source distribution is used
     # in case of no source distribution available then wheel is used
@@ -100,10 +120,22 @@ async def get_pypi_data_from_purl(
         ]
         wheel_url = choose_single_wheel(wheel_urls)
         if wheel_url:
-            valid_distribution_urls.insert(0, wheel_url)
+            valid_distribution_urls.insert(0, canonicalize_url(wheel_url))
 
     urls = {url.get("url"): url for url in response.get("urls") or []}
 
+    # Sanitize all URLs that are relative and canonicalize them
+    urls_sanitized = {}
+    for url in urls:
+        value = urls.get(url)
+
+        if url.startswith("https"):
+            url_sanitized = canonicalize_url(url)
+        else:
+            url_sanitized = canonicalize_url(base_path + url)
+
+        urls_sanitized[url_sanitized] = value
+
     def remove_credentials_from_url(url: str):
         # Parse the URL into its components
         parsed = urlparse(url)
@@ -122,10 +154,10 @@ def remove_credentials_from_url(url: str):
     # iterate over the valid distribution urls and return the first
     # one that is matching.
     for dist_url in valid_distribution_urls:
-        if dist_url not in urls:
+        if dist_url not in urls_sanitized:
             continue
 
-        url_data = urls.get(dist_url)
+        url_data = urls_sanitized.get(dist_url)
         digests = url_data.get("digests") or {}
 
         return PackageData(