Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 71 additions & 21 deletions api/src/shared/common/license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ class MatchingLicense:
(re.compile(r"opensource\.org/licenses/MIT/?", re.I), "MIT"),
(re.compile(r"choosealicense\.com/licenses/mit/?", re.I), "MIT"),
(re.compile(r"choosealicense\.com/licenses/apache-2\.0/?", re.I), "Apache-2.0"),
# add Etalab / Québec, etc., once verified
]


Expand Down Expand Up @@ -281,6 +280,44 @@ def resolve_fuzzy_match(
return results


def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Look up a License row whose URL equals the given URL after normalization.

    Args:
        url_normalized: The license URL to look for; it is passed through
            ``normalize_url_str`` before being compared.
        db_session: Session used for the lookup. When it is falsy (e.g.
            ``None``) no query is issued.

    Returns:
        The first matching ``License`` row, or ``None`` when there is no
        session or no row matches.
    """
    if not db_session:
        return None

    license_query = db_session.query(License)
    # Left side: the incoming URL, normalized in Python.
    # Right side: the stored URL, normalized, trimmed and lowercased through
    # SQL function expressions applied to the License.url column.
    wanted_url = normalize_url_str(url_normalized)
    stored_url = func.lower(func.trim(normalize_url(License.url)))
    return license_query.filter(wanted_url == stored_url).first()


# Matches a complete spdx.org license-page URL (case-insensitively) and
# captures the license identifier. The host is anchored so look-alike hosts
# such as ``notspdx.org`` are rejected, and a trailing query string or
# fragment is tolerated.
_SPDX_LICENSE_URL_RE = re.compile(
    r"(?:(?:[a-z][a-z0-9+.-]*:)?//)?"  # optional scheme or protocol-relative '//'
    r"(?:www\.)?spdx\.org(?::\d+)?"  # the spdx.org host itself, optional port
    r"/licenses/"
    r"([a-z0-9.+-]+?)"  # SPDX-looking token; lazy so '.html' is not swallowed
    r"(?:\.html)?/?"  # optional '.html' suffix and trailing slash
    r"(?:[?#].*)?",  # optional query string and/or fragment
    re.I,
)


def extract_spdx_id_from_url(url_normalized: str) -> Optional[str]:
    """Extract an SPDX license ID from an SPDX-style URL if present.

    Recognizes URLs of the form used on spdx.org, for example::

        https://spdx.org/licenses/ODbL-1.0.html
        http://spdx.org/licenses/MIT
        spdx.org/licenses/MIT.html#licenseText

    The function is conservative: the host must be exactly ``spdx.org``
    (optionally prefixed with ``www.``), the identifier must be the single
    path segment directly under ``/licenses/``, and it may only contain
    letters, digits, ``-``, ``.`` and ``+`` (e.g. ``CC-BY-4.0``, ``GPL-2.0+``).
    An optional ``.html`` suffix, trailing slash, query string and fragment
    are ignored.

    Args:
        url_normalized: The URL to inspect. The scheme is optional, and
            surrounding whitespace is ignored.

    Returns:
        The identifier exactly as it appears in the URL (its case is not
        changed), or ``None`` when the URL is empty or is not an spdx.org
        license page.
    """
    if not url_normalized:
        return None

    # fullmatch anchors both ends, so the host cannot be a suffix of another
    # domain and nothing unexpected may follow the identifier.
    match = _SPDX_LICENSE_URL_RE.fullmatch(url_normalized.strip())
    if match is None:
        return None
    return match.group(1)


def resolve_license(
license_url: str,
allow_fuzzy: bool = True,
Expand All @@ -290,11 +327,12 @@ def resolve_license(
"""Resolve a license URL to one or more SPDX candidates using multiple strategies.

Strategies (in order of precedence):
1) Exact match in DB(db.license) -> return [exact]
2) Creative Commons resolver(cc-resolver) -> return [cc]
3) Generic heuristics(pattern-heuristics) -> return [heuristic]
4) Fuzzy (same host candidates) -> return [fuzzy...]
5) No match -> return [none]
1) Exact match in DB (``db.license``) -> return [exact]
2) Creative Commons resolver (``cc-resolver``) -> return [cc]
3) SPDX catalog URL resolver (``spdx.org/licenses``) -> return [spdx]
4) Generic heuristics (pattern-based) -> return [heuristic]
5) Fuzzy (same-host candidates) -> return [fuzzy...]
6) No match -> return []

Args:
license_url (str): The license URL to resolve.
Expand Down Expand Up @@ -350,7 +388,31 @@ def resolve_license(
)
]

# 3) Generic heuristics
# 3) SPDX catalog URL (spdx.org/licenses/<ID>[.html])
spdx_id = extract_spdx_id_from_url(url_normalized)
if spdx_id:
# Try to enrich from DB if a matching License row exists
db_lic: License | None = (
db_session.query(License).filter(func.lower(License.id) == func.lower(spdx_id)).one_or_none()
)
if db_lic is not None:
return [
MatchingLicense(
license_id=db_lic.id,
license_url=url_str,
normalized_url=url_normalized,
spdx_id=spdx_id,
match_type="heuristic",
confidence=0.98,
matched_name=db_lic.name,
matched_catalog_url=db_lic.url,
matched_source="spdx-resolver",
)
]
else:
logging.warning("SPDX ID %s resolved from URL but not found in DB", spdx_id)

# 4) Generic heuristics
heuristic_match = heuristic_spdx(url_str)
if heuristic_match:
return [
Expand All @@ -366,7 +428,7 @@ def resolve_license(
)
]

# 4) Fuzzy (same host candidates only)
# 5) Fuzzy (same host candidates only)
if allow_fuzzy and url_host and db_session is not None:
fuzzy_results = resolve_fuzzy_match(
url_str=url_str,
Expand All @@ -378,17 +440,5 @@ def resolve_license(
if fuzzy_results:
return fuzzy_results

# 5) No match
# 6) No match
return []


def find_exact_match_license_url(url_normalized: str, db_session: Session | None) -> License | None:
    """Return the first License whose stored URL matches the given URL once both are normalized.

    Args:
        url_normalized: URL to search for; ``normalize_url_str`` is applied to
            it before the comparison.
        db_session: Session to query with. A falsy value (such as ``None``)
            short-circuits the lookup.

    Returns:
        The first ``License`` row that matches, or ``None`` if there is no
        session or nothing matches.
    """
    if not db_session:
        return None

    candidates = db_session.query(License)
    # The stored URL is normalized, trimmed and lowercased via SQL function
    # expressions on License.url, then compared with the normalized input.
    lookup_url = normalize_url_str(url_normalized)
    column_url = func.lower(func.trim(normalize_url(License.url)))
    return candidates.filter(lookup_url == column_url).first()
33 changes: 33 additions & 0 deletions api/tests/utils/test_license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,39 @@ def test_resolve_license_creative_commons(self, _mock_find):
self.assertEqual(results[0].spdx_id, "CC-BY-4.0")
self.assertEqual(results[0].match_type, "heuristic")

@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_spdx_catalog_url_db_hit(self, _mock_find):
    """An spdx.org/licenses/<ID> URL resolves through the SPDX branch when the ID lookup finds a License."""
    catalog_url = "https://spdx.org/licenses/ODbL-1.0.html"
    db_license = self._make_license("odbl-1.0", catalog_url, "ODbL 1.0")
    # The exact-URL lookup is patched to miss; make the mocked
    # query(...).filter(...).one_or_none() chain hand back our license.
    id_lookup = self.session.query.return_value.filter.return_value
    id_lookup.one_or_none.return_value = db_license

    matches = resolve_license(catalog_url, db_session=self.session)

    self.assertEqual(len(matches), 1)
    match = matches[0]
    # The SPDX ID comes back lowercased with the current implementation.
    self.assertEqual(match.spdx_id, "odbl-1.0")
    self.assertEqual(match.license_id, "odbl-1.0")
    self.assertEqual(match.match_type, "heuristic")
    self.assertEqual(match.matched_source, "spdx-resolver")
    self.assertEqual(match.matched_name, "ODbL 1.0")
    self.assertEqual(match.matched_catalog_url, "https://spdx.org/licenses/ODbL-1.0.html")

@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_spdx_catalog_url_db_miss(self, _mock_find):
    """An SPDX ID parsed from the URL but missing from the DB produces no SPDX-based result.

    The resolver is expected to log the miss; this test only checks the
    returned value.
    """
    catalog_url = "https://spdx.org/licenses/ODbL-1.0.html"
    # Make the mocked query(...).filter(...).one_or_none() chain report no row.
    id_lookup = self.session.query.return_value.filter.return_value
    id_lookup.one_or_none.return_value = None

    matches = resolve_license(catalog_url, db_session=self.session)

    # Current behavior: a warning is logged and the overall result is empty.
    self.assertEqual(matches, [])

@patch("shared.common.license_utils.find_exact_match_license_url", return_value=None)
def test_resolve_license_generic_heuristic(self, _mock_find):
# Provide URL that matches heuristic patterns
Expand Down
Loading