Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/src/scripts/populate_db_gbfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from scripts.gbfs_utils.fetching import fetch_data, get_data_content
from scripts.gbfs_utils.license import get_license_url
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
from shared.common.license_utils import assign_license_by_url
from shared.database.database import generate_unique_id, configure_polymorphic_mappers
from shared.database_gen.sqlacodegen_models import Gbfsfeed, Location, Externalid

Expand Down Expand Up @@ -126,6 +127,8 @@ def populate_db(self, session, fetch_url=True):
gbfs_feed.locations = [location]

session.flush()
if is_new_feed and gbfs_feed.license_url:
assign_license_by_url(gbfs_feed, session)
if is_new_feed:
self.added_feeds.append(
{
Expand Down
4 changes: 4 additions & 0 deletions api/src/scripts/populate_db_gtfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from scripts.load_dataset_on_create import publish_all
from scripts.populate_db import DatabasePopulateHelper, set_up_configs
from shared.common.license_utils import assign_license_by_url
from shared.database.database import generate_unique_id
from shared.database_gen.sqlacodegen_models import (
Entitytype,
Expand Down Expand Up @@ -212,6 +213,7 @@ def populate_db(self, session: "Session", fetch_url: bool = True):
stable_id = self.get_stable_id(row)
is_official_from_csv = self.get_safe_boolean_value(row, "is_official", None)
feed = self.query_feed_by_stable_id(session, stable_id, data_type)
is_new_feed = feed is None
if feed:
self.logger.debug(f"Updating {feed.__class__.__name__}: {stable_id}")
# Always set the deprecated status if found in the csv
Expand Down Expand Up @@ -264,6 +266,8 @@ def populate_db(self, session: "Session", fetch_url: bool = True):

session.add(feed)
session.flush()
if is_new_feed and feed.license_url:
assign_license_by_url(feed, session)
# This need to be done after all feeds are added to the session to avoid FK violation
self.process_feed_references(session)
self.process_redirects(session)
Expand Down
94 changes: 93 additions & 1 deletion api/src/shared/common/license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import List, Tuple, Optional

from shared.common.db_utils import normalize_url, normalize_url_str
from shared.database_gen.sqlacodegen_models import License
from shared.database_gen.sqlacodegen_models import License, FeedLicenseChange


@dataclass
Expand Down Expand Up @@ -442,3 +442,95 @@ def resolve_license(

# 6) No match
return []


# Confidence threshold above which an auto-assigned license is considered verified
# without requiring human review. Covers exact, CC resolver, SPDX, and pattern heuristic matches.
_AUTO_VERIFY_THRESHOLD = 0.95


def assign_license_by_url(
    feed,
    db_session: Session,
    *,
    only_if_single: bool = True,
) -> Optional[MatchingLicense]:
    """Resolve ``feed.license_url`` and auto-assign the matching license to the feed.

    Behavior:
      - Empty/missing ``feed.license_url``: returns None without resolving.
      - 0 matches: logs at info level and returns None (feed untouched).
      - >1 matches with ``only_if_single=True``: logs a warning and returns None;
        the feed keeps its current ``license_id`` for manual review.
      - Otherwise the first match returned by ``resolve_license`` is used:
          * if the feed already carries that ``license_id``, nothing is written
            (no audit row) and the match is returned;
          * else ``feed.license_id`` / ``feed.license_notes`` are set and a
            ``FeedLicenseChange`` audit row is appended to
            ``feed.feed_license_changes``.

    The audit row's ``verified`` flag is:
      - True when ``match_type == 'exact'``;
      - True when the match is not fuzzy and
        ``confidence >= _AUTO_VERIFY_THRESHOLD``;
      - False for every fuzzy match, whatever its confidence, and for any
        other match below the threshold (needs human confirmation).

    This function does not flush or commit; the caller owns the transaction.

    Args:
        feed: Feed ORM instance (e.g. Gtfsfeed, Gtfsrealtimefeed, Gbfsfeed)
            exposing ``license_url``, ``license_id``, ``license_notes``,
            ``stable_id``, ``id`` and ``feed_license_changes``.
        db_session: Active SQLAlchemy session, forwarded to ``resolve_license``.
        only_if_single: When True (default), skip assignment if several
            candidates are returned so that a human can choose.

    Returns:
        The MatchingLicense that is now (or already was) assigned to the feed,
        or None when there is no license URL, no match, or an ambiguous match
        that was skipped.
    """
    if not feed.license_url:
        return None

    matches = resolve_license(feed.license_url, db_session=db_session)

    if not matches:
        logging.info(
            "No license match found for feed %s (url: %s)",
            feed.stable_id,
            feed.license_url,
        )
        return None

    if only_if_single and len(matches) > 1:
        logging.warning(
            "Skipping auto-assignment for feed %s: %d license candidates found — manual review required",
            feed.stable_id,
            len(matches),
        )
        return None

    # With only_if_single=False the first candidate is taken as the best one.
    best = matches[0]

    # Guard against duplicate audit rows when the feed already has this license.
    if best.license_id == feed.license_id:
        logging.info("Feed %s license unchanged: %s", feed.stable_id, best.license_id)
        return best

    # A fuzzy match is only a string-similarity guess: a high score can still
    # point at a neighbouring license, so it is never auto-verified.
    is_verified = best.match_type == "exact" or (
        best.match_type != "fuzzy" and best.confidence >= _AUTO_VERIFY_THRESHOLD
    )

    logging.info(
        "Assigning license %s to feed %s (match_type=%s, confidence=%.2f, verified=%s)",
        best.license_id,
        feed.stable_id,
        best.match_type,
        best.confidence,
        is_verified,
    )

    feed.license_id = best.license_id
    feed.license_notes = best.notes
    feed.feed_license_changes.append(
        FeedLicenseChange(
            feed_id=feed.id,
            changed_at=None,  # set by DB default
            feed_license_url=feed.license_url,
            matched_license_id=best.license_id,
            confidence=best.confidence,
            match_type=best.match_type,
            matched_name=best.matched_name,
            matched_catalog_url=best.matched_catalog_url,
            matched_source=best.matched_source,
            notes=best.notes,
            regional_id=best.regional_id,
            verified=is_verified,
        )
    )

    return best
154 changes: 152 additions & 2 deletions api/tests/utils/test_license_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
resolve_fuzzy_match,
resolve_license,
find_exact_match_license_url,
assign_license_by_url,
MatchingLicense,
)
from shared.database_gen.sqlacodegen_models import License
Expand Down Expand Up @@ -246,5 +247,154 @@ def test_matching_license_dataclass(self):
self.assertEqual(ml.confidence, 1.0)


if __name__ == "__main__":
unittest.main()
class TestAssignLicenseByUrl(unittest.TestCase):
    """Unit tests for assign_license_by_url.

    ``resolve_license`` is patched in every test so no database is needed;
    feeds are MagicMock objects whose ``feed_license_changes`` is a real list,
    which lets the tests inspect the appended audit rows.
    """

    def _make_match(self, license_id="MIT", match_type="exact", confidence=1.0):
        """Build a MatchingLicense candidate with the given id, type and confidence."""
        return MatchingLicense(
            license_id=license_id,
            license_url="http://example.com/license",
            normalized_url="example.com/license",
            match_type=match_type,
            confidence=confidence,
            matched_name="MIT License",
            matched_catalog_url="http://example.com/license",
            matched_source="db.license",
        )

    def _make_feed(self, license_url="http://example.com/license", license_id=None):
        """Build a stand-in feed exposing only the attributes the function touches."""
        feed = MagicMock()
        feed.stable_id = "test-feed-1"
        feed.id = "feed-id-1"
        feed.license_url = license_url
        feed.license_id = license_id
        feed.license_notes = None
        feed.feed_license_changes = []
        return feed

    # --- No license_url ---

    @patch("shared.common.license_utils.resolve_license")
    def test_no_license_url_returns_none(self, mock_resolve):
        feed = self._make_feed(license_url=None)
        result = assign_license_by_url(feed, MagicMock())
        self.assertIsNone(result)
        self.assertIsNone(feed.license_id)
        self.assertEqual(feed.feed_license_changes, [])
        # The resolver must not be consulted when there is nothing to resolve.
        mock_resolve.assert_not_called()

    @patch("shared.common.license_utils.resolve_license")
    def test_empty_license_url_returns_none(self, mock_resolve):
        feed = self._make_feed(license_url="")
        result = assign_license_by_url(feed, MagicMock())
        self.assertIsNone(result)
        self.assertIsNone(feed.license_id)
        self.assertEqual(feed.feed_license_changes, [])
        mock_resolve.assert_not_called()

    # --- No match ---

    @patch("shared.common.license_utils.resolve_license")
    def test_no_match_returns_none(self, mock_resolve):
        mock_resolve.return_value = []
        feed = self._make_feed()
        session = MagicMock()
        result = assign_license_by_url(feed, session)
        self.assertIsNone(result)
        self.assertIsNone(feed.license_id)
        self.assertEqual(feed.feed_license_changes, [])
        # The feed URL and the caller's session are what gets resolved.
        mock_resolve.assert_called_once_with(feed.license_url, db_session=session)

    # --- Multiple matches ---

    @patch("shared.common.license_utils.resolve_license")
    def test_multiple_matches_skips_assignment(self, mock_resolve):
        mock_resolve.return_value = [
            self._make_match("MIT", "fuzzy", 0.96),
            self._make_match("Apache-2.0", "fuzzy", 0.94),
        ]
        feed = self._make_feed()
        result = assign_license_by_url(feed, MagicMock())
        self.assertIsNone(result)
        self.assertIsNone(feed.license_id)
        self.assertIsNone(feed.license_notes)
        self.assertEqual(feed.feed_license_changes, [])

    # --- Single exact match — auto-verified ---

    @patch("shared.common.license_utils.resolve_license")
    def test_exact_match_assigns_and_marks_verified(self, mock_resolve):
        match = self._make_match("MIT", "exact", 1.0)
        mock_resolve.return_value = [match]
        feed = self._make_feed()

        result = assign_license_by_url(feed, MagicMock())

        self.assertEqual(result, match)
        self.assertEqual(feed.license_id, "MIT")
        self.assertEqual(feed.license_notes, match.notes)
        self.assertEqual(len(feed.feed_license_changes), 1)
        change = feed.feed_license_changes[0]
        self.assertTrue(change.verified)
        # The audit row must describe the match that was applied.
        self.assertEqual(change.feed_id, "feed-id-1")
        self.assertEqual(change.feed_license_url, feed.license_url)
        self.assertEqual(change.matched_license_id, "MIT")
        self.assertEqual(change.match_type, "exact")
        self.assertEqual(change.confidence, 1.0)

    @patch("shared.common.license_utils.resolve_license")
    def test_heuristic_high_confidence_assigns_and_marks_verified(self, mock_resolve):
        match = self._make_match("CC-BY-4.0", "heuristic", 0.99)
        mock_resolve.return_value = [match]
        feed = self._make_feed()

        result = assign_license_by_url(feed, MagicMock())

        self.assertEqual(result, match)
        self.assertEqual(feed.license_id, "CC-BY-4.0")
        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertTrue(feed.feed_license_changes[0].verified)

    @patch("shared.common.license_utils.resolve_license")
    def test_threshold_boundary_095_marks_verified(self, mock_resolve):
        match = self._make_match("ODbL-1.0", "heuristic", 0.95)
        mock_resolve.return_value = [match]
        feed = self._make_feed()

        assign_license_by_url(feed, MagicMock())

        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertTrue(feed.feed_license_changes[0].verified)

    # --- Fuzzy / low-confidence match — unverified ---

    @patch("shared.common.license_utils.resolve_license")
    def test_fuzzy_match_assigns_but_unverified(self, mock_resolve):
        match = self._make_match("MIT", "fuzzy", 0.94)
        mock_resolve.return_value = [match]
        feed = self._make_feed()

        result = assign_license_by_url(feed, MagicMock())

        self.assertEqual(result, match)
        self.assertEqual(feed.license_id, "MIT")
        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertFalse(feed.feed_license_changes[0].verified)

    @patch("shared.common.license_utils.resolve_license")
    def test_below_threshold_unverified(self, mock_resolve):
        match = self._make_match("MIT", "heuristic", 0.80)
        mock_resolve.return_value = [match]
        feed = self._make_feed()

        assign_license_by_url(feed, MagicMock())

        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertFalse(feed.feed_license_changes[0].verified)

    # --- Duplicate assignment guard ---

    @patch("shared.common.license_utils.resolve_license")
    def test_same_license_id_no_new_audit_row(self, mock_resolve):
        match = self._make_match("MIT", "exact", 1.0)
        mock_resolve.return_value = [match]
        feed = self._make_feed(license_id="MIT")  # already assigned

        result = assign_license_by_url(feed, MagicMock())

        self.assertEqual(result, match)
        self.assertEqual(feed.license_id, "MIT")
        self.assertIsNone(feed.license_notes)  # notes untouched as well
        self.assertEqual(feed.feed_license_changes, [])  # no new audit row

    # --- only_if_single=False allows multiple matches ---

    @patch("shared.common.license_utils.resolve_license")
    def test_only_if_single_false_assigns_best_match(self, mock_resolve):
        best = self._make_match("MIT", "fuzzy", 0.97)
        second = self._make_match("Apache-2.0", "fuzzy", 0.94)
        mock_resolve.return_value = [best, second]
        feed = self._make_feed()

        result = assign_license_by_url(feed, MagicMock(), only_if_single=False)

        self.assertEqual(result, best)
        self.assertEqual(feed.license_id, "MIT")
        # Exactly one audit row, and it records the first candidate.
        self.assertEqual(len(feed.feed_license_changes), 1)
        self.assertEqual(feed.feed_license_changes[0].matched_license_id, "MIT")
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
Feed,
Gtfsrealtimefeed,
)
from shared.common.license_utils import assign_license_by_url
from shared.helpers.pub_sub import get_execution_id, trigger_dataset_download
from shared.helpers.query_helper import (
query_feed_by_stable_id,
Expand Down Expand Up @@ -397,6 +398,9 @@ async def create_gtfs_feed(
status_code=500,
detail=f"Failed to create GTFS feed with URL: {new_feed.producer_url}",
)
if created_feed.license_url:
assign_license_by_url(created_feed, db_session)
db_session.commit()
try:
trigger_dataset_download(
created_feed,
Expand Down Expand Up @@ -438,6 +442,9 @@ async def create_gtfs_rt_feed(
db_session.add(new_feed)
db_session.commit()
created_feed = db_session.get(Gtfsrealtimefeed, new_feed.id)
if created_feed and created_feed.license_url:
assign_license_by_url(created_feed, db_session)
db_session.commit()
logging.info("Created new GTFS-RT feed with ID: %s", new_feed.stable_id)
refreshed = refresh_materialized_view(db_session, t_feedsearch.name)
logging.info("Materialized view %s refreshed: %s", t_feedsearch.name, refreshed)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
feed.stable_id,
license_match.license_id,
)
from shared.common.license_utils import _AUTO_VERIFY_THRESHOLD

is_verified = (
license_match.match_type == "exact"
or license_match.confidence >= _AUTO_VERIFY_THRESHOLD
)
feed.license_id = license_match.license_id
feed.license_notes = license_match.notes
feed_license_change: FeedLicenseChange = FeedLicenseChange(
Expand All @@ -50,6 +56,7 @@ def assign_feed_license(feed: Feed, license_match: MatchingLicense):
matched_source=license_match.matched_source,
notes=license_match.notes,
regional_id=license_match.regional_id,
verified=is_verified,
)
feed.feed_license_changes.append(feed_license_change)
else:
Expand Down
2 changes: 2 additions & 0 deletions liquibase/changelog.xml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@
<include file="changes/feat_1542.sql" relativeToChangelogFile="true"/>
<!-- Add license_tag table and license_license_tags join table for tag classification of licenses. -->
<include file="changes/feat_1565.sql" relativeToChangelogFile="true"/>
<!-- Add verified column to feed_license_change to track human review of auto-assigned licenses. -->
<include file="changes/feat_1568.sql" relativeToChangelogFile="true"/>
<!-- Centralized materialized view definitions.
Views are rebuilt from source SQL files using runOnChange. -->
<!-- Keep this at the very end to ensure all table and schema changes
Expand Down
10 changes: 10 additions & 0 deletions liquibase/changes/feat_1568.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Add verified column to feed_license_change to track human review status.
-- false = needs human review; true = manually confirmed or auto-verified assignment.
--
-- The column is created with DEFAULT true so that every pre-existing row is backfilled
-- as verified (prior assignments are considered trusted) by the ADD COLUMN itself.
-- Unlike a separate unconditional UPDATE, this cannot reset review state if the script
-- is ever executed again: ADD COLUMN IF NOT EXISTS is then a no-op.
ALTER TABLE feed_license_change ADD COLUMN IF NOT EXISTS verified BOOLEAN NOT NULL DEFAULT true;

-- Rows inserted from now on without an explicit value start as unverified.
ALTER TABLE feed_license_change ALTER COLUMN verified SET DEFAULT false;

-- Index for efficient filtering of unverified assignments.
CREATE INDEX IF NOT EXISTS ix_flc_verified ON feed_license_change (verified);
Loading