Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE public.repositories DROP COLUMN licenses;
ALTER TABLE public.repositories ADD COLUMN license VARCHAR(255);
Comment on lines +1 to +2
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE public.repositories DROP COLUMN license;
ALTER TABLE public.repositories ADD COLUMN licenses VARCHAR(255)[];
Comment on lines +1 to +2
8 changes: 4 additions & 4 deletions services/apps/git_integration/src/crowdgit/database/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,15 +283,15 @@ async def update_last_processed_commit(repo_id: str, commit_hash: str, branch: s
return str(result)


async def update_repository_license(repository_id: str, license_spdx: str | None) -> None:
async def update_repository_licenses(repository_id: str, licenses: list[str]) -> None:
sql_query = """
UPDATE public.repositories
SET license = $1::varchar,
SET licenses = $1::varchar[],
"updatedAt" = NOW()
WHERE id = $2
AND license IS DISTINCT FROM $1::varchar
AND licenses IS DISTINCT FROM $1::varchar[]
"""
await execute(sql_query, (license_spdx, repository_id))
await execute(sql_query, (licenses, repository_id))


async def mark_repo_as_processed(repo_id: str, repo_state: RepositoryState):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,59 +11,75 @@
class LicenseService(BaseService):
"""Detects SPDX license from a cloned repository using the licensee gem."""

async def detect(self, repo_path: str) -> str | None:
"""Run licensee against repo_path and return the SPDX identifier, or None."""
async def detect(self, repo_path: str) -> list[str]:
"""Run licensee against repo_path and return a list of SPDX identifiers.

Returns [] when licensee is unavailable or finds no license files.
Returns ['NOASSERTION'] when files are found but none meet the confidence threshold.
"""
try:
output = await run_shell_command(
["licensee", "detect", "--json", repo_path], timeout=60
)
except CommandExecutionError:
self.logger.info(f"licensee found no license in {repo_path}")
return None
return []
except CommandTimeoutError as e:
self.logger.warning(f"licensee timed out: {repr(e)}")
return None
return []
except FileNotFoundError as e:
self.logger.warning(f"licensee binary not found in PATH: {repr(e)}")
return None
return []
except Exception as e:
self.logger.warning(f"licensee failed: {repr(e)}")
return None
return []

try:
data = json.loads(output)
licenses = data.get("licenses") or []
matched_files = data.get("matched_files") or []
spdx_id = licenses[0].get("spdx_id") if licenses else None
confidence = (
(matched_files[0].get("matcher") or {}).get("confidence")
if matched_files
else None
)

# Build a map from spdx_id to its best confidence across matched files.
# licensee puts per-file confidence inside each matched_file's matcher object.
confidence_by_spdx: dict[str, float] = {}
for mf in matched_files:
spdx = (mf.get("matched_license") or {}).get("spdx_id") or ""
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confidence map may never populate due to field access

Medium Severity

The code accesses `(mf.get("matched_license") or {}).get("spdx_id")`, which assumes `matched_license` is a dict. If the licensee gem serializes `matched_license` as a plain string (the SPDX ID itself, e.g. "MIT"), the `or {}` fallback does not trigger, because a non-empty string is truthy, and calling `.get("spdx_id")` on a string raises `AttributeError`. The outer `except Exception` handler on line 83 catches that error, so the function always returns `[]`, silently disabling license detection for all repositories. Even when `matched_license` is a dict, the `confidence_by_spdx` map stays empty if the SPDX key in `matched_files` differs from the one in the `licenses` entries; every license then has no recorded confidence and bypasses confidence filtering.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit c471e3b. Configure here.

conf = (mf.get("matcher") or {}).get("confidence")
if spdx and conf is not None:
confidence_by_spdx[spdx] = max(confidence_by_spdx.get(spdx, 0), conf)

# Mirror GitHub's threshold — below LICENSE_CONFIDENCE_THRESHOLD the match is unreliable.
# Downgrade low-confidence matches to NOASSERTION so the distinction is clean:
# NULL = licensee didn't run, timed out, or found no license file
# NOASSERTION = found a license file but couldn't reliably identify it
# The UI should display NOASSERTION as "Other".
if (
spdx_id
and spdx_id != "NOASSERTION"
and confidence is not None
and confidence < LICENSE_CONFIDENCE_THRESHOLD
):
self.logger.info(
f"License downgraded to NOASSERTION: confidence {confidence}% below threshold in {repo_path}"
)
return "NOASSERTION"
# Drop low-confidence entries; if nothing passes, use NOASSERTION:
# [] = licensee didn't run, timed out, or found no license file
# ['NOASSERTION'] = found a license file but couldn't reliably identify it
result: list[str] = []
seen: set[str] = set()
for entry in licenses:
spdx_id = entry.get("spdx_id")
if not spdx_id or spdx_id in seen:
continue
if spdx_id == "NOASSERTION":
continue
confidence = confidence_by_spdx.get(spdx_id)
if confidence is not None and confidence < LICENSE_CONFIDENCE_THRESHOLD:
self.logger.info(
f"License {spdx_id} dropped: confidence {confidence}% below threshold in {repo_path}"
)
continue
result.append(spdx_id)
seen.add(spdx_id)

if spdx_id:
if not result and licenses:
self.logger.info(
f"License detected: {spdx_id} (confidence={confidence}) in {repo_path}"
f"All licenses below threshold, storing NOASSERTION in {repo_path}"
)
return ["NOASSERTION"]

if result:
self.logger.info(f"Licenses detected: {result} in {repo_path}")
else:
self.logger.info(f"No SPDX license matched in {repo_path}")
return spdx_id
return result
except Exception as e:
self.logger.warning(f"Failed to parse licensee output: {repr(e)}")
return None
return []
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
mark_repo_as_processed,
release_repo,
update_last_processed_commit,
update_repository_license,
update_repository_licenses,
)
from crowdgit.enums import RepositoryState
from crowdgit.errors import (
Expand Down Expand Up @@ -242,8 +242,8 @@ async def _process_single_repository(self, repository: Repository):
repository.id, batch_info.repo_path, repository.url
)
await self.maintainer_service.process_maintainers(repository, batch_info)
license_spdx = await self.license_service.detect(batch_info.repo_path)
await update_repository_license(repository.id, license_spdx)
licenses = await self.license_service.detect(batch_info.repo_path)
await update_repository_licenses(repository.id, licenses)
await self.commit_service.process_single_batch_commits(
repository,
batch_info,
Expand Down
6 changes: 3 additions & 3 deletions services/libs/data-access-layer/src/repositories/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export interface IRepository {
updatedAt: string
deletedAt: string | null
lastArchivedCheckAt: string | null
license: string | null
licenses: string[] | null
}

export interface ICreateRepository {
Expand Down Expand Up @@ -150,7 +150,7 @@ export async function getRepositoriesBySourceIntegrationId(
"updatedAt",
"deletedAt",
"lastArchivedCheckAt",
license
licenses
FROM public.repositories
WHERE "sourceIntegrationId" = $(sourceIntegrationId)
AND "deletedAt" IS NULL
Expand Down Expand Up @@ -193,7 +193,7 @@ export async function getRepositoriesByUrl(
"updatedAt",
"deletedAt",
"lastArchivedCheckAt",
license
licenses
FROM public.repositories
WHERE url IN ($(repoUrls:csv))
${deletedFilter}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ SCHEMA >
`communityLanguages` Array(String),
`status` String,
`maturity` LowCardinality(String),
`lastVulnerabilityScanStatus` Nullable(String)
`lastVulnerabilityScanStatus` Nullable(String),
`repoLicenses` Array(Tuple(String, String))

ENGINE MergeTree
ENGINE_PARTITION_KEY toYear(createdAt)
Expand Down
3 changes: 2 additions & 1 deletion services/libs/tinybird/datasources/repositories.datasource
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ SCHEMA >
`createdAt` DateTime64(3) `json:$.record.createdAt`,
`updatedAt` DateTime64(3) `json:$.record.updatedAt`,
`deletedAt` Nullable(DateTime64(3)) `json:$.record.deletedAt`,
`lastArchivedCheckAt` Nullable(DateTime64(3)) `json:$.record.lastArchivedCheckAt`
`lastArchivedCheckAt` Nullable(DateTime64(3)) `json:$.record.lastArchivedCheckAt`,
`licenses` Array(String) `json:$.record.licenses` DEFAULT []

ENGINE ReplacingMergeTree
ENGINE_PARTITION_KEY toYear(createdAt)
Expand Down
3 changes: 2 additions & 1 deletion services/libs/tinybird/pipes/insightsProjects_filtered.pipe
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ SQL >
insights_projects_populated_ds.communityLanguages,
insights_projects_populated_ds.status,
insights_projects_populated_ds.maturity,
insights_projects_populated_ds.lastVulnerabilityScanStatus
insights_projects_populated_ds.lastVulnerabilityScanStatus,
insights_projects_populated_ds.repoLicenses
FROM insights_projects_populated_ds
where
insights_projects_populated_ds.enabled = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ SQL >

NODE insights_projects_populated_copy_flatten_projects
SQL >
SELECT i.id, r.url AS repository
SELECT i.id, r.url AS repository, r.licenses AS licenses
FROM insightsProjects i FINAL
JOIN
repositories r FINAL ON r.insightsProjectId = i.id AND isNull (r.deletedAt) AND r.enabled = true
Expand All @@ -77,6 +77,7 @@ SQL >
SELECT
insights_projects_populated_copy_flatten_projects.id as id,
insights_projects_populated_copy_flatten_projects.repository as repository,
insights_projects_populated_copy_flatten_projects.licenses as licenses,
insights_projects_populated_copy_criticality_scores_deduplicated.score as score,
insights_projects_populated_copy_criticality_scores_deduplicated.rank as rank
FROM insights_projects_populated_copy_flatten_projects
Expand All @@ -92,7 +93,8 @@ SQL >
max(score) as projectScore,
argMax(rank, score) AS projectRank,
groupArray((repository, score, rank)) as repoData,
groupArray(repository) as repositories
groupArray(repository) as repositories,
arrayFlatten(groupArray(arrayMap(l -> tuple(repository, l), licenses))) as repoLicenses
FROM insights_projects_populated_copy_repository_criticality
GROUP BY id

Expand Down Expand Up @@ -209,6 +211,7 @@ SQL >
any (insightsProjects.twitter) as twitter,
any (insightsProjects.widgets) as widgets,
any (insights_projects_populated_copy_project_repo_data.repositories) as repositories,
any (insights_projects_populated_copy_project_repo_data.repoLicenses) as repoLicenses,
any (insightsProjects.enabled) as enabled,
any (insightsProjects.isLF) as isLF,
any (insightsProjects.keywords) as keywords,
Expand Down
Loading