Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions providers/fetch/pypiFetch.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const AbstractFetch = require('./abstractFetch')
const nodeRequest = require('../../lib/fetch')
const fs = require('node:fs')
const spdxCorrect = require('spdx-correct')
const SPDX = require('@clearlydefined/spdx')
const { findLastKey, get, find, clone } = require('lodash')
const FetchResult = require('../../lib/fetchResult')

Expand Down Expand Up @@ -96,7 +97,23 @@ class PyPiFetch extends AbstractFetch {
return null
}

_extractLicenseExpression(registryData) {
const licenseExpression = get(registryData, 'info.license_expression')
if (licenseExpression && typeof licenseExpression === 'string' && licenseExpression !== '') {
return licenseExpression
}
return null
}

_extractDeclaredLicense(registryData) {
const licenseExpression = this._extractLicenseExpression(registryData)
if (licenseExpression) {
// PEP 639: license_expression is authoritative, no fallback to legacy fields
// Even if SPDX.normalize returns NOASSERTION for invalid expressions, we don't
// fall back to info.license or classifiers to maintain PEP 639 compliance
return SPDX.normalize(licenseExpression)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a package sets license_expression to garbage like "Proprietary - see LICENSE", SPDX.normalize returns "NOASSERTION" and we bail out, never checking info.license or classifiers.

That's probably the right call per PEP 639 (the field is supposed to be authoritative), but it's a real behavior change. A package that previously resolved fine via info.license: "MIT" would now get NOASSERTION if someone also set a bad license_expression.

Two asks:

  1. Add a test that explicitly covers this case so it's clearly intentional (not an accident).
  2. Maybe a brief code comment here — something like // PEP 639: license_expression is authoritative, no fallback to legacy fields so the next person doesn't "fix" it by adding a fallback.

}

const licenseInMetadata = get(registryData, 'info.license')
const hasVersionInMeta = /\d+/.test(licenseInMetadata)
const licenseInClassifiers = this._extractLicenseFromClassifiers(registryData)
Expand Down
110 changes: 110 additions & 0 deletions test/unit/providers/fetch/pypiFetchTests.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,117 @@ describe('pypiFetch handle function', () => {
expect(result.outcome).to.be.equal('Missing ')
})

describe('extractLicenseExpression (PEP-639)', () => {
it('returns license_expression when present', () => {
const registryData = { info: { license_expression: 'MIT' } }
const result = fetch._extractLicenseExpression(registryData)
expect(result).to.be.equal('MIT')
})

it('returns null when license_expression is missing', () => {
const registryData = { info: { license: 'MIT' } }
const result = fetch._extractLicenseExpression(registryData)
expect(result).to.be.null
})

it('returns null when license_expression is null', () => {
const registryData = { info: { license_expression: null } }
const result = fetch._extractLicenseExpression(registryData)
expect(result).to.be.null
})

it('returns null when license_expression is empty string', () => {
const registryData = { info: { license_expression: '' } }
const result = fetch._extractLicenseExpression(registryData)
expect(result).to.be.null
})

it('returns null when license_expression is not a string', () => {
const registryData = { info: { license_expression: 123 } }
const result = fetch._extractLicenseExpression(registryData)
expect(result).to.be.null
})
})

describe('extractDeclaredLicense', () => {
it('prioritizes license_expression over info.license (PEP-639)', () => {
const registryData = {
info: {
license_expression: 'MIT',
license: 'Apache-2.0'
}
}
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('MIT')
})

it('prioritizes license_expression over classifiers (PEP-639)', () => {
const registryData = {
info: {
license_expression: 'BSD-3-Clause',
license: null,
classifiers: ['License :: OSI Approved :: MIT License']
}
}
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('BSD-3-Clause')
})

it('normalizes compound SPDX expressions with LicenseRef', () => {
const registryData = { info: { license_expression: 'MIT AND LicenseRef-proprietary' } }
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('MIT AND LicenseRef-proprietary')
})

it('normalizes expressions with WITH clause', () => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test covers a valid exception (Classpath-exception-2.0). Worth knowing that SPDX.normalize collapses the entire expression to NOASSERTION for an invalid exception e.g., Apache-2.0 WITH commons-clause becomes NOASSERTION, not Apache-2.0 WITH NOASSERTION. That's @clearlydefined/spdx behavior, not a bug in this PR, but a test covering it would prevent confusion later.

const registryData = { info: { license_expression: 'GPL-2.0-or-later WITH Classpath-exception-2.0' } }
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('GPL-2.0-or-later WITH Classpath-exception-2.0')
})

it('returns NOASSERTION for invalid exception in WITH clause', () => {
const registryData = { info: { license_expression: 'Apache-2.0 WITH commons-clause' } }
const declared = fetch._extractDeclaredLicense(registryData)
// SPDX.normalize collapses entire expression to NOASSERTION for invalid exceptions,
// not just the exception part (e.g., not "Apache-2.0 WITH NOASSERTION")
expect(declared).to.be.equal('NOASSERTION')
})

it('returns NOASSERTION for invalid parts in compound license_expression', () => {
const registryData = {
info: {
license_expression: 'MIT OR UNKNOWN'
}
}
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('MIT OR NOASSERTION')
})

it('does not fallback to info.license when license_expression is invalid (PEP-639)', () => {
const registryData = {
info: {
license_expression: 'Proprietary - see LICENSE',
license: 'MIT',
classifiers: ['License :: OSI Approved :: Apache Software License']
}
}
const declared = fetch._extractDeclaredLicense(registryData)
// PEP 639: license_expression is authoritative, even if invalid.
// No fallback to legacy fields (info.license or classifiers)
expect(declared).to.be.equal('NOASSERTION')
})

it('falls back to info.license when license_expression is missing', () => {
const registryData = {
info: {
license: 'MIT',
classifiers: []
}
}
const declared = fetch._extractDeclaredLicense(registryData)
expect(declared).to.be.equal('MIT')
})

it('parses LGPL-2.1-only: info.license over classifiers (crawler/issues/523)', () => {
const registryData = JSON.parse(fs.readFileSync('test/fixtures/pypi/registryData_lgpl2.json'))
const declared = fetch._extractDeclaredLicense(registryData)
Expand Down