Skip to content

Commit 35cc08f

Browse files
authored
Merge pull request #13 from aboutcode-org/snippet-tests
Snippet tests
2 parents 7b96032 + fc6b4b4 commit 35cc08f

File tree

8 files changed

+15638
-1213
lines changed

8 files changed

+15638
-1213
lines changed

azure-pipelines.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ jobs:
1111
parameters:
1212
job_name: ubuntu20_cpython
1313
image_name: ubuntu-20.04
14-
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
14+
python_versions: ['3.9', '3.10', '3.11', '3.12']
1515
test_suites:
1616
all: venv/bin/pytest -n 2 -vvs --ignore src/matchcode_toolkit/pipelines
1717

1818
- template: etc/ci/azure-posix.yml
1919
parameters:
2020
job_name: ubuntu22_cpython
2121
image_name: ubuntu-22.04
22-
python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
22+
python_versions: ['3.9', '3.10', '3.11', '3.12']
2323
test_suites:
2424
all: venv/bin/pytest -n 2 -vvs --ignore src/matchcode_toolkit/pipelines

requirements-dev.txt

Lines changed: 49 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,87 @@
1-
aboutcode-toolkit==10.1.0
1+
aboutcode-toolkit==11.0.0
22
banal==1.0.6
3+
beartype==0.19.0
34
binaryornot==0.4.4
4-
black==24.2.0
5+
black==24.10.0
56
boolean.py==4.0
6-
cffi==1.16.0
7-
chardet==5.2.0
7+
cffi==1.17.1
88
colorama==0.4.6
9-
container-inspector==32.0.1
10-
cryptography==42.0.5
9+
container-inspector==33.0.0
10+
cryptography==43.0.3
1111
debian_inspector==31.1.0
1212
dockerfile-parse==2.0.1
13-
docutils==0.20.1
13+
docutils==0.21.2
1414
dparse2==0.7.0
15-
et-xmlfile==1.1.0
16-
execnet==2.0.2
15+
et_xmlfile==2.0.0
16+
execnet==2.1.1
1717
extractcode==31.0.0
1818
extractcode-7z==16.5.210531
1919
extractcode-libarchive==3.5.1.210531
2020
fasteners==0.19
2121
fingerprints==1.2.3
22-
ftfy==6.1.3
23-
gemfileparser2==0.9.3
22+
ftfy==6.3.1
23+
gemfileparser2==0.9.4
2424
html5lib==1.1
25-
importlib-metadata==7.0.1
25+
importlib_metadata==8.5.0
2626
iniconfig==2.0.0
27-
intbitset==3.1.0
28-
isodate==0.6.1
27+
intbitset==4.0.0
2928
isort==5.13.2
30-
jaraco.classes==3.3.1
31-
jaraco.functools==4.0.0
29+
jaraco.classes==3.4.0
30+
jaraco.context==6.0.1
31+
jaraco.functools==4.1.0
3232
javaproperties==0.8.1
3333
jeepney==0.8.0
34-
jinja2==3.1.3
34+
jinja2==3.1.4
3535
jsonstreams==0.6.0
36-
keyring==24.3.0
37-
license-expression==30.2.0
38-
lxml==4.9.4
36+
keyring==25.5.0
37+
license-expression==30.4.0
38+
lxml==5.3.0
3939
markdown-it-py==3.0.0
40-
markupsafe==2.1.5
40+
markupsafe==3.0.2
4141
mdurl==0.1.2
42-
more-itertools==10.2.0
42+
more-itertools==10.5.0
4343
mypy-extensions==1.0.0
44-
nh3==0.2.15
44+
nh3==0.2.18
4545
normality==2.5.0
46-
openpyxl==3.1.2
47-
packageurl-python==0.13.4
48-
packaging==23.2
46+
openpyxl==3.1.5
47+
packageurl-python==0.16.0
48+
packaging==24.1
4949
packvers==21.5
5050
parameter-expansion-patched==0.3.1
5151
pathspec==0.12.1
52-
pdfminer.six==20231228
53-
pefile==2023.2.7
52+
pdfminer.six==20240706
53+
pefile==2024.8.26
5454
pip-requirements-parser==32.0.1
55-
pkginfo==1.9.6
55+
pkginfo==1.10.0
5656
pkginfo2==30.0.0
57-
platformdirs==4.2.0
57+
platformdirs==4.3.6
5858
ply==3.11
5959
publicsuffix2==2.20191221
60-
pyahocorasick==2.0.0
61-
pycodestyle==2.11.1
62-
pycparser==2.21
63-
pygmars==0.8.0
64-
pygments==2.17.2
65-
pymaven-patch==0.3.0
66-
pyparsing==3.1.1
67-
pytest==8.0.1
68-
pytest-xdist==3.5.0
69-
rdflib==7.0.0
70-
readme-renderer==42.0
60+
pyahocorasick==2.1.0
61+
pycodestyle==2.12.1
62+
pycparser==2.22
63+
pygmars==0.9.0
64+
pygments==2.18.0
65+
pymaven-patch==0.3.2
66+
pyparsing==3.2.0
67+
pytest==8.3.3
68+
pytest-xdist==3.6.1
69+
rdflib==7.1.1
70+
readme_renderer==44.0
7171
requests-toolbelt==1.0.0
7272
rfc3986==2.0.0
73-
rich==13.7.0
74-
scancode-toolkit==32.0.8
73+
rich==13.9.4
7574
secretstorage==3.3.3
75+
semantic-version==2.10.0
7676
six==1.16.0
77-
spdx-tools==0.7.0rc0
77+
spdx-tools==0.8.2
7878
toml==0.10.2
79-
twine==5.0.0
80-
typecode==30.0.1
79+
twine==5.1.1
80+
typecode==30.0.2
8181
typecode-libmagic==5.39.210531
82+
uritools==4.0.3
8283
urlpy==0.5
8384
wcwidth==0.2.13
8485
webencodings==0.5.1
85-
xmltodict==0.13.0
86-
zipp==3.17.0
86+
xmltodict==0.14.2
87+
zipp==3.20.2

requirements.txt

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
1-
attrs==23.2.0
1+
attrs==24.2.0
22
beautifulsoup4==4.12.3
3-
bitarray==2.9.2
4-
certifi==2024.2.2
5-
charset-normalizer==3.3.2
3+
bitarray==3.0.0
4+
certifi==2024.8.30
5+
chardet==5.2.0
6+
charset-normalizer==3.4.0
67
click==8.1.7
7-
commoncode==31.0.3
8-
idna==3.6
9-
pip==23.3.1
10-
pluggy==1.4.0
8+
commoncode==32.0.0
9+
idna==3.10
10+
pip==24.2
11+
pluggy==1.5.0
1112
plugincode==32.0.0
12-
PyYAML==6.0.1
13-
requests==2.31.0
14-
saneyaml==0.6.0
15-
setuptools==69.0.2
16-
soupsieve==2.5
13+
PyYAML==6.0.2
14+
requests==2.32.3
15+
saneyaml==0.6.1
16+
setuptools==75.3.0
17+
soupsieve==2.6
1718
text-unidecode==1.3
18-
urllib3==2.2.1
19-
wheel==0.42.0
19+
urllib3==2.2.3
20+
wheel==0.44.0

setup.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ zip_safe = false
4141

4242
setup_requires = setuptools_scm[toml] >= 4
4343

44-
python_requires = >=3.7
44+
python_requires = >=3.9
4545

4646
install_requires =
4747
bitarray
@@ -62,7 +62,7 @@ testing =
6262
twine
6363
black
6464
isort
65-
scancode-toolkit
65+
scancode-toolkit @ git+https://github.com/nexB/scancode-toolkit.git@15b76ea4f86327f11dd509d674cc7dab6fa52b5b
6666

6767
docs =
6868
Sphinx>=5.0.2

src/matchcode_toolkit/fingerprinting.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def tokenizer(text):
194194
return _tokenizer(text.lower())
195195

196196

197-
def get_file_fingerprint_hashes(location, ngram_length=8, window_length=64, **kwargs):
197+
def get_file_fingerprint_hashes(location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs):
198198
"""
199199
Return a mapping of fingerprint hashes for the file at `location`
200200
@@ -221,12 +221,13 @@ def get_file_fingerprint_hashes(location, ngram_length=8, window_length=64, **kw
221221
content,
222222
ngram_length=ngram_length,
223223
window_length=window_length,
224+
include_ngrams=include_ngrams,
224225
)
225226

226227

227-
def create_file_fingerprints(content, ngram_length=8, window_length=64):
228+
def create_file_fingerprints(content, ngram_length=5, window_length=16, include_ngrams=False):
228229
"""
229-
Return a mapping of halo1 and snippet hashes from content
230+
Return a mapping of halo1 and snippet hashes from content string
230231
"""
231232
from licensedcode.tokenize import ngrams
232233
from licensedcode.tokenize import select_ngrams
@@ -236,12 +237,13 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
236237
"snippets": [],
237238
}
238239

239-
# tokenize content intow words
240+
# tokenize content into words
240241
words = list(tokenizer(content))
241242

242243
# Create a file fingerprint from the number of elements in the content hash
243244
# and the content hash digest itself.
244245
ngs = ngrams(words, ngram_length)
246+
# TODO: consider using itertools.chain.from_iterable()
245247
ngs_bytes = [[g.encode("utf-8") for g in ng] for ng in ngs]
246248
ngs_bytes = [b"".join(ng) for ng in ngs_bytes]
247249
content_hash, ngs_count = BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
@@ -251,14 +253,23 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
251253
file_fingerprint = ngs_count_hex_str + content_fingerprint
252254
fingerprints["halo1"] = file_fingerprint
253255

254-
# Select windows from the content to find snippet similarities
256+
# Select windows from the content to compute snippet fingerprints
255257
windows = ngrams(words, window_length)
256-
selected_windows = select_ngrams(windows)
257-
selected_windows_bytes = [[g.encode("utf-8") for g in window] for window in selected_windows]
258-
selected_windows_bytes = [b"".join(window) for window in selected_windows_bytes]
259-
snippets = [
260-
BitAverageHaloHash(window).hexdigest().decode("utf-8") for window in selected_windows_bytes
258+
selected_windows = list(select_ngrams(windows, with_pos=True))
259+
# TODO: consider using itertools.chain.from_iterable()
260+
selected_windows_bytes = [
261+
(pos, [g.encode("utf-8") for g in window]) for pos, window in selected_windows
261262
]
263+
selected_windows_bytes = [(pos, b"".join(window)) for pos, window in selected_windows_bytes]
264+
snippets = []
265+
for (pos, window_bytes), (_, window) in zip(selected_windows_bytes, selected_windows):
266+
s = {
267+
"position": pos,
268+
"snippet": BitAverageHaloHash(window_bytes).hexdigest().decode("utf-8"),
269+
}
270+
if include_ngrams:
271+
s["ngrams"] = list(window)
272+
snippets.append(s)
262273
if snippets:
263274
fingerprints["snippets"] = snippets
264275

tests/test_fingerprinting.py

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
#
99

1010
import os
11+
from collections import defaultdict
1112

1213
from commoncode.resource import VirtualCodebase
1314
from commoncode.testcase import FileBasedTesting
15+
from commoncode.testcase import check_against_expected_json_file
1416

1517
from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
1618
from matchcode_toolkit.fingerprinting import _get_resource_subpath
@@ -137,13 +139,13 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):
137139
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
138140
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
139141

140-
expected_result1_indexed_elements_count = 6395
141-
expected_result2_indexed_elements_count = 6388
142+
expected_result1_indexed_elements_count = 6398
143+
expected_result2_indexed_elements_count = 6391
142144
self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
143145
self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)
144146

145-
expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
146-
expected_result2_fingerprint = "aa3a49e4cd40718d1297be519e6564a4"
147+
expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919"
148+
expected_result2_fingerprint = "dc025ae7ebb104419e5354c665a0891d"
147149
self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
148150
self.assertEqual(expected_result2_fingerprint, result2_fingerprint)
149151

@@ -159,26 +161,76 @@ def test_get_file_fingerprint_hashes_one_line_added(self):
159161
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
160162
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
161163

162-
expected_result1_indexed_elements_count = 6395
163-
expected_result2_indexed_elements_count = 6398
164+
expected_result1_indexed_elements_count = 6398
165+
expected_result2_indexed_elements_count = 6401
164166
self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
165167
self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)
166168

167-
expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
168-
expected_result2_fingerprint = "a23b49e4cd40708d1297be719c6564a4"
169+
expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919"
170+
expected_result2_fingerprint = "dc025ae7ebb104419e5314c665a1891d"
169171
self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
170172
self.assertEqual(expected_result2_fingerprint, result2_fingerprint)
171173

172-
self.assertEqual(3, byte_hamming_distance(result1_fingerprint, result2_fingerprint))
174+
self.assertEqual(2, byte_hamming_distance(result1_fingerprint, result2_fingerprint))
175+
176+
@classmethod
177+
def _create_snippet_mappings_by_snippets(cls, snippets):
178+
snippet_mappings_by_snippet = defaultdict(list)
179+
for s in snippets:
180+
snippet = s["snippet"]
181+
snippet_mappings_by_snippet[snippet].append(s)
182+
return snippet_mappings_by_snippet
173183

174-
def test_snippets_similarity(self):
184+
def test_snippets_similarity(self, regen=False):
175185
# 1 function from adler32.c has been added to zutil.c
176186
test_file1 = self.get_test_loc("snippets/adler32.c")
177187
test_file2 = self.get_test_loc("snippets/zutil.c")
178-
results1 = get_file_fingerprint_hashes(test_file1)
179-
results2 = get_file_fingerprint_hashes(test_file2)
180-
result1 = results1.get("snippets")
181-
result2 = results2.get("snippets")
182-
expected_result = {"16e774a453769c012ca1e7f3685b4111", "498885acf844eda1f65af9e746deaff7"}
183-
result = set(result1).intersection(result2)
184-
self.assertEqual(expected_result, result)
188+
results1 = get_file_fingerprint_hashes(test_file1, include_ngrams=True)
189+
results2 = get_file_fingerprint_hashes(test_file2, include_ngrams=True)
190+
results1_snippets = results1.get("snippets")
191+
results2_snippets = results2.get("snippets")
192+
193+
results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
194+
results1_snippets
195+
)
196+
results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
197+
198+
matching_snippets = (
199+
results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
200+
)
201+
expected_matching_snippets = {
202+
"33b1d50de7e1701bd4beb706bf25970e",
203+
"0dcb44bfa9a7c7e310ea9d4a921b777b",
204+
"9bc102ceddabba9c1dc31140500e6c6c",
205+
"310e6e530d4bda6977774b34515101ab",
206+
"cd50d59e9cd0df93ef6b8dfbf0f7d311",
207+
"5af889295c942ecb75189c86df62e201",
208+
"0057152e3b1795b6befd36a4412c21a5",
209+
"c09e0b1020b5265ccac6d03439dff2dc",
210+
"ecbedbeebd47e4a24210bfb8419c9f8e",
211+
"3c866b47965d9cc62c4640e3ae132d2b",
212+
"2b74fe7dde58dfa20bf75a6b4e589a10",
213+
"07a7b1300fb58b5f9b9b3e56df23e003",
214+
"72a86996522cfb9f83cf388d8010b7ab",
215+
"d45f0d54c32b2d884919665c65c65638",
216+
"cac65171e0f01c57e1af7a5b99929d12",
217+
"8571422ee6dec38705bcdb8c12496473",
218+
"b9db06731d27c61a56600e74d145e814",
219+
"ba34fbe4e05f3f28641958ecc5eb9af9",
220+
"de43d78e467331cc3bcbf87fdb3c90c3",
221+
}
222+
self.assertEqual(expected_matching_snippets, matching_snippets)
223+
224+
results = []
225+
for snippet in sorted(matching_snippets):
226+
sorted_results1 = results1_snippet_mappings_by_snippets[snippet]
227+
sorted_results2 = results2_snippet_mappings_by_snippets[snippet]
228+
results.append(
229+
{
230+
"snippet": snippet,
231+
"snippet_matched_to_results1": sorted_results1,
232+
"snippet_matched_to_results2": sorted_results2,
233+
}
234+
)
235+
expected_results_loc = self.get_test_loc("snippets/snippet-similarity-expected.json")
236+
check_against_expected_json_file(results, expected_results_loc, regen=regen)

0 commit comments

Comments
 (0)