Merge pull request #13 from aboutcode-org/snippet-tests

JonoYang · web-flow · commit 35cc08fdf98f · 2024-11-07T15:44:43.000-08:00
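Snippet tests: return each snippet fingerprint as a mapping with its position (and, optionally, its ngrams), lower the default ngram_length/window_length to 5/16, update the fingerprinting tests accordingly, and drop Python 3.8 from CI and python_requires.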
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -11,14 +11,14 @@ jobs:
       parameters:
           job_name: ubuntu20_cpython
           image_name: ubuntu-20.04
-          python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python_versions: ['3.9', '3.10', '3.11', '3.12']
           test_suites:
               all: venv/bin/pytest -n 2 -vvs --ignore src/matchcode_toolkit/pipelines
 
     - template: etc/ci/azure-posix.yml
       parameters:
           job_name: ubuntu22_cpython
           image_name: ubuntu-22.04
-          python_versions: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python_versions: ['3.9', '3.10', '3.11', '3.12']
           test_suites:
               all: venv/bin/pytest -n 2 -vvs --ignore src/matchcode_toolkit/pipelines
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,86 +1,87 @@
-aboutcode-toolkit==10.1.0
+aboutcode-toolkit==11.0.0
 banal==1.0.6
+beartype==0.19.0
 binaryornot==0.4.4
-black==24.2.0
+black==24.10.0
 boolean.py==4.0
-cffi==1.16.0
-chardet==5.2.0
+cffi==1.17.1
 colorama==0.4.6
-container-inspector==32.0.1
-cryptography==42.0.5
+container-inspector==33.0.0
+cryptography==43.0.3
 debian_inspector==31.1.0
 dockerfile-parse==2.0.1
-docutils==0.20.1
+docutils==0.21.2
 dparse2==0.7.0
-et-xmlfile==1.1.0
-execnet==2.0.2
+et_xmlfile==2.0.0
+execnet==2.1.1
 extractcode==31.0.0
 extractcode-7z==16.5.210531
 extractcode-libarchive==3.5.1.210531
 fasteners==0.19
 fingerprints==1.2.3
-ftfy==6.1.3
-gemfileparser2==0.9.3
+ftfy==6.3.1
+gemfileparser2==0.9.4
 html5lib==1.1
-importlib-metadata==7.0.1
+importlib_metadata==8.5.0
 iniconfig==2.0.0
-intbitset==3.1.0
-isodate==0.6.1
+intbitset==4.0.0
 isort==5.13.2
-jaraco.classes==3.3.1
-jaraco.functools==4.0.0
+jaraco.classes==3.4.0
+jaraco.context==6.0.1
+jaraco.functools==4.1.0
 javaproperties==0.8.1
 jeepney==0.8.0
-jinja2==3.1.3
+jinja2==3.1.4
 jsonstreams==0.6.0
-keyring==24.3.0
-license-expression==30.2.0
-lxml==4.9.4
+keyring==25.5.0
+license-expression==30.4.0
+lxml==5.3.0
 markdown-it-py==3.0.0
-markupsafe==2.1.5
+markupsafe==3.0.2
 mdurl==0.1.2
-more-itertools==10.2.0
+more-itertools==10.5.0
 mypy-extensions==1.0.0
-nh3==0.2.15
+nh3==0.2.18
 normality==2.5.0
-openpyxl==3.1.2
-packageurl-python==0.13.4
-packaging==23.2
+openpyxl==3.1.5
+packageurl-python==0.16.0
+packaging==24.1
 packvers==21.5
 parameter-expansion-patched==0.3.1
 pathspec==0.12.1
-pdfminer.six==20231228
-pefile==2023.2.7
+pdfminer.six==20240706
+pefile==2024.8.26
 pip-requirements-parser==32.0.1
-pkginfo==1.9.6
+pkginfo==1.10.0
 pkginfo2==30.0.0
-platformdirs==4.2.0
+platformdirs==4.3.6
 ply==3.11
 publicsuffix2==2.20191221
-pyahocorasick==2.0.0
-pycodestyle==2.11.1
-pycparser==2.21
-pygmars==0.8.0
-pygments==2.17.2
-pymaven-patch==0.3.0
-pyparsing==3.1.1
-pytest==8.0.1
-pytest-xdist==3.5.0
-rdflib==7.0.0
-readme-renderer==42.0
+pyahocorasick==2.1.0
+pycodestyle==2.12.1
+pycparser==2.22
+pygmars==0.9.0
+pygments==2.18.0
+pymaven-patch==0.3.2
+pyparsing==3.2.0
+pytest==8.3.3
+pytest-xdist==3.6.1
+rdflib==7.1.1
+readme_renderer==44.0
 requests-toolbelt==1.0.0
 rfc3986==2.0.0
-rich==13.7.0
-scancode-toolkit==32.0.8
+rich==13.9.4
 secretstorage==3.3.3
+semantic-version==2.10.0
 six==1.16.0
-spdx-tools==0.7.0rc0
+spdx-tools==0.8.2
 toml==0.10.2
-twine==5.0.0
-typecode==30.0.1
+twine==5.1.1
+typecode==30.0.2
 typecode-libmagic==5.39.210531
+uritools==4.0.3
 urlpy==0.5
 wcwidth==0.2.13
 webencodings==0.5.1
-xmltodict==0.13.0
-zipp==3.17.0
+xmltodict==0.14.2
+zipp==3.20.2
diff --git a/requirements.txt b/requirements.txt
@@ -1,19 +1,20 @@
-attrs==23.2.0
+attrs==24.2.0
 beautifulsoup4==4.12.3
-bitarray==2.9.2
-certifi==2024.2.2
-charset-normalizer==3.3.2
+bitarray==3.0.0
+certifi==2024.8.30
+chardet==5.2.0
+charset-normalizer==3.4.0
 click==8.1.7
-commoncode==31.0.3
-idna==3.6
-pip==23.3.1
-pluggy==1.4.0
+commoncode==32.0.0
+idna==3.10
+pip==24.2
+pluggy==1.5.0
 plugincode==32.0.0
-PyYAML==6.0.1
-requests==2.31.0
-saneyaml==0.6.0
-setuptools==69.0.2
-soupsieve==2.5
+PyYAML==6.0.2
+requests==2.32.3
+saneyaml==0.6.1
+setuptools==75.3.0
+soupsieve==2.6
 text-unidecode==1.3
-urllib3==2.2.1
-wheel==0.42.0
+urllib3==2.2.3
+wheel==0.44.0
diff --git a/setup.cfg b/setup.cfg
@@ -41,7 +41,7 @@ zip_safe = false
 
 setup_requires = setuptools_scm[toml] >= 4
 
-python_requires = >=3.7
+python_requires = >=3.9
 
 install_requires =
     bitarray
@@ -62,7 +62,7 @@ testing =
     twine
     black
     isort
-    scancode-toolkit
+    scancode-toolkit @ git+https://github.com/nexB/scancode-toolkit.git@15b76ea4f86327f11dd509d674cc7dab6fa52b5b
 
 docs =
     Sphinx>=5.0.2
diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
@@ -194,7 +194,7 @@ def tokenizer(text):
     return _tokenizer(text.lower())
 
 
-def get_file_fingerprint_hashes(location, ngram_length=8, window_length=64, **kwargs):
+def get_file_fingerprint_hashes(location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs):
     """
     Return a mapping of fingerprint hashes for the file at `location`
 
@@ -221,12 +221,13 @@ def get_file_fingerprint_hashes(location, ngram_length=8, window_length=64, **kw
         content,
         ngram_length=ngram_length,
         window_length=window_length,
+        include_ngrams=include_ngrams,
     )
 
 
-def create_file_fingerprints(content, ngram_length=8, window_length=64):
+def create_file_fingerprints(content, ngram_length=5, window_length=16, include_ngrams=False):
     """
-    Return a mapping of halo1 and snippet hashes from content
+    Return a mapping of halo1 and snippet hashes from content string
     """
     from licensedcode.tokenize import ngrams
     from licensedcode.tokenize import select_ngrams
@@ -236,12 +237,13 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
         "snippets": [],
     }
 
-    # tokenize content intow words
+    # tokenize content into words
     words = list(tokenizer(content))
 
     # Create a file fingerprint from the number of elements in the content hash
     # and the content hash digest iteself.
     ngs = ngrams(words, ngram_length)
+    # TODO: consider using itertools.chain.from_iterable()
     ngs_bytes = [[g.encode("utf-8") for g in ng] for ng in ngs]
     ngs_bytes = [b"".join(ng) for ng in ngs_bytes]
     content_hash, ngs_count = BitAverageHaloHash(ngs_bytes), len(ngs_bytes)
@@ -251,14 +253,23 @@ def create_file_fingerprints(content, ngram_length=8, window_length=64):
         file_fingerprint = ngs_count_hex_str + content_fingerprint
         fingerprints["halo1"] = file_fingerprint
 
-    # Select windows from the content to find snippet similarities
+    # Select windows from the content to compute snippet fingerprints
     windows = ngrams(words, window_length)
-    selected_windows = select_ngrams(windows)
-    selected_windows_bytes = [[g.encode("utf-8") for g in window] for window in selected_windows]
-    selected_windows_bytes = [b"".join(window) for window in selected_windows_bytes]
-    snippets = [
-        BitAverageHaloHash(window).hexdigest().decode("utf-8") for window in selected_windows_bytes
+    selected_windows = list(select_ngrams(windows, with_pos=True))
+    # TODO: consider using itertools.chain.from_iterable()
+    selected_windows_bytes = [
+        (pos, [g.encode("utf-8") for g in window]) for pos, window in selected_windows
     ]
+    selected_windows_bytes = [(pos, b"".join(window)) for pos, window in selected_windows_bytes]
+    snippets = []
+    for (pos, window_bytes), (_, window) in zip(selected_windows_bytes, selected_windows):
+        s = {
+            "position": pos,
+            "snippet": BitAverageHaloHash(window_bytes).hexdigest().decode("utf-8"),
+        }
+        if include_ngrams:
+            s["ngrams"] = list(window)
+        snippets.append(s)
     if snippets:
         fingerprints["snippets"] = snippets
 
diff --git a/tests/test_fingerprinting.py b/tests/test_fingerprinting.py
@@ -8,9 +8,11 @@
 #
 
 import os
+from collections import defaultdict
 
 from commoncode.resource import VirtualCodebase
 from commoncode.testcase import FileBasedTesting
+from commoncode.testcase import check_against_expected_json_file
 
 from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
 from matchcode_toolkit.fingerprinting import _get_resource_subpath
@@ -137,13 +139,13 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):
         result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
         result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
 
-        expected_result1_indexed_elements_count = 6395
-        expected_result2_indexed_elements_count = 6388
+        expected_result1_indexed_elements_count = 6398
+        expected_result2_indexed_elements_count = 6391
         self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
         self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)
 
-        expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
-        expected_result2_fingerprint = "aa3a49e4cd40718d1297be519e6564a4"
+        expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919"
+        expected_result2_fingerprint = "dc025ae7ebb104419e5354c665a0891d"
         self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
         self.assertEqual(expected_result2_fingerprint, result2_fingerprint)
 
@@ -159,26 +161,76 @@ def test_get_file_fingerprint_hashes_one_line_added(self):
         result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
         result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
 
-        expected_result1_indexed_elements_count = 6395
-        expected_result2_indexed_elements_count = 6398
+        expected_result1_indexed_elements_count = 6398
+        expected_result2_indexed_elements_count = 6401
         self.assertEqual(expected_result1_indexed_elements_count, result1_indexed_elements_count)
         self.assertEqual(expected_result2_indexed_elements_count, result2_indexed_elements_count)
 
-        expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4"
-        expected_result2_fingerprint = "a23b49e4cd40708d1297be719c6564a4"
+        expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919"
+        expected_result2_fingerprint = "dc025ae7ebb104419e5314c665a1891d"
         self.assertEqual(expected_result1_fingerprint, result1_fingerprint)
         self.assertEqual(expected_result2_fingerprint, result2_fingerprint)
 
-        self.assertEqual(3, byte_hamming_distance(result1_fingerprint, result2_fingerprint))
+        self.assertEqual(2, byte_hamming_distance(result1_fingerprint, result2_fingerprint))
+
+    @classmethod
+    def _create_snippet_mappings_by_snippets(cls, snippets):
+        snippet_mappings_by_snippet = defaultdict(list)
+        for s in snippets:
+            snippet = s["snippet"]
+            snippet_mappings_by_snippet[snippet].append(s)
+        return snippet_mappings_by_snippet
 
-    def test_snippets_similarity(self):
+    def test_snippets_similarity(self, regen=False):
         # 1 function from adler32.c has been added to zutil.c
         test_file1 = self.get_test_loc("snippets/adler32.c")
         test_file2 = self.get_test_loc("snippets/zutil.c")
-        results1 = get_file_fingerprint_hashes(test_file1)
-        results2 = get_file_fingerprint_hashes(test_file2)
-        result1 = results1.get("snippets")
-        result2 = results2.get("snippets")
-        expected_result = {"16e774a453769c012ca1e7f3685b4111", "498885acf844eda1f65af9e746deaff7"}
-        result = set(result1).intersection(result2)
-        self.assertEqual(expected_result, result)
+        results1 = get_file_fingerprint_hashes(test_file1, include_ngrams=True)
+        results2 = get_file_fingerprint_hashes(test_file2, include_ngrams=True)
+        results1_snippets = results1.get("snippets")
+        results2_snippets = results2.get("snippets")
+
+        results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
+            results1_snippets
+        )
+        results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
+
+        matching_snippets = (
+            results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
+        )
+        expected_matching_snippets = {
+            "33b1d50de7e1701bd4beb706bf25970e",
+            "0dcb44bfa9a7c7e310ea9d4a921b777b",
+            "9bc102ceddabba9c1dc31140500e6c6c",
+            "310e6e530d4bda6977774b34515101ab",
+            "cd50d59e9cd0df93ef6b8dfbf0f7d311",
+            "5af889295c942ecb75189c86df62e201",
+            "0057152e3b1795b6befd36a4412c21a5",
+            "c09e0b1020b5265ccac6d03439dff2dc",
+            "ecbedbeebd47e4a24210bfb8419c9f8e",
+            "3c866b47965d9cc62c4640e3ae132d2b",
+            "2b74fe7dde58dfa20bf75a6b4e589a10",
+            "07a7b1300fb58b5f9b9b3e56df23e003",
+            "72a86996522cfb9f83cf388d8010b7ab",
+            "d45f0d54c32b2d884919665c65c65638",
+            "cac65171e0f01c57e1af7a5b99929d12",
+            "8571422ee6dec38705bcdb8c12496473",
+            "b9db06731d27c61a56600e74d145e814",
+            "ba34fbe4e05f3f28641958ecc5eb9af9",
+            "de43d78e467331cc3bcbf87fdb3c90c3",
+        }
+        self.assertEqual(expected_matching_snippets, matching_snippets)
+
+        results = []
+        for snippet in sorted(matching_snippets):
+            sorted_results1 = results1_snippet_mappings_by_snippets[snippet]
+            sorted_results2 = results2_snippet_mappings_by_snippets[snippet]
+            results.append(
+                {
+                    "snippet": snippet,
+                    "snippet_matched_to_results1": sorted_results1,
+                    "snippet_matched_to_results2": sorted_results2,
+                }
+            )
+        expected_results_loc = self.get_test_loc("snippets/snippet-similarity-expected.json")
+        check_against_expected_json_file(results, expected_results_loc, regen=regen)
diff --git a/tests/testfiles/fingerprinting-expected.json b/tests/testfiles/fingerprinting-expected.json
diff --git a/tests/testfiles/fingerprinting/snippets/snippet-similarity-expected.json b/tests/testfiles/fingerprinting/snippets/snippet-similarity-expected.json