88#
99
1010import os
11+ from collections import defaultdict
1112
1213from commoncode .resource import VirtualCodebase
1314from commoncode .testcase import FileBasedTesting
15+ from commoncode .testcase import check_against_expected_json_file
1416
1517from matchcode_toolkit .fingerprinting import _create_directory_fingerprint
1618from matchcode_toolkit .fingerprinting import _get_resource_subpath
@@ -137,13 +139,13 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):
137139 result1_indexed_elements_count , result1_fingerprint = split_fingerprint (result1 )
138140 result2_indexed_elements_count , result2_fingerprint = split_fingerprint (result2 )
139141
140- expected_result1_indexed_elements_count = 6395
141- expected_result2_indexed_elements_count = 6388
142+ expected_result1_indexed_elements_count = 6398
143+ expected_result2_indexed_elements_count = 6391
142144 self .assertEqual (expected_result1_indexed_elements_count , result1_indexed_elements_count )
143145 self .assertEqual (expected_result2_indexed_elements_count , result2_indexed_elements_count )
144146
145- expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4 "
146- expected_result2_fingerprint = "aa3a49e4cd40718d1297be519e6564a4 "
147+ expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919 "
148+ expected_result2_fingerprint = "dc025ae7ebb104419e5354c665a0891d "
147149 self .assertEqual (expected_result1_fingerprint , result1_fingerprint )
148150 self .assertEqual (expected_result2_fingerprint , result2_fingerprint )
149151
@@ -159,26 +161,76 @@ def test_get_file_fingerprint_hashes_one_line_added(self):
159161 result1_indexed_elements_count , result1_fingerprint = split_fingerprint (result1 )
160162 result2_indexed_elements_count , result2_fingerprint = split_fingerprint (result2 )
161163
162- expected_result1_indexed_elements_count = 6395
163- expected_result2_indexed_elements_count = 6398
164+ expected_result1_indexed_elements_count = 6398
165+ expected_result2_indexed_elements_count = 6401
164166 self .assertEqual (expected_result1_indexed_elements_count , result1_indexed_elements_count )
165167 self .assertEqual (expected_result2_indexed_elements_count , result2_indexed_elements_count )
166168
167- expected_result1_fingerprint = "a23a49e4cd40718d1297be719e6564a4 "
168- expected_result2_fingerprint = "a23b49e4cd40708d1297be719c6564a4 "
169+ expected_result1_fingerprint = "dc025ae7ebb104419e5314c665a08919 "
170+ expected_result2_fingerprint = "dc025ae7ebb104419e5314c665a1891d "
169171 self .assertEqual (expected_result1_fingerprint , result1_fingerprint )
170172 self .assertEqual (expected_result2_fingerprint , result2_fingerprint )
171173
172- self .assertEqual (3 , byte_hamming_distance (result1_fingerprint , result2_fingerprint ))
174+ self .assertEqual (2 , byte_hamming_distance (result1_fingerprint , result2_fingerprint ))
175+
@classmethod
def _create_snippet_mappings_by_snippets(cls, snippets):
    """
    Return a mapping of snippet value -> list of snippet mappings.

    ``snippets`` is an iterable of mappings that each have a "snippet" key.
    Each mapping is appended, in input order, to the list stored under its
    "snippet" value, so several mappings sharing one value stay together.
    A ``None`` or empty ``snippets`` yields an empty mapping.
    """
    snippet_mappings_by_snippet = defaultdict(list)
    # `or ()` lets a missing "snippets" entry (None) produce an empty grouping
    # so the caller fails on its own assertion rather than on a TypeError here.
    for snippet_mapping in snippets or ():
        snippet_mappings_by_snippet[snippet_mapping["snippet"]].append(snippet_mapping)
    return snippet_mappings_by_snippet
173183
def test_snippets_similarity(self, regen=False):
    """
    Check which snippets are shared by the fingerprints of two test files.

    The set of "snippet" values present in both files is compared against a
    hard-coded expected set, then the full snippet mappings for each shared
    value are compared against an expected JSON file.

    ``regen`` is passed through to ``check_against_expected_json_file``
    (presumably it rewrites the expected file instead of comparing; confirm
    against commoncode before relying on it).
    """
    # 1 function from adler32.c has been added to zutil.c
    test_file1 = self.get_test_loc("snippets/adler32.c")
    test_file2 = self.get_test_loc("snippets/zutil.c")
    results1 = get_file_fingerprint_hashes(test_file1, include_ngrams=True)
    results2 = get_file_fingerprint_hashes(test_file2, include_ngrams=True)
    results1_snippets = results1.get("snippets")
    results2_snippets = results2.get("snippets")

    # Group each file's snippet mappings by their "snippet" value.
    results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
        results1_snippets
    )
    results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
        results2_snippets
    )

    # Dict key views support set intersection directly.
    matching_snippets = (
        results1_snippet_mappings_by_snippets.keys()
        & results2_snippet_mappings_by_snippets.keys()
    )
    expected_matching_snippets = {
        "33b1d50de7e1701bd4beb706bf25970e",
        "0dcb44bfa9a7c7e310ea9d4a921b777b",
        "9bc102ceddabba9c1dc31140500e6c6c",
        "310e6e530d4bda6977774b34515101ab",
        "cd50d59e9cd0df93ef6b8dfbf0f7d311",
        "5af889295c942ecb75189c86df62e201",
        "0057152e3b1795b6befd36a4412c21a5",
        "c09e0b1020b5265ccac6d03439dff2dc",
        "ecbedbeebd47e4a24210bfb8419c9f8e",
        "3c866b47965d9cc62c4640e3ae132d2b",
        "2b74fe7dde58dfa20bf75a6b4e589a10",
        "07a7b1300fb58b5f9b9b3e56df23e003",
        "72a86996522cfb9f83cf388d8010b7ab",
        "d45f0d54c32b2d884919665c65c65638",
        "cac65171e0f01c57e1af7a5b99929d12",
        "8571422ee6dec38705bcdb8c12496473",
        "b9db06731d27c61a56600e74d145e814",
        "ba34fbe4e05f3f28641958ecc5eb9af9",
        "de43d78e467331cc3bcbf87fdb3c90c3",
    }
    self.assertEqual(expected_matching_snippets, matching_snippets)

    # Iterate the shared snippets in sorted order so the generated results
    # list has a stable ordering for the JSON comparison.
    results = [
        {
            "snippet": snippet,
            "snippet_matched_to_results1": results1_snippet_mappings_by_snippets[snippet],
            "snippet_matched_to_results2": results2_snippet_mappings_by_snippets[snippet],
        }
        for snippet in sorted(matching_snippets)
    ]
    expected_results_loc = self.get_test_loc("snippets/snippet-similarity-expected.json")
    check_against_expected_json_file(results, expected_results_loc, regen=regen)
0 commit comments