|
@@ -10,9 +10,8 @@
 import binascii
 import re
 
-from samecode.halohash import BitAverageHaloHash
 from licensedcode.tokenize import query_lines
-
+from samecode.halohash import BitAverageHaloHash
 
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
@@ -21,6 +20,8 @@
     "0000000000000000000000000000000000000000",
 ]
 
+SNIPPET_WINDOW_LENGTH = 16
+
 
 def _create_directory_fingerprint(inputs):
     """
@@ -166,6 +167,7 @@ def create_halohash_chunks(bah128):
 query_pattern = "[^_\\W]+"
 word_splitter = re.compile(query_pattern, re.UNICODE).findall
 
+
 # TODO: return line numbers from where the token was taken
 def _tokenizer(text):
     """
@@ -197,7 +199,7 @@ def tokenizer(text):
 
 
 def get_file_fingerprint_hashes(
-    location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs
+    location, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False, **kwargs
 ):
     """
     Return a mapping of fingerprint hashes for the file at `location`
@@ -229,7 +231,9 @@ def get_file_fingerprint_hashes(
     )
 
 
-def create_file_fingerprints(content, ngram_length=5, window_length=16, include_ngrams=False):
+def create_file_fingerprints(
+    content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
+):
     """
     Return a mapping of halo1 and snippet hashes from content string
     """
|