Skip to content

Commit 07f3db0

Browse files
committed
initial version
1 parent 2e2dfd5 commit 07f3db0

File tree

1 file changed

+187
-109
lines changed

1 file changed

+187
-109
lines changed

Lib/difflib.py

Lines changed: 187 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -37,28 +37,109 @@
3737

3838
Match = _namedtuple('Match', 'a b size')
3939

40+
41+
class _LCSUBDict:
42+
"""Dict method for finding longest common substring.
43+
44+
Complexity:
45+
T: O(n1 + n2) best, O(n1 × n2) worst
46+
S: O(n2)
47+
48+
Members:
49+
pos2 for x in seq2, pos2[x] is a list of the indices (into seq2)
50+
at which x appears; junk elements do not appear
51+
"""
52+
53+
def __init__(self, seq2, junk=()):
54+
if not isinstance(junk, frozenset):
55+
junk = frozenset(junk)
56+
self.seq2 = seq2
57+
self.junk = junk
58+
self.pos2 = pos2 = {} # positions of each element in seq2
59+
for i, elt in enumerate(seq2):
60+
indices = pos2.setdefault(elt, [])
61+
indices.append(i)
62+
if junk:
63+
for elt in junk:
64+
del pos2[elt]
65+
66+
def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
67+
if stop1 is None:
68+
stop1 = len(seq1)
69+
if stop2 is None:
70+
stop2 = len(self.seq2)
71+
pos2 = self.pos2
72+
j2len = {}
73+
nothing = []
74+
besti, bestj, bestsize = start1, start2, 0
75+
# find longest junk-free match
76+
# during an iteration of the loop, j2len[j] = length of longest
77+
# junk-free match ending with seq1[i-1] and seq2[j]
78+
for i in range(start1, stop1):
79+
# look at all instances of seq1[i] in seq2; note that because
80+
# pos2 has no junk keys, the loop is skipped if seq1[i] is junk
81+
j2lenget = j2len.get
82+
newj2len = {}
83+
for j in pos2.get(seq1[i], nothing):
84+
# seq1[i] matches seq2[j]
85+
if j < start2:
86+
continue
87+
if j >= stop2:
88+
break
89+
k = newj2len[j] = j2lenget(j-1, 0) + 1
90+
if k > bestsize:
91+
besti = i - k + 1
92+
bestj = j - k + 1
93+
bestsize = k
94+
j2len = newj2len
95+
96+
return besti, bestj, bestsize
97+
98+
4099
_LENGTH = 0
41100
_LINK = 1
42101
_NEXT = 2
43102
_POS = 3
44103

45104

46105
class _LCSUBAutomaton:
47-
"""Suffix Automaton for finding longest common substring."""
106+
"""Suffix Automaton for finding longest common substring.
107+
108+
Complexity:
109+
T: O(n1 + n2) - roughly 2 * n1 + 6 * n2
110+
S: O(n2) - maximum nodes: 2 * n2 + 1
111+
112+
Node spec:
113+
node: list = [length: int, link: list, next: dict, end_pos: int]
114+
length - match length when the node is reached
115+
link - reference to a node to fall back to
116+
next - map to nodes to go to when matched
117+
end_pos - end position of first occurrence (used for result)
118+
"""
48119

49-
def __init__(self, s2, start2=0, stop2=None, *, junk=()):
50-
if stop2 is None:
51-
stop2 = len(s2)
120+
def __init__(self, seq2, junk=()):
121+
if not isinstance(junk, frozenset):
122+
junk = frozenset(junk)
123+
self.seq2 = seq2
124+
self.junk = junk
125+
self.root = None
126+
self.cache = (None, None)
52127

53-
self.start2 = start2
54-
self.stop2 = stop2
55-
self.junk = frozenset(junk)
56-
self.root = root = [0, None, {}, -1] # [length, link, next, end_pos]
128+
def _build(self, start2, stop2):
129+
"""
130+
The automaton has to be rebuilt for every distinct (start2, stop2).
131+
The most recent build is cached, so a rebuild happens only for new values.
132+
"""
133+
if self.root is not None and self.cache == (start2, stop2):
134+
return
57135

136+
self.root = root = [0, None, {}, -1]
137+
seq2 = self.seq2
138+
junk = self.junk
58139
last_len = 0
59140
last = root
60141
for j in range(start2, stop2):
61-
c = s2[j]
142+
c = seq2[j]
62143
if c in junk:
63144
last_len = 0
64145
last = root
@@ -81,6 +162,7 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
81162
if p_length_p1 == q[_LENGTH]:
82163
curr[_LINK] = q
83164
else:
165+
# Copy `q[_POS]` to ensure leftmost match in seq2
84166
clone = [p_length_p1, q[_LINK], q[_NEXT].copy(), q[_POS]]
85167
while (p_next := p[_NEXT]).get(c) is q:
86168
p_next[c] = clone
@@ -92,9 +174,16 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
92174

93175
last = curr
94176

95-
def find(self, s1, start1=0, stop1=None):
96-
if stop1 is None:
97-
stop1 = len(s1)
177+
self.cache = (start2, stop2)
178+
179+
def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
180+
size1 = len(seq1)
181+
size2 = len(self.seq2)
182+
if stop1 is None or stop1 > size1:
183+
stop1 = size1
184+
if stop2 is None or stop2 > size2:
185+
stop2 = size2
186+
self._build(start2, stop2)
98187
root = self.root
99188
junk = self.junk
100189
v = root
@@ -104,7 +193,7 @@ def find(self, s1, start1=0, stop1=None):
104193
best_pos = 0
105194

106195
for i in range(start1, stop1):
107-
c = s1[i]
196+
c = seq1[i]
108197
if c in junk:
109198
v = root
110199
l = 0
@@ -123,24 +212,20 @@ def find(self, s1, start1=0, stop1=None):
123212
best_pos = i
124213

125214
if not best_len:
126-
return (start1, self.start2, 0)
215+
return (start1, start2, 0)
127216

128217
start_in_s1 = best_pos + 1 - best_len
129218
end_in_s2 = best_state[_POS]
130219
start_in_s2 = end_in_s2 + 1 - best_len
131220
return (start_in_s1, start_in_s2, best_len)
132221

133222

134-
def longest_common_substring(s1, s2, start1=0, stop1=None, start2=0, stop2=None,
135-
*, junk=()):
136-
return _LCSUBAutomaton(s2, start2, stop2, junk=junk).find(s1, start1, stop1)
137-
138-
139223
def _calculate_ratio(matches, length):
140224
if length:
141225
return 2.0 * matches / length
142226
return 1.0
143227

228+
144229
class SequenceMatcher:
145230

146231
"""
@@ -379,38 +464,40 @@ def __chain_b(self):
379464
self.bjunk = junk = set()
380465
autojunk = self.autojunk
381466
self.bpopular = popular = set()
382-
self.b2j = b2j = {}
383-
if autojunk:
384-
for i, elt in enumerate(b):
385-
indices = b2j.setdefault(elt, [])
386-
indices.append(i)
387-
388-
# Purge junk elements
389-
if isjunk:
390-
for elt in b2j.keys():
391-
if isjunk(elt):
392-
junk.add(elt)
393-
for elt in junk: # separate loop avoids separate list of keys
394-
del b2j[elt]
395-
396-
# Purge popular elements that are not junk
397-
n = len(b)
398-
if autojunk and n >= 200:
399-
ntest = n // 100 + 1
400-
for elt, idxs in b2j.items():
401-
if len(idxs) > ntest:
402-
popular.add(elt)
403-
for elt in popular: # ditto; as fast for 1% deletion
404-
del b2j[elt]
467+
self._bcounts = bcounts = dict(_Counter(b))
468+
if isjunk:
469+
junk.update(filter(isjunk, bcounts))
470+
for elt in junk:
471+
del bcounts[elt]
472+
473+
n = len(b)
474+
if autojunk and n >= 200:
475+
ntest = n // 100 + 1
476+
for elt, num in bcounts.items():
477+
if num > ntest:
478+
popular.add(elt)
479+
for elt in popular: # separate loop: bcounts must not change size while being iterated
480+
del bcounts[elt]
481+
482+
self._max_bcount = max(bcounts.values()) if bcounts else 0
483+
self._all_junk = frozenset(junk | popular)
484+
self._lcsub_aut = None # _LCSUBAutomaton instance
485+
self._lcsub_dict = None # _LCSUBDict instance
486+
487+
def _get_lcsub_calculator(self, automaton=False):
488+
if automaton:
489+
if self._lcsub_aut is None:
490+
self._lcsub_aut = _LCSUBAutomaton(self.b, self._all_junk)
491+
return self._lcsub_aut
405492
else:
406-
# Prepare LCSUB Automaton
407-
if isjunk:
408-
bcounts = _Counter(b)
409-
junk.update(filter(isjunk, bcounts))
410-
for elt in junk:
411-
del bcounts[elt]
412-
self.aut_cache = (None, None) # Cache last automaton
413-
self.all_junk = junk | popular
493+
if self._lcsub_dict is None:
494+
self._lcsub_dict = _LCSUBDict(self.b, self._all_junk)
495+
return self._lcsub_dict
496+
497+
@property
498+
def b2j(self):
499+
# NOTE: For backwards compatibility
500+
return self._get_lcsub_calculator(automaton=False).pos2
414501

415502
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
416503
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -475,67 +562,58 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
475562
ahi = len(a)
476563
if bhi is None:
477564
bhi = len(b)
478-
if alo >= ahi:
479-
besti, bestj, bestsize = alo, blo, 0
480-
elif self.autojunk:
481-
b2j = self.b2j
565+
asize = ahi - alo
566+
bsize = bhi - blo
567+
568+
if asize <= 0 and bsize <= 0:
482569
besti, bestj, bestsize = alo, blo, 0
483-
# find longest junk-free match
484-
# during an iteration of the loop, j2len[j] = length of longest
485-
# junk-free match ending with a[i-1] and b[j]
486-
j2len = {}
487-
nothing = []
488-
for i in range(alo, ahi):
489-
# look at all instances of a[i] in b; note that because
490-
# b2j has no junk keys, the loop is skipped if a[i] is junk
491-
j2lenget = j2len.get
492-
newj2len = {}
493-
for j in b2j.get(a[i], nothing):
494-
# a[i] matches b[j]
495-
if j < blo:
496-
continue
497-
if j >= bhi:
498-
break
499-
k = newj2len[j] = j2lenget(j-1, 0) + 1
500-
if k > bestsize:
501-
besti, bestj, bestsize = i-k+1, j-k+1, k
502-
j2len = newj2len
503570
else:
504-
# Without autojunk, run LCSUB Automaton
505-
blo_bhi, aut = self.aut_cache
506-
if aut is None or blo_bhi != (blo, bhi):
507-
aut = _LCSUBAutomaton(b, blo, bhi, junk=self.all_junk)
508-
self.aut_cache = ((blo, bhi), aut)
509-
besti, bestj, bestsize = aut.find(a, alo, ahi)
510-
511-
# Extend the best by non-junk elements on each end. In particular,
512-
# "popular" non-junk elements aren't in b2j, which greatly speeds
513-
# the inner loop above, but also means "the best" match so far
514-
# doesn't contain any junk *or* popular non-junk elements.
515-
while besti > alo and bestj > blo and \
516-
not isbjunk(b[bestj-1]) and \
517-
a[besti-1] == b[bestj-1]:
518-
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
519-
while besti+bestsize < ahi and bestj+bestsize < bhi and \
520-
not isbjunk(b[bestj+bestsize]) and \
521-
a[besti+bestsize] == b[bestj+bestsize]:
522-
bestsize += 1
523-
524-
# Now that we have a wholly interesting match (albeit possibly
525-
# empty!), we may as well suck up the matching junk on each
526-
# side of it too. Can't think of a good reason not to, and it
527-
# saves post-processing the (possibly considerable) expense of
528-
# figuring out what to do with it. In the case of an empty
529-
# interesting match, this is clearly the right thing to do,
530-
# because no other kind of match is possible in the regions.
531-
while besti > alo and bestj > blo and \
532-
isbjunk(b[bestj-1]) and \
533-
a[besti-1] == b[bestj-1]:
534-
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
535-
while besti+bestsize < ahi and bestj+bestsize < bhi and \
536-
isbjunk(b[bestj+bestsize]) and \
537-
a[besti+bestsize] == b[bestj+bestsize]:
538-
bestsize = bestsize + 1
571+
# Constant to construct automaton is roughly 6.
572+
# Constant to run automaton is roughly 2.
573+
# This has been tested on a range of data sets.
574+
# For that specific set it gave selection accuracy of 95%.
575+
# The weak spot of this heuristic is inputs with little or no element overlap.
576+
# However, such a check would cost more than it saves.
577+
use_automaton = self._max_bcount * asize > bsize * 6 + asize * 2
578+
calc = self._get_lcsub_calculator(use_automaton)
579+
besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi)
580+
581+
if self.bpopular:
582+
# Extend the best by non-junk elements on each end. In particular,
583+
# "popular" non-junk elements aren't in b2j, which greatly speeds
584+
# the inner loop above, but also means "the best" match so far
585+
# doesn't contain any junk *or* popular non-junk elements.
586+
while besti > alo and bestj > blo and \
587+
not isbjunk(b[bestj-1]) and \
588+
a[besti-1] == b[bestj-1]:
589+
besti -= 1
590+
bestj -= 1
591+
bestsize += 1
592+
593+
while besti+bestsize < ahi and bestj+bestsize < bhi and \
594+
not isbjunk(b[bestj+bestsize]) and \
595+
a[besti+bestsize] == b[bestj+bestsize]:
596+
bestsize += 1
597+
598+
if self.bjunk:
599+
# Now that we have a wholly interesting match (albeit possibly
600+
# empty!), we may as well suck up the matching junk on each
601+
# side of it too. Can't think of a good reason not to, and it
602+
# saves post-processing the (possibly considerable) expense of
603+
# figuring out what to do with it. In the case of an empty
604+
# interesting match, this is clearly the right thing to do,
605+
# because no other kind of match is possible in the regions.
606+
while besti > alo and bestj > blo and \
607+
isbjunk(b[bestj-1]) and \
608+
a[besti-1] == b[bestj-1]:
609+
besti -= 1
610+
bestj -= 1
611+
bestsize += 1
612+
613+
while besti+bestsize < ahi and bestj+bestsize < bhi and \
614+
isbjunk(b[bestj+bestsize]) and \
615+
a[besti+bestsize] == b[bestj+bestsize]:
616+
bestsize = bestsize + 1
539617

540618
return Match(besti, bestj, bestsize)
541619

0 commit comments

Comments
 (0)