3737
# Result record for a matching block: `a` and `b` are the start indices in
# the two sequences and `size` is the length of the match.
Match = _namedtuple('Match', ('a', 'b', 'size'))
3939
40+
41+ class _LCSUBDict :
42+ """Dict method for finding longest common substring.
43+
44+ Complexity:
45+ T: O(n1 + n2) best, O(n1 × n2) worst
46+ S: O(n2)
47+
48+ Members:
49+ pos2 for x in seq2, pos2[x] is a list of the indices (into seq2)
50+ at which x appears; junk elements do not appear
51+ """
52+
53+ def __init__ (self , seq2 , junk = ()):
54+ if not isinstance (junk , frozenset ):
55+ junk = frozenset (junk )
56+ self .seq2 = seq2
57+ self .junk = junk
58+ self .pos2 = pos2 = {} # positions of each element in seq2
59+ for i , elt in enumerate (seq2 ):
60+ indices = pos2 .setdefault (elt , [])
61+ indices .append (i )
62+ if junk :
63+ for elt in junk :
64+ del pos2 [elt ]
65+
66+ def find (self , seq1 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None ):
67+ if stop1 is None :
68+ stop1 = len (seq1 )
69+ if stop2 is None :
70+ stop2 = len (self .seq2 )
71+ pos2 = self .pos2
72+ j2len = {}
73+ nothing = []
74+ besti , bestj , bestsize = start1 , start2 , 0
75+ # find longest junk-free match
76+ # during an iteration of the loop, j2len[j] = length of longest
77+ # junk-free match ending with seq1[i-1] and seq2[j]
78+ for i in range (start1 , stop1 ):
79+ # look at all instances of seq1[i] in seq2; note that because
80+ # pos2 has no junk keys, the loop is skipped if seq1[i] is junk
81+ j2lenget = j2len .get
82+ newj2len = {}
83+ for j in pos2 .get (seq1 [i ], nothing ):
84+ # seq1[i] matches seq2[j]
85+ if j < start2 :
86+ continue
87+ if j >= stop2 :
88+ break
89+ k = newj2len [j ] = j2lenget (j - 1 , 0 ) + 1
90+ if k > bestsize :
91+ besti = i - k + 1
92+ bestj = j - k + 1
93+ bestsize = k
94+ j2len = newj2len
95+
96+ return besti , bestj , bestsize
97+
98+
4099_LENGTH = 0
41100_LINK = 1
42101_NEXT = 2
43102_POS = 3
44103
45104
46105class _LCSUBAutomaton :
47- """Suffix Automaton for finding longest common substring."""
106+ """Suffix Automaton for finding longest common substring.
107+
108+ Complexity:
109+ T: O(n1 + n2) - roughly 2 * n1 + 6 * n2
110+ S: O(n2) - maximum nodes: 2 * n2 + 1
111+
112+ Node spec:
113+ node: list = [length: int, link: list, next: dict, end_pos: int]
114+ length - match length when the node is reached
115+ link - reference to a node to fall back to
116+ next - map to nodes to go to when matched
117+ end_pos - end position of first occurrence (used for result)
118+ """
48119
49- def __init__ (self , s2 , start2 = 0 , stop2 = None , * , junk = ()):
50- if stop2 is None :
51- stop2 = len (s2 )
120+ def __init__ (self , seq2 , junk = ()):
121+ if not isinstance (junk , frozenset ):
122+ junk = frozenset (junk )
123+ self .seq2 = seq2
124+ self .junk = junk
125+ self .root = None
126+ self .cache = (None , None )
52127
53- self .start2 = start2
54- self .stop2 = stop2
55- self .junk = frozenset (junk )
56- self .root = root = [0 , None , {}, - 1 ] # [length, link, next, end_pos]
128+ def _build (self , start2 , stop2 ):
129+ """
130+ The automaton has to be rebuilt for every distinct (start2, stop2).
131+ The last build is cached, so a rebuild only happens for new values.
132+ """
133+ if self .root is not None and self .cache == (start2 , stop2 ):
134+ return
57135
136+ self .root = root = [0 , None , {}, - 1 ]
137+ seq2 = self .seq2
138+ junk = self .junk
58139 last_len = 0
59140 last = root
60141 for j in range (start2 , stop2 ):
61- c = s2 [j ]
142+ c = seq2 [j ]
62143 if c in junk :
63144 last_len = 0
64145 last = root
@@ -81,6 +162,7 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
81162 if p_length_p1 == q [_LENGTH ]:
82163 curr [_LINK ] = q
83164 else :
165+ # Copy `q[_POS]` to ensure leftmost match in seq2
84166 clone = [p_length_p1 , q [_LINK ], q [_NEXT ].copy (), q [_POS ]]
85167 while (p_next := p [_NEXT ]).get (c ) is q :
86168 p_next [c ] = clone
@@ -92,9 +174,16 @@ def __init__(self, s2, start2=0, stop2=None, *, junk=()):
92174
93175 last = curr
94176
95- def find (self , s1 , start1 = 0 , stop1 = None ):
96- if stop1 is None :
97- stop1 = len (s1 )
177+ self .cache = (start2 , stop2 )
178+
179+ def find (self , seq1 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None ):
180+ size1 = len (seq1 )
181+ size2 = len (self .seq2 )
182+ if stop1 is None or stop1 > size1 :
183+ stop1 = size1
184+ if stop2 is None or stop2 > size2 :
185+ stop2 = size2
186+ self ._build (start2 , stop2 )
98187 root = self .root
99188 junk = self .junk
100189 v = root
@@ -104,7 +193,7 @@ def find(self, s1, start1=0, stop1=None):
104193 best_pos = 0
105194
106195 for i in range (start1 , stop1 ):
107- c = s1 [i ]
196+ c = seq1 [i ]
108197 if c in junk :
109198 v = root
110199 l = 0
@@ -123,24 +212,20 @@ def find(self, s1, start1=0, stop1=None):
123212 best_pos = i
124213
125214 if not best_len :
126- return (start1 , self . start2 , 0 )
215+ return (start1 , start2 , 0 )
127216
128217 start_in_s1 = best_pos + 1 - best_len
129218 end_in_s2 = best_state [_POS ]
130219 start_in_s2 = end_in_s2 + 1 - best_len
131220 return (start_in_s1 , start_in_s2 , best_len )
132221
133222
134- def longest_common_substring (s1 , s2 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None ,
135- * , junk = ()):
136- return _LCSUBAutomaton (s2 , start2 , stop2 , junk = junk ).find (s1 , start1 , stop1 )
137-
138-
139223def _calculate_ratio (matches , length ):
140224 if length :
141225 return 2.0 * matches / length
142226 return 1.0
143227
228+
144229class SequenceMatcher :
145230
146231 """
@@ -379,38 +464,40 @@ def __chain_b(self):
379464 self .bjunk = junk = set ()
380465 autojunk = self .autojunk
381466 self .bpopular = popular = set ()
382- self .b2j = b2j = {}
383- if autojunk :
384- for i , elt in enumerate (b ):
385- indices = b2j .setdefault (elt , [])
386- indices .append (i )
387-
388- # Purge junk elements
389- if isjunk :
390- for elt in b2j .keys ():
391- if isjunk (elt ):
392- junk .add (elt )
393- for elt in junk : # separate loop avoids separate list of keys
394- del b2j [elt ]
395-
396- # Purge popular elements that are not junk
397- n = len (b )
398- if autojunk and n >= 200 :
399- ntest = n // 100 + 1
400- for elt , idxs in b2j .items ():
401- if len (idxs ) > ntest :
402- popular .add (elt )
403- for elt in popular : # ditto; as fast for 1% deletion
404- del b2j [elt ]
467+ self ._bcounts = bcounts = dict (_Counter (b ))
468+ if isjunk :
469+ junk .update (filter (isjunk , bcounts ))
470+ for elt in junk :
471+ del bcounts [elt ]
472+
473+ n = len (b )
474+ if autojunk and n >= 200 :
475+ ntest = n // 100 + 1
476+ for elt , num in bcounts .items ():
477+ if num > ntest :
478+ popular .add (elt )
479+ for elt in popular : # ditto; as fast for 1% deletion
480+ del bcounts [elt ]
481+
482+ self ._max_bcount = max (bcounts .values ()) if bcounts else 0
483+ self ._all_junk = frozenset (junk | popular )
484+ self ._lcsub_aut = None # _LCSUBAutomaton instance
485+ self ._lcsub_dict = None # _LCSUBDict instance
486+
487+ def _get_lcsub_calculator (self , automaton = False ):
488+ if automaton :
489+ if self ._lcsub_aut is None :
490+ self ._lcsub_aut = _LCSUBAutomaton (self .b , self ._all_junk )
491+ return self ._lcsub_aut
405492 else :
406- # Prepare LCSUB Automaton
407- if isjunk :
408- bcounts = _Counter ( b )
409- junk . update ( filter ( isjunk , bcounts ))
410- for elt in junk :
411- del bcounts [ elt ]
412- self . aut_cache = ( None , None ) # Cache last automaton
413- self .all_junk = junk | popular
493+ if self . _lcsub_dict is None :
494+ self . _lcsub_dict = _LCSUBDict ( self . b , self . _all_junk )
495+ return self . _lcsub_dict
496+
497+ @ property
498+ def b2j ( self ):
499+ # NOTE: For backwards compatibility
500+ return self ._get_lcsub_calculator ( automaton = False ). pos2
414501
415502 def find_longest_match (self , alo = 0 , ahi = None , blo = 0 , bhi = None ):
416503 """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -475,67 +562,58 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
475562 ahi = len (a )
476563 if bhi is None :
477564 bhi = len (b )
478- if alo > = ahi :
479- besti , bestj , bestsize = alo , blo , 0
480- elif self . autojunk :
481- b2j = self . b2j
565+ asize = ahi - alo
566+ bsize = bhi - blo
567+
568+ if asize <= 0 and bsize <= 0 :
482569 besti , bestj , bestsize = alo , blo , 0
483- # find longest junk-free match
484- # during an iteration of the loop, j2len[j] = length of longest
485- # junk-free match ending with a[i-1] and b[j]
486- j2len = {}
487- nothing = []
488- for i in range (alo , ahi ):
489- # look at all instances of a[i] in b; note that because
490- # b2j has no junk keys, the loop is skipped if a[i] is junk
491- j2lenget = j2len .get
492- newj2len = {}
493- for j in b2j .get (a [i ], nothing ):
494- # a[i] matches b[j]
495- if j < blo :
496- continue
497- if j >= bhi :
498- break
499- k = newj2len [j ] = j2lenget (j - 1 , 0 ) + 1
500- if k > bestsize :
501- besti , bestj , bestsize = i - k + 1 , j - k + 1 , k
502- j2len = newj2len
503570 else :
504- # Without autojunk, run LCSUB Automaton
505- blo_bhi , aut = self .aut_cache
506- if aut is None or blo_bhi != (blo , bhi ):
507- aut = _LCSUBAutomaton (b , blo , bhi , junk = self .all_junk )
508- self .aut_cache = ((blo , bhi ), aut )
509- besti , bestj , bestsize = aut .find (a , alo , ahi )
510-
511- # Extend the best by non-junk elements on each end. In particular,
512- # "popular" non-junk elements aren't in b2j, which greatly speeds
513- # the inner loop above, but also means "the best" match so far
514- # doesn't contain any junk *or* popular non-junk elements.
515- while besti > alo and bestj > blo and \
516- not isbjunk (b [bestj - 1 ]) and \
517- a [besti - 1 ] == b [bestj - 1 ]:
518- besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
519- while besti + bestsize < ahi and bestj + bestsize < bhi and \
520- not isbjunk (b [bestj + bestsize ]) and \
521- a [besti + bestsize ] == b [bestj + bestsize ]:
522- bestsize += 1
523-
524- # Now that we have a wholly interesting match (albeit possibly
525- # empty!), we may as well suck up the matching junk on each
526- # side of it too. Can't think of a good reason not to, and it
527- # saves post-processing the (possibly considerable) expense of
528- # figuring out what to do with it. In the case of an empty
529- # interesting match, this is clearly the right thing to do,
530- # because no other kind of match is possible in the regions.
531- while besti > alo and bestj > blo and \
532- isbjunk (b [bestj - 1 ]) and \
533- a [besti - 1 ] == b [bestj - 1 ]:
534- besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
535- while besti + bestsize < ahi and bestj + bestsize < bhi and \
536- isbjunk (b [bestj + bestsize ]) and \
537- a [besti + bestsize ] == b [bestj + bestsize ]:
538- bestsize = bestsize + 1
571+ # Constant to construct the automaton is roughly 6.
572+ # Constant to run the automaton is roughly 2.
573+ # This has been tested on a range of data sets.
574+ # For that specific set it gave a selection accuracy of 95%.
575+ # The weak spot is cases with little or no element overlap at all.
576+ # However, checking for that would cost more than it would save.
577+ use_automaton = self ._max_bcount * asize > bsize * 6 + asize * 2
578+ calc = self ._get_lcsub_calculator (use_automaton )
579+ besti , bestj , bestsize = calc .find (a , alo , ahi , blo , bhi )
580+
581+ if self .bpopular :
582+ # Extend the best by non-junk elements on each end. In particular,
583+ # "popular" non-junk elements aren't in b2j, which greatly speeds
584+ # the inner loop above, but also means "the best" match so far
585+ # doesn't contain any junk *or* popular non-junk elements.
586+ while besti > alo and bestj > blo and \
587+ not isbjunk (b [bestj - 1 ]) and \
588+ a [besti - 1 ] == b [bestj - 1 ]:
589+ besti -= 1
590+ bestj -= 1
591+ bestsize += 1
592+
593+ while besti + bestsize < ahi and bestj + bestsize < bhi and \
594+ not isbjunk (b [bestj + bestsize ]) and \
595+ a [besti + bestsize ] == b [bestj + bestsize ]:
596+ bestsize += 1
597+
598+ if self .bjunk :
599+ # Now that we have a wholly interesting match (albeit possibly
600+ # empty!), we may as well suck up the matching junk on each
601+ # side of it too. Can't think of a good reason not to, and it
602+ # saves post-processing the (possibly considerable) expense of
603+ # figuring out what to do with it. In the case of an empty
604+ # interesting match, this is clearly the right thing to do,
605+ # because no other kind of match is possible in the regions.
606+ while besti > alo and bestj > blo and \
607+ isbjunk (b [bestj - 1 ]) and \
608+ a [besti - 1 ] == b [bestj - 1 ]:
609+ besti -= 1
610+ bestj -= 1
611+ bestsize += 1
612+
613+ while besti + bestsize < ahi and bestj + bestsize < bhi and \
614+ isbjunk (b [bestj + bestsize ]) and \
615+ a [besti + bestsize ] == b [bestj + bestsize ]:
616+ bestsize = bestsize + 1
539617
540618 return Match (besti , bestj , bestsize )
541619
0 commit comments