@@ -55,19 +55,25 @@ def __init__(self, seq2, junk=()):
5555 junk = frozenset (junk )
5656 self .seq2 = seq2
5757 self .junk = junk
58- self .pos2 = pos2 = {} # positions of each element in seq2
59- for i , elt in enumerate (seq2 ):
60- indices = pos2 .setdefault (elt , [])
61- indices .append (i )
62- if junk :
63- for elt in junk :
64- del pos2 [elt ]
58+ self .pos2 = None
59+
60+ def _build (self ):
61+ if self .pos2 is None :
62+ self .pos2 = pos2 = {} # positions of each element in seq2
63+ for i , elt in enumerate (self .seq2 ):
64+ indices = pos2 .setdefault (elt , [])
65+ indices .append (i )
66+ junk = self .junk
67+ if junk :
68+ for elt in junk :
69+ del pos2 [elt ]
6570
6671 def find (self , seq1 , start1 = 0 , stop1 = None , start2 = 0 , stop2 = None ):
6772 if stop1 is None :
6873 stop1 = len (seq1 )
6974 if stop2 is None :
7075 stop2 = len (self .seq2 )
76+ self ._build ()
7177 pos2 = self .pos2
7278 j2len = {}
7379 nothing = []
@@ -129,6 +135,12 @@ def _build(self, start2, stop2):
129135 """
130136 Automaton needs to rebuild for every (start2, stop2)
131137 This is made to cache the last one and only rebuild on new values
138+
139+ Note that to construct an Automaton that can be queried for any
140+ (start2, stop2), each node would need to store a set of
141+ indices, which is prone to O(n^2) memory explosion.
142+ The current approach maintains reasonable memory guarantees
143+ and is also much simpler in comparison.
132144 """
133145 if self .root is not None and self .cache == (start2 , stop2 ):
134146 return
@@ -480,24 +492,17 @@ def __chain_b(self):
480492 del bcounts [elt ]
481493
482494 self ._max_bcount = max (bcounts .values ()) if bcounts else 0
483- self ._all_junk = frozenset (junk | popular )
484- self ._lcsub_automaton = None # _LCSUBAutomaton instance
485- self ._lcsub_simple = None # _LCSUBSimple instanct
486-
487- def _get_lcsub_calculator (self , automaton = False ):
488- if automaton :
489- if self ._lcsub_automaton is None :
490- self ._lcsub_automaton = _LCSUBAutomaton (self .b , self ._all_junk )
491- return self ._lcsub_automaton
492- else :
493- if self ._lcsub_simple is None :
494- self ._lcsub_simple = _LCSUBSimple (self .b , self ._all_junk )
495- return self ._lcsub_simple
495+ self ._all_junk = all_junk = frozenset (junk | popular )
496+ self ._lcsub_simple = _LCSUBSimple (b , all_junk )
497+ self ._lcsub_automaton = _LCSUBAutomaton (b , all_junk )
496498
497499 @property
498500 def b2j (self ):
499501 # NOTE: For backwards compatibility
500- return self ._get_lcsub_calculator (automaton = False ).pos2
502+ simple_calc = self ._lcsub_simple
503+ if simple_calc .pos2 is None :
504+ simple_calc ._build ()
505+ return simple_calc .pos2
501506
502507 def find_longest_match (self , alo = 0 , ahi = None , blo = 0 , bhi = None ):
503508 """Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -568,17 +573,54 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
568573 if asize <= 0 and bsize <= 0 :
569574 besti , bestj , bestsize = alo , blo , 0
570575 else :
571- # Constant to contruct automaton is roughly 6.
572- # Constant to run automaton is roughly 2.
573- # This has been tested on a range of data sets.
574- # For that specific set it gave selection accuracy of 95%.
575- # Weak spot in this is cases with little or no element overlap at all.
576- # However, such check would have more cost than benefit.
577- automaton_cost = bsize * 6 + asize * 2
578- simple_cost = self ._max_bcount * asize
579- use_automaton = simple_cost > automaton_cost
580- calc = self ._get_lcsub_calculator (use_automaton )
581- besti , bestj , bestsize = calc .find (a , alo , ahi , blo , bhi )
576+ # `a` can be trimmed from both ends while its elements are not in b.
577+ # This is cheap, and bcounts is available at all times.
578+ bcounts = self ._bcounts
579+ tmp_alo = alo
580+ tmp_ahi = ahi
581+ while tmp_alo < tmp_ahi and a [tmp_alo ] not in bcounts :
582+ tmp_alo += 1
583+ while tmp_alo < tmp_ahi and a [tmp_ahi - 1 ] not in bcounts :
584+ tmp_ahi -= 1
585+ tmp_asize = tmp_ahi - tmp_alo
586+ if tmp_asize <= 0 :
587+ besti , bestj , bestsize = alo , blo , 0
588+ else :
589+ # Constant to construct automaton is roughly 6.
590+ # Constant to run automaton is roughly 1.
591+ # This has been tested on a range of data sets.
592+ # It gave a selection accuracy of ~95%.
593+ # The weak spot is cases with little or no element overlap at all.
594+ # However, such a check would likely have more cost than benefit.
595+ simple_calc = self ._lcsub_simple
596+ automaton = self ._lcsub_automaton
597+
598+ automaton_cost = tmp_asize
599+ if automaton .cache != (blo , bhi ):
600+ automaton_cost += bsize * 6
601+ simple_cost = self ._max_bcount * tmp_asize
602+ if simple_calc .pos2 is None :
603+ simple_cost += bsize
604+ if simple_cost < automaton_cost :
605+ calc = simple_calc
606+ else :
607+ calc = automaton
608+ besti , bestj , bestsize = calc .find (a , tmp_alo , tmp_ahi , blo , bhi )
609+
610+ # NOTE: Doing it at the same time results in bigger matches!
611+ # # If bjunk or bpopular were omitted in matching (performance reasons)
612+ # # We now extend the match to capture as much as we can
613+ # if self.bjunk or self.bpopular:
614+ # while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]:
615+ # besti -= 1
616+ # bestj -= 1
617+ # bestsize += 1
618+ # lasti = besti + bestsize
619+ # lastj = bestj + bestsize
620+ # while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]:
621+ # lasti += 1
622+ # lastj += 1
623+ # bestsize += 1
582624
583625 if self .bpopular :
584626 # Extend the best by non-junk elements on each end. In particular,
0 commit comments