Skip to content

Commit 8fb5f47

Browse files
committed
initial trimming of a and test fix
1 parent c1470ad commit 8fb5f47

File tree

2 files changed

+75
-33
lines changed

2 files changed

+75
-33
lines changed

Lib/difflib.py

Lines changed: 74 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -55,19 +55,25 @@ def __init__(self, seq2, junk=()):
5555
junk = frozenset(junk)
5656
self.seq2 = seq2
5757
self.junk = junk
58-
self.pos2 = pos2 = {} # positions of each element in seq2
59-
for i, elt in enumerate(seq2):
60-
indices = pos2.setdefault(elt, [])
61-
indices.append(i)
62-
if junk:
63-
for elt in junk:
64-
del pos2[elt]
58+
self.pos2 = None
59+
60+
def _build(self):
61+
if self.pos2 is None:
62+
self.pos2 = pos2 = {} # positions of each element in seq2
63+
for i, elt in enumerate(self.seq2):
64+
indices = pos2.setdefault(elt, [])
65+
indices.append(i)
66+
junk = self.junk
67+
if junk:
68+
for elt in junk:
69+
del pos2[elt]
6570

6671
def find(self, seq1, start1=0, stop1=None, start2=0, stop2=None):
6772
if stop1 is None:
6873
stop1 = len(seq1)
6974
if stop2 is None:
7075
stop2 = len(self.seq2)
76+
self._build()
7177
pos2 = self.pos2
7278
j2len = {}
7379
nothing = []
@@ -129,6 +135,12 @@ def _build(self, start2, stop2):
129135
"""
130136
Automaton needs to rebuild for every (start2, stop2)
131137
This is made to cache the last one and only rebuild on new values
138+
139+
Note that to construct an Automaton that can be queried for any
140+
(start2, stop2), each node would need to store a set of
141+
indices. And this is prone to O(n^2) memory explosion.
142+
Current approach maintains reasonable memory guarantees
143+
and is also much simpler in comparison.
132144
"""
133145
if self.root is not None and self.cache == (start2, stop2):
134146
return
@@ -480,24 +492,17 @@ def __chain_b(self):
480492
del bcounts[elt]
481493

482494
self._max_bcount = max(bcounts.values()) if bcounts else 0
483-
self._all_junk = frozenset(junk | popular)
484-
self._lcsub_automaton = None # _LCSUBAutomaton instance
485-
self._lcsub_simple = None # _LCSUBSimple instance
486-
487-
def _get_lcsub_calculator(self, automaton=False):
488-
if automaton:
489-
if self._lcsub_automaton is None:
490-
self._lcsub_automaton = _LCSUBAutomaton(self.b, self._all_junk)
491-
return self._lcsub_automaton
492-
else:
493-
if self._lcsub_simple is None:
494-
self._lcsub_simple = _LCSUBSimple(self.b, self._all_junk)
495-
return self._lcsub_simple
495+
self._all_junk = all_junk = frozenset(junk | popular)
496+
self._lcsub_simple = _LCSUBSimple(b, all_junk)
497+
self._lcsub_automaton = _LCSUBAutomaton(b, all_junk)
496498

497499
@property
498500
def b2j(self):
499501
# NOTE: For backwards compatibility
500-
return self._get_lcsub_calculator(automaton=False).pos2
502+
simple_calc = self._lcsub_simple
503+
if simple_calc.pos2 is None:
504+
simple_calc._build()
505+
return simple_calc.pos2
501506

502507
def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
503508
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
@@ -568,17 +573,54 @@ def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
568573
if asize <= 0 and bsize <= 0:
569574
besti, bestj, bestsize = alo, blo, 0
570575
else:
571-
# Constant to construct automaton is roughly 6.
572-
# Constant to run automaton is roughly 2.
573-
# This has been tested on a range of data sets.
574-
# For that specific set it gave selection accuracy of 95%.
575-
# Weak spot in this is cases with little or no element overlap at all.
576-
# However, such check would have more cost than benefit.
577-
automaton_cost = bsize * 6 + asize * 2
578-
simple_cost = self._max_bcount * asize
579-
use_automaton = simple_cost > automaton_cost
580-
calc = self._get_lcsub_calculator(use_automaton)
581-
besti, bestj, bestsize = calc.find(a, alo, ahi, blo, bhi)
576+
# Can trim a from both ends while characters are not in b
577+
# This is cheap and we have bcounts at all times
578+
bcounts = self._bcounts
579+
tmp_alo = alo
580+
tmp_ahi = ahi
581+
while tmp_alo < tmp_ahi and a[tmp_alo] not in bcounts:
582+
tmp_alo += 1
583+
while tmp_alo < tmp_ahi and a[tmp_ahi - 1] not in bcounts:
584+
tmp_ahi -= 1
585+
tmp_asize = tmp_ahi - tmp_alo
586+
if tmp_asize <= 0:
587+
besti, bestj, bestsize = alo, blo, 0
588+
else:
589+
# Constant to construct automaton is roughly 6.
590+
# Constant to run automaton is roughly 1.
591+
# This has been tested on a range of data sets.
592+
# It gave selection accuracy of ~95%.
593+
# Weak spot is cases with little or no element overlap at all.
594+
# However, such a check would likely have more cost than benefit.
595+
simple_calc = self._lcsub_simple
596+
automaton = self._lcsub_automaton
597+
598+
automaton_cost = tmp_asize
599+
if automaton.cache != (blo, bhi):
600+
automaton_cost += bsize * 6
601+
simple_cost = self._max_bcount * tmp_asize
602+
if simple_calc.pos2 is None:
603+
simple_cost += bsize
604+
if simple_cost < automaton_cost:
605+
calc = simple_calc
606+
else:
607+
calc = automaton
608+
besti, bestj, bestsize = calc.find(a, tmp_alo, tmp_ahi, blo, bhi)
609+
610+
# NOTE: Doing it at the same time results in bigger matches!
611+
# # If bjunk or bpopular were omitted in matching (performance reasons)
612+
# # We now extend the match to capture as much as we can
613+
# if self.bjunk or self.bpopular:
614+
# while besti > alo and bestj > blo and a[besti-1] == b[bestj-1]:
615+
# besti -= 1
616+
# bestj -= 1
617+
# bestsize += 1
618+
# lasti = besti + bestsize
619+
# lastj = bestj + bestsize
620+
# while lasti < ahi and lastj < bhi and a[lasti] == b[lastj]:
621+
# lasti += 1
622+
# lastj += 1
623+
# bestsize += 1
582624

583625
if self.bpopular:
584626
# Extend the best by non-junk elements on each end. In particular,

Lib/test/test_pyclbr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def test_easy(self):
172172
with temporary_main_spec():
173173
self.checkModule('doctest', ignore=("TestResults", "_SpoofOut",
174174
"DocTestCase", '_DocTestSuite'))
175-
self.checkModule('difflib', ignore=("Match",))
175+
self.checkModule('difflib', ignore=("Match", "b2j"))
176176

177177
def test_cases(self):
178178
# see test.pyclbr_input for the rationale behind the ignored symbols

0 commit comments

Comments
 (0)