3939
4040
4141def _adjust_indices (seq , start , stop ):
42- assert start >= 0
42+ if start < 0 :
43+ raise ValueError ('Starting index can not be negative' )
4344 size = len (seq )
4445 if stop is None or stop > size :
4546 stop = size
@@ -52,9 +53,10 @@ class _LCSUBSimple:
5253 Complexity:
5354 T: O(n1 + n2) best, O(n1 × n2) worst
5455 S: O(n2)
56+ , where n1 = len(a), n2 = len(b)
5557
5658 Members:
57- b2j for x in b, b2j[x] is a list of the indices (into b)
59+ _b2j for x in b, _b2j[x] is a list of the indices (into b)
5860 at which x appears; junk elements do not appear
5961 """
6062
@@ -73,17 +75,18 @@ def isbuilt(self, blo, bhi):
7375
7476 def _get_b2j (self ):
7577 b2j = self ._b2j
76- if b2j is None :
77- b2j = {} # positions of each element in b
78- for i , elt in enumerate (self .b ):
79- indices = b2j .setdefault (elt , [])
80- indices .append (i )
81- junk = self .junk
82- if junk :
83- for elt in junk :
84- del b2j [elt ]
85- self ._b2j = b2j
78+ if b2j is not None :
79+ return b2j
8680
81+ b2j = {} # positions of each element in b
82+ for i , elt in enumerate (self .b ):
83+ indices = b2j .setdefault (elt , [])
84+ indices .append (i )
85+ junk = self .junk
86+ if junk :
87+ for elt in junk :
88+ del b2j [elt ]
89+ self ._b2j = b2j
8790 return b2j
8891
8992 def find (self , a , alo = 0 , ahi = None , blo = 0 , bhi = None ):
@@ -120,18 +123,13 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
120123 return besti , bestj , bestsize
121124
122125
123- _LENGTH = 0
124- _LINK = 1
125- _NEXT = 2
126- _POS = 3
127-
128-
129126class _LCSUBAutomaton :
130127 """Suffix Automaton for finding longest common substring.
131128
132129 Complexity:
133130 T: O(n1 + n2) - roughly 2 * n1 + 6 * n2
134131 S: O(n2) - maximum nodes: 2 * n2 + 1
132+ , where n1 = len(a), n2 = len(b)
135133
136134 Node spec:
137135 node: list = [length: int, link: list, next: dict, end_pos: int]
@@ -157,62 +155,58 @@ def isbuilt(self, blo, bhi):
157155
158156 def _get_root (self , blo , bhi ):
159157 """
160- Automaton needs to rebuild for every (blo, bhi)
161- This is made to cache the last one and only rebuild on new values
162-
163- Note that to construct Automaton that can be queried for any
164- (blo, bhi), each node would need to store a store a set of
165- indices. And this is prone to O(n^2) memory explosion.
166- Current approach maintains reasonable memory guarantees
167- and is also much simpler in comparison.
158+ The automaton must be rebuilt for every distinct (blo, bhi) range.
159+ The most recently built one is cached and rebuilt only for a new range.
168160 """
169161 key = (blo , bhi )
170162 root = self ._root
171- if root is None or self ._cache != key :
172- root = [0 , None , {}, - 1 ]
173- b = self .b
174- junk = self .junk
175- last_len = 0
176- last = root
177- for j in range (blo , bhi ):
178- c = b [j ]
179- if c in junk :
180- last_len = 0
181- last = root
163+ if root is not None and self ._cache == key :
164+ return root
165+
166+ LEN , LINK , NEXT , EPOS = 0 , 1 , 2 , 3
167+ root = [0 , None , {}, - 1 ]
168+ b = self .b
169+ junk = self .junk
170+ last_len = 0
171+ last = root
172+ for j in range (blo , bhi ):
173+ c = b [j ]
174+ if c in junk :
175+ last_len = 0
176+ last = root
177+ else :
178+ last_len += 1
179+ curr = [last_len , None , {}, j ]
180+
181+ p = last
182+ p_next = p [NEXT ]
183+ while c not in p_next :
184+ p_next [c ] = curr
185+ if p is root :
186+ curr [LINK ] = root
187+ break
188+ p = p [LINK ]
189+ p_next = p [NEXT ]
182190 else :
183- last_len += 1
184- curr = [last_len , None , {}, j ]
185-
186- p = last
187- p_next = p [_NEXT ]
188- while c not in p_next :
189- p_next [c ] = curr
190- if p is root :
191- curr [_LINK ] = root
192- break
193- p = p [_LINK ]
194- p_next = p [_NEXT ]
191+ q = p_next [c ]
192+ p_len_p1 = p [LEN ] + 1
193+ if p_len_p1 == q [LEN ]:
194+ curr [LINK ] = q
195195 else :
196- q = p_next [c ]
197- p_length_p1 = p [_LENGTH ] + 1
198- if p_length_p1 == q [_LENGTH ]:
199- curr [_LINK ] = q
200- else :
201- # Copy `q[_POS]` to ensure leftmost match in b
202- clone = [p_length_p1 , q [_LINK ], q [_NEXT ].copy (), q [_POS ]]
203- while (p_next := p [_NEXT ]).get (c ) is q :
204- p_next [c ] = clone
205- if p is root :
206- break
207- p = p [_LINK ]
208-
209- q [_LINK ] = curr [_LINK ] = clone
210-
211- last = curr
212-
213- self ._root = root
214- self ._cache = key
196+ # Copy `q[EPOS]` to ensure leftmost match in b
197+ clone = [p_len_p1 , q [LINK ], q [NEXT ].copy (), q [EPOS ]]
198+ while (p_next := p [NEXT ]).get (c ) is q :
199+ p_next [c ] = clone
200+ if p is root :
201+ break
202+ p = p [LINK ]
203+
204+ q [LINK ] = curr [LINK ] = clone
205+
206+ last = curr
215207
208+ self ._root = root
209+ self ._cache = key
216210 return root
217211
218212 def find (self , a , alo = 0 , ahi = None , blo = 0 , bhi = None ):
@@ -221,6 +215,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
221215 if alo >= ahi or blo >= bhi :
222216 return (alo , blo , 0 )
223217
218+ LEN , LINK , NEXT , EPOS = 0 , 1 , 2 , 3
224219 root = self ._get_root (blo , bhi )
225220 junk = self .junk
226221 v = root
@@ -235,11 +230,11 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
235230 v = root
236231 l = 0
237232 else :
238- while v is not root and c not in v [_NEXT ]:
239- v = v [_LINK ]
240- l = v [_LENGTH ]
233+ while v is not root and c not in v [NEXT ]:
234+ v = v [LINK ]
235+ l = v [LEN ]
241236
242- v_next = v [_NEXT ]
237+ v_next = v [NEXT ]
243238 if c in v_next :
244239 v = v_next [c ]
245240 l += 1
@@ -252,8 +247,7 @@ def find(self, a, alo=0, ahi=None, blo=0, bhi=None):
252247 return (alo , blo , 0 )
253248
254249 start_in_s1 = best_pos + 1 - best_len
255- end_in_s2 = best_state [_POS ]
256- start_in_s2 = end_in_s2 + 1 - best_len
250+ start_in_s2 = best_state [EPOS ] + 1 - best_len
257251 return (start_in_s1 , start_in_s2 , best_len )
258252
259253
0 commit comments