Skip to content

Commit 834f081

Browse files
committed
Implemented add-chunk compression, which clearly reduces chain size, but might not really be worth it in Python
1 parent 1c2caf5 commit 834f081

File tree

2 files changed

+61
-38
lines changed

2 files changed

+61
-38
lines changed

fun.py

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from itertools import islice, izip
1414

1515
from copy import copy
16+
from cStringIO import StringIO
1617

1718
# INVARIANTS
1819
OFS_DELTA = 6
@@ -57,10 +58,6 @@ def _set_delta_rbound(d, size):
5758
:return: d"""
5859
if d.ts == size:
5960
return
60-
if size == 0:
61-
raise ValueError("size to truncate to must not be 0")
62-
if size > d.ts:
63-
raise ValueError("Cannot extend rbound")
6461

6562
d.ts = size
6663

@@ -76,8 +73,6 @@ def _move_delta_lbound(d, bytes):
7673
:return: d"""
7774
if bytes == 0:
7875
return
79-
if bytes >= d.ts:
80-
raise ValueError("Cannot move offset that much")
8176

8277
d.to += bytes
8378
d.so += bytes
@@ -139,7 +134,6 @@ def has_copy_chunklist(self):
139134
def set_copy_chunklist(self, dcl):
140135
"""Set the deltachunk list to be used as basis for copying.
141136
:note: only works if this chunk is a copy delta chunk"""
142-
assert self.data is None, "Cannot assign chain to add delta chunk"
143137
self.data = dcl
144138
self.sob = self.so
145139
self.so = 0 # allows lbound moves to be virtual
@@ -150,13 +144,13 @@ def apply(self, bbuf, write):
150144
:param write: write method to call with data to write"""
151145
if self.data is None:
152146
# COPY DATA FROM SOURCE
153-
assert len(bbuf) - self.so - self.ts > -1
154147
write(buffer(bbuf, self.so, self.ts))
155148
elif isinstance(self.data, DeltaChunkList):
156149
self.data.apply(bbuf, write, self.so, self.ts)
157150
else:
158151
# APPEND DATA
159152
# whats faster: if + 4 function calls or just a write with a slice ?
153+
# Considering data can be larger than 127 bytes now, it should be worth it
160154
if self.ts < len(self.data):
161155
write(self.data[:self.ts])
162156
else:
@@ -209,11 +203,54 @@ def connect_with(self, bdcl):
209203
:param bdcl: DeltaChunkList to serve as base"""
210204
for dc in self:
211205
if not dc.has_data():
212-
# dc.set_copy_chunklist(bdcl[dc.copy_offset():dc.ts])
213206
dc.set_copy_chunklist(bdcl[dc.so:dc.ts])
214207
# END handle overlap
215208
# END for each dc
216209

210+
def compress(self):
    """Alter the list to reduce the amount of nodes. Currently we concatenate
    consecutive add-chunks (chunks that carry their own data) into single
    chunks. Copy-chunks are left untouched, and the byte stream produced by
    ``apply`` is unchanged - only the node count shrinks.
    :return: self"""
    slen = len(self)
    if slen < 2:
        return self

    i = 0
    first_data_index = None
    while i < slen:
        dc = self[i]
        i += 1
        if not dc.has_data():
            # A run of add-chunks ended at index i-2. Only merge runs of at
            # least 3 chunks - for shorter runs the copy overhead is not
            # worth it.
            if first_data_index is not None and i - 2 - first_data_index > 1:
                nd = StringIO()                     # new data
                so = self[first_data_index].to      # start offset in target buffer
                for x in xrange(first_data_index, i - 1):
                    xdc = self[x]
                    nd.write(xdc.data[:xdc.ts])
                # END collect data

                del(self[first_data_index:i - 1])
                buf = nd.getvalue()
                self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                # list shrank - recompute length and continue right after
                # the merged chunk
                slen = len(self)
                i = first_data_index + 1
            # END concatenate data
            first_data_index = None
            continue
        # END skip non-data chunks

        if first_data_index is None:
            first_data_index = i - 1
        # END remember where the current run of add-chunks starts
    # END iterate list

    # Merge a trailing run as well - the loop above only merges when a
    # copy-chunk terminates the run, so a run reaching the end of the list
    # would otherwise never be compressed.
    if first_data_index is not None and slen - first_data_index > 2:
        nd = StringIO()
        so = self[first_data_index].to
        for x in xrange(first_data_index, slen):
            xdc = self[x]
            nd.write(xdc.data[:xdc.ts])
        # END collect data
        del(self[first_data_index:slen])
        buf = nd.getvalue()
        self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))
    # END handle trailing run

    return self
253+
217254
def apply(self, bbuf, write, lbound_offset=0, size=0):
218255
"""Apply the chain's changes and write the final result using the passed
219256
write function.
@@ -232,12 +269,6 @@ def apply(self, bbuf, write, lbound_offset=0, size=0):
232269
if size == 0:
233270
size = self.rbound() - absofs
234271
# END initialize size
235-
if absofs + size > self.rbound():
236-
raise ValueError("Cannot apply more bytes than there are in this chain")
237-
# END sanity check
238-
239-
if size > self.rbound() - absofs:
240-
raise ValueError("Trying to apply more than there is available")
241272

242273
dapply = DeltaChunk.apply
243274
if lbound_offset or absofs + size != self.rbound():
@@ -347,7 +378,7 @@ def __getslice__(self, absofs, size):
347378
# END for each chunk
348379
assert size == 0, "size was %i" % size
349380

350-
ndcl.check_integrity()
381+
# ndcl.check_integrity()
351382
return ndcl
352383

353384

@@ -540,7 +571,8 @@ def connect_deltas(dstreams):
540571
dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
541572
tbw += cp_size
542573
elif c:
543-
# TODO: Concatenate multiple deltachunks
574+
# NOTE: in C, the data chunks should probably be concatenated here.
575+
# In python, we do it as a post-process
544576
dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c]))
545577
i += c
546578
tbw += c
@@ -549,12 +581,14 @@ def connect_deltas(dstreams):
549581
# END handle command byte
550582
# END while processing delta data
551583

584+
dcl.compress()
585+
552586
# merge the lists !
553587
if bdcl is not None:
554588
dcl.connect_with(bdcl)
555589
# END handle merge
556590

557-
dcl.check_integrity()
591+
# dcl.check_integrity()
558592

559593
# prepare next base
560594
bdcl = dcl

stream.py

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,14 @@ def __init__(self, stream_list):
322322
self._br = 0
323323

324324
def _set_cache_(self, attr):
325+
# the direct algorithm is fastest and most direct if there is only one
326+
# delta. Also, the extra overhead might not be worth it for items smaller
327+
# than X - definitely the case in python
328+
#print "num streams", len(self._dstreams)
329+
#if len(self._dstreams) == 1 or (len(self._dstreams) * self._dstreams.size) > 25*1000*1000:
330+
if len(self._dstreams) == 1:
331+
return self._set_cache_brute_(attr)
332+
325333
# Aggregate all deltas into one delta in reverse order. Hence we take
326334
# the last delta, and reverse-merge its ancestor delta, until we receive
327335
# the final delta data stream.
@@ -345,26 +353,7 @@ def _set_cache_(self, attr):
345353

346354
self._mm_target.seek(0)
347355

348-
## DEBUG ##
349-
mt = self._mm_target
350-
for ds in self._dstreams:
351-
ds.stream.seek(0)
352-
self._bstream.stream.seek(0)
353-
self._set_cache_old(attr)
354-
355-
import chardet
356-
357-
print "num dstreams", len(self._dstreams)
358-
#if chardet.detect(mt[:self._size])['encoding'] == 'ascii':
359-
if self._mm_target[:self._size] != mt[:]:
360-
open("working.txt", "w").write(self._mm_target[:self._size])
361-
open("incorrect.txt", "w").write(mt[:])
362-
raise AssertionError("Output didn't match")
363-
# END debug
364-
print "success"
365-
366-
367-
def _set_cache_old(self, attr):
356+
def _set_cache_brute_(self, attr):
368357
"""If we are here, we apply the actual deltas"""
369358

370359
buffer_info_list = list()

0 commit comments

Comments
 (0)