@@ -11,7 +11,7 @@ typedef unsigned char uchar;
1111typedef uchar bool ;
1212
1313// Constants
14- const ull gDVC_grow_by = 100 ;
14+ const ull gDCV_grow_by = 100 ;
1515
1616#ifdef DEBUG
1717#define DBG_check (vec ) assert(DCV_dbg_check_integrity(vec))
@@ -170,8 +170,8 @@ int DCV_reserve_memory(DeltaChunkVector* vec, uint num_dc)
170170 return 1 ;
171171 }
172172
173- if (num_dc - vec -> reserved_size ){
174- num_dc += gDVC_grow_by ;
173+ if (num_dc - vec -> reserved_size < 10 ){
174+ num_dc += gDCV_grow_by ;
175175 }
176176
177177#ifdef DEBUG
@@ -255,6 +255,12 @@ ull DCV_rbound(const DeltaChunkVector* vec)
255255 return DC_rbound (DCV_last (vec ));
256256}
257257
258+ inline
259+ ull DCV_size (const DeltaChunkVector * vec )
260+ {
261+ return DCV_rbound (vec ) - DCV_lbound (vec );
262+ }
263+
258264inline
259265int DCV_empty (const DeltaChunkVector * vec )
260266{
@@ -269,6 +275,14 @@ const DeltaChunk* DCV_end(const DeltaChunkVector* vec)
269275 return vec -> mem + vec -> size ;
270276}
271277
278+ // return first item in vector
279+ inline
280+ DeltaChunk * DCV_first (const DeltaChunkVector * vec )
281+ {
282+ assert (!DCV_empty (vec ));
283+ return vec -> mem ;
284+ }
285+
272286void DCV_destroy (DeltaChunkVector * vec )
273287{
274288 if (vec -> mem ){
@@ -306,7 +320,7 @@ void DCV_reset(DeltaChunkVector* vec)
306320 if (vec -> size == 0 )
307321 return ;
308322
309- DeltaChunk * dc = vec -> mem ;
323+ DeltaChunk * dc = DCV_first ( vec ) ;
310324 const DeltaChunk * dcend = DCV_end (vec );
311325 for (;dc < dcend ; dc ++ ){
312326 DC_destroy (dc );
@@ -322,7 +336,7 @@ static inline
322336DeltaChunk * DCV_append (DeltaChunkVector * vec )
323337{
324338 if (vec -> size + 1 > vec -> reserved_size ){
325- DCV_grow_by (vec , gDVC_grow_by );
339+ DCV_grow_by (vec , gDCV_grow_by );
326340 }
327341
328342 DeltaChunk * next = vec -> mem + vec -> size ;
@@ -364,7 +378,7 @@ int DCV_dbg_check_integrity(const DeltaChunkVector* vec)
364378 if (DCV_empty (vec )){
365379 return 0 ;
366380 }
367- const DeltaChunk * i = vec -> mem ;
381+ const DeltaChunk * i = DCV_first ( vec ) ;
368382 const DeltaChunk * end = DCV_end (vec );
369383
370384 ull aparent_size = DCV_rbound (vec ) - DCV_lbound (vec );
@@ -380,7 +394,7 @@ int DCV_dbg_check_integrity(const DeltaChunkVector* vec)
380394 }
381395
382396 const DeltaChunk * endm1 = DCV_end (vec ) - 1 ;
383- for (i = vec -> mem ; i < endm1 ; i ++ ){
397+ for (i = DCV_first ( vec ) ; i < endm1 ; i ++ ){
384398 const DeltaChunk * n = i + 1 ;
385399 if (DC_rbound (i ) != n -> to ){
386400 return 0 ;
@@ -390,46 +404,82 @@ int DCV_dbg_check_integrity(const DeltaChunkVector* vec)
390404 return 1 ;
391405}
392406
407+ // Return the amount of chunks a slice at the given spot would have
408+ inline
409+ uint DCV_count_slice_chunks (const DeltaChunkVector * src , ull ofs , ull size )
410+ {
411+ uint num_dc = 0 ;
412+ DeltaChunk * cdc = DCV_closest_chunk (src , ofs );
413+
414+ // partial overlap
415+ if (cdc -> to != ofs ) {
416+ const ull relofs = ofs - cdc -> to ;
417+ size -= cdc -> ts - relofs < size ? cdc -> ts - relofs : size ;
418+ num_dc += 1 ;
419+ cdc += 1 ;
420+
421+ if (size == 0 ){
422+ return num_dc ;
423+ }
424+ }
425+
426+ const DeltaChunk * vecend = DCV_end (src );
427+ for ( ;(cdc < vecend ) && size ; ++ cdc ){
428+ num_dc += 1 ;
429+ if (cdc -> ts < size ) {
430+ size -= cdc -> ts ;
431+ } else {
432+ size = 0 ;
433+ break ;
434+ }
435+ }
436+
437+ return num_dc ;
438+ }
439+
393440// Write a slice as defined by its absolute offset in bytes and its size into the given
394- // destination. The individual chunks written will be a deep copy of the source
441+ // destination memory . The individual chunks written will be a deep copy of the source
395442// data chunks
396- // TODO: this could trigger copying many smallish add-chunk pieces - maybe some sort
397- // of append-only memory pool would improve performance
443+ // Return: number of chunks in the slice
398444inline
399- void DCV_copy_slice_to (const DeltaChunkVector * src , DeltaChunkVector * dest , ull ofs , ull size )
445+ uint DCV_copy_slice_to (const DeltaChunkVector * src , DeltaChunk * dest , ull ofs , ull size )
400446{
401447 assert (DCV_lbound (src ) <= ofs );
402448 assert ((ofs + size ) <= DCV_rbound (src ));
403449
404450 DeltaChunk * cdc = DCV_closest_chunk (src , ofs );
451+ uint num_chunks = 0 ;
405452
406453 // partial overlap
407454 if (cdc -> to != ofs ) {
408- DeltaChunk * destc = DCV_append (dest );
409455 const ull relofs = ofs - cdc -> to ;
410- DC_offset_copy_to (cdc , destc , relofs , cdc -> ts - relofs < size ? cdc -> ts - relofs : size );
456+ DC_offset_copy_to (cdc , dest , relofs , cdc -> ts - relofs < size ? cdc -> ts - relofs : size );
411457 cdc += 1 ;
412- size -= destc -> ts ;
458+ size -= dest -> ts ;
459+ dest += 1 ;
460+ num_chunks += 1 ;
413461
414462 if (size == 0 ){
415- return ;
463+ return num_chunks ;
416464 }
417465 }
418466
419467 const DeltaChunk * vecend = DCV_end (src );
420468 for ( ;(cdc < vecend ) && size ; ++ cdc )
421469 {
470+ num_chunks += 1 ;
422471 if (cdc -> ts < size ) {
423- DC_copy_to (cdc , DCV_append ( dest ) );
472+ DC_copy_to (cdc , dest ++ );
424473 size -= cdc -> ts ;
425474 } else {
426- DC_offset_copy_to (cdc , DCV_append ( dest ) , 0 , size );
475+ DC_offset_copy_to (cdc , dest ++ , 0 , size );
427476 size = 0 ;
428477 break ;
429478 }
430479 }
431480
432481 assert (size == 0 );
482+ return num_chunks ;
433483}
434484
435485
@@ -458,64 +508,85 @@ void DCV_replace_one_by_many(const DeltaChunkVector* from, DeltaChunkVector* to,
458508 }
459509
460510 // Finally copy all the items in
461- memcpy ((void * ) at , (void * )from -> mem , from -> size * sizeof (DeltaChunk ));
511+ memcpy ((void * ) at , (void * )DCV_first ( from ) , from -> size * sizeof (DeltaChunk ));
462512
463513 // FINALLY: update size
464514 to -> size += from -> size - 1 ;
465515}
466516
467517// Take slices of bdcv into the corresponding area of the tdcv, which is the topmost
468- // delta to apply. tmpl is used as temporary space and must be initialzed and destroyed by the
469- // caller
470- void DCV_connect_with_base (DeltaChunkVector * tdcv , const DeltaChunkVector * bdcv , DeltaChunkVector * tmpl )
518+ // delta to apply.
519+ bool DCV_connect_with_base (DeltaChunkVector * tdcv , const DeltaChunkVector * bdcv )
471520{
472- Py_ssize_t dci = 0 ;
473- Py_ssize_t iend = tdcv -> size ;
474- DeltaChunk * dc ;
475-
476521 DBG_check (tdcv );
477522 DBG_check (bdcv );
478523
479- for (;dci < iend ; dci ++ )
524+ uint * offset_array = PyMem_Malloc (tdcv -> size * sizeof (uint ));
525+ if (!offset_array ){
526+ return 0 ;
527+ }
528+
529+ fprintf (stderr , "old size = %i\n" , (int )tdcv -> size );
530+ uint * pofs = offset_array ;
531+ uint num_addchunks = 0 ;
532+
533+ DeltaChunk * dc = DCV_first (tdcv );
534+ const DeltaChunk * dcend = DCV_end (tdcv );
535+ const ull oldsize = DCV_size (tdcv );
536+
537+ // OFFSET RUN
538+ for (;dc < dcend ; dc ++ , pofs ++ )
480539 {
481540 // Data chunks don't need processing
482- dc = DCV_get ( tdcv , dci ) ;
541+ * pofs = num_addchunks ;
483542 if (dc -> data ){
484543 continue ;
485544 }
486545
487- // Copy Chunk Handling
488- DCV_copy_slice_to (bdcv , tmpl , dc -> so , dc -> ts );
489- DBG_check (tmpl );
490- assert (tmpl -> size );
491-
492- // move target bounds
493- DeltaChunk * tdc = tmpl -> mem ;
494- DeltaChunk * tdcend = tmpl -> mem + tmpl -> size ;
495- const ull ofs = dc -> to - dc -> so ;
496- for (;tdc < tdcend ; tdc ++ ){
497- tdc -> to += ofs ;
546+ // offset the next chunk by the amount of chunks in the slice
547+ // - 1, because we replace our own chunk
548+ num_addchunks += DCV_count_slice_chunks (bdcv , dc -> so , dc -> ts ) - 1 ;
549+ }
550+
551+ // reserve enough memory to hold all the new chunks
552+ // reinit pointers, array could have been reallocated
553+ DCV_reserve_memory (tdcv , tdcv -> size + num_addchunks );
554+ dc = DCV_last (tdcv );
555+ dcend = DCV_first (tdcv ) - 1 ;
556+
557+ // now, that we have our pointers with the old size
558+ tdcv -> size += num_addchunks ;
559+
560+ // Insert slices, from the end to the beginning, which allows memcpy
561+ // to be used, with a little help of the offset array
562+ for (pofs -= 1 ; dc > dcend ; dc -- , pofs -- )
563+ {
564+ // Data chunks don't need processing
565+ const uint ofs = * pofs ;
566+ if (dc -> data ){
567+ // TODO: peak the preceeding chunks to figure out whether they are
568+ // all just moved by ofs. In that case, they can move as a whole!
569+ // just copy the chunk according to its offset
570+ if (ofs ){
571+ memcpy ((void * )(dc + ofs ), (void * )dc , sizeof (DeltaChunk ));
572+ }
573+ continue ;
498574 }
499575
500- // insert slice into our list
501- if (tmpl -> size == 1 ){
502- // Its not data, so destroy is not really required, anyhow ...
503- DC_destroy (dc );
504- * dc = * DCV_get (tmpl , 0 );
505- } else {
506- DCV_reserve_memory (tdcv , tdcv -> size + tmpl -> size - 1 + gDVC_grow_by );
507- dc = DCV_get (tdcv , dci );
508- DCV_replace_one_by_many (tmpl , tdcv , dc );
509- // Compensate for us being replaced
510- dci += tmpl -> size - 1 ;
511- iend += tmpl -> size - 1 ;
576+ // Copy Chunks, and move their target offset into place
577+ DeltaChunk * tdc = dc + ofs ;
578+ DeltaChunk * tdcend = tdc + DCV_copy_slice_to (bdcv , tdc , dc -> so , dc -> ts );
579+ const ull relofs = dc -> to - dc -> so ;
580+ for (;tdc < tdcend ; tdc ++ ){
581+ tdc -> to += relofs ;
512582 }
513-
514- DBG_check (tdcv );
515-
516- // make sure the members will not be deallocated by the list
517- DCV_forget_members (tmpl );
518583 }
584+
585+ fprintf (stderr , "NEW size = %i\n" , (int )tdcv -> size );
586+ DBG_check (tdcv );
587+ assert (DCV_size (tdcv ) == oldsize );
588+ PyMem_Free (offset_array );
589+ return 1 ;
519590}
520591
521592// DELTA CHUNK LIST (PYTHON)
@@ -699,10 +770,8 @@ static PyObject* connect_deltas(PyObject *self, PyObject *dstreams)
699770
700771 DeltaChunkVector dcv ;
701772 DeltaChunkVector tdcv ;
702- DeltaChunkVector tmpl ;
703773 DCV_init (& dcv , 100 ); // should be enough to keep the average text file
704774 DCV_init (& tdcv , 0 );
705- DCV_init (& tmpl , 200 );
706775
707776 unsigned int dsi = 0 ;
708777 PyObject * ds = 0 ;
@@ -725,7 +794,6 @@ static PyObject* connect_deltas(PyObject *self, PyObject *dstreams)
725794 const ull base_size = msb_size (& data , dend );
726795 const ull target_size = msb_size (& data , dend );
727796
728- // estimate number of ops - assume one third adds, half two byte (size+offset) copies
729797 // Assume good compression for the adds
730798 const uint approx_num_cmds = ((dlen / 3 ) / 10 ) + (((dlen / 3 ) * 2 ) / (2 + 2 + 1 ));
731799 DCV_reserve_memory (& dcv , approx_num_cmds );
@@ -824,7 +892,9 @@ static PyObject* connect_deltas(PyObject *self, PyObject *dstreams)
824892 }
825893
826894 if (!is_first_run ){
827- DCV_connect_with_base (& tdcv , & dcv , & tmpl );
895+ if (!DCV_connect_with_base (& tdcv , & dcv )){
896+ error = 1 ;
897+ }
828898 }
829899
830900 #ifdef DEBUG
@@ -859,7 +929,6 @@ static PyObject* connect_deltas(PyObject *self, PyObject *dstreams)
859929 Py_DECREF (stream_iter );
860930 }
861931
862- DCV_destroy (& tmpl );
863932 if (dsi > 1 ){
864933 // otherwise dcv equals tcl
865934 DCV_destroy (& dcv );
0 commit comments