2323#include < random>
2424#include < sstream>
2525
26- #include < boost/crc.hpp> // for boost::crc_32_type
27- #include < boost/iostreams/device/file.hpp>
28- #include < boost/iostreams/filter/gzip.hpp>
29- #include < boost/iostreams/filter/zlib.hpp>
30-
3126#ifdef SNAPPY_CODEC_AVAILABLE
3227#include < snappy.h>
3328#endif
3429
30+ #include < zlib.h>
31+
3532namespace avro {
3633using std::copy;
3734using std::istringstream;
@@ -55,12 +52,8 @@ const string AVRO_SNAPPY_CODEC = "snappy";
5552const size_t minSyncInterval = 32 ;
5653const size_t maxSyncInterval = 1u << 30 ;
5754
58- boost::iostreams::zlib_params get_zlib_params () {
59- boost::iostreams::zlib_params ret;
60- ret.method = boost::iostreams::zlib::deflated;
61- ret.noheader = true ;
62- return ret;
63- }
55+ // Recommended by https://www.zlib.net/zlib_how.html
56+ const size_t zlibBufGrowSize = 128 * 1024 ;
6457
6558} // namespace
6659
@@ -144,21 +137,45 @@ void DataFileWriterBase::sync() {
144137 std::unique_ptr<InputStream> in = memoryInputStream (*buffer_);
145138 copy (*in, *stream_);
146139 } else if (codec_ == DEFLATE_CODEC) {
147- std::vector<char > buf;
140+ std::vector<uint8_t > buf;
148141 {
149- boost::iostreams::filtering_ostream os;
150- os.push (boost::iostreams::zlib_compressor (get_zlib_params ()));
151- os.push (boost::iostreams::back_inserter (buf));
152- const uint8_t *data;
153- size_t len;
142+ z_stream zs;
143+ zs.zalloc = Z_NULL;
144+ zs.zfree = Z_NULL;
145+ zs.opaque = Z_NULL;
146+
147+ int ret = deflateInit2 (&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15 , 8 , Z_DEFAULT_STRATEGY);
148+ if (ret != Z_OK) {
149+ throw Exception (" Failed to initialize deflate, error: {}" , ret);
150+ }
154151
155152 std::unique_ptr<InputStream> input = memoryInputStream (*buffer_);
156- while (input->next (&data, &len)) {
157- boost::iostreams::write (os, reinterpret_cast <const char *>(data), len);
153+ const uint8_t *data;
154+ size_t len;
155+ while (ret != Z_STREAM_END && input->next (&data, &len)) {
156+ zs.avail_in = static_cast <uInt>(len);
157+ zs.next_in = const_cast <Bytef *>(data);
158+ bool flush = (zs.total_in + len) >= buffer_->byteCount ();
159+ do {
160+ if (zs.total_out == buf.size ()) {
161+ buf.resize (buf.size () + zlibBufGrowSize);
162+ }
163+ zs.avail_out = static_cast <uInt>(buf.size () - zs.total_out );
164+ zs.next_out = buf.data () + zs.total_out ;
165+ ret = deflate (&zs, flush ? Z_FINISH : Z_NO_FLUSH);
166+ if (ret == Z_STREAM_END) {
167+ break ;
168+ }
169+ if (ret != Z_OK) {
170+ throw Exception (" Failed to deflate, error: {}" , ret);
171+ }
172+ } while (zs.avail_out == 0 );
158173 }
174+
175+ buf.resize (zs.total_out );
176+ (void ) deflateEnd (&zs);
159177 } // make sure all is flushed
160- std::unique_ptr<InputStream> in = memoryInputStream (
161- reinterpret_cast <const uint8_t *>(buf.data ()), buf.size ());
178+ std::unique_ptr<InputStream> in = memoryInputStream (buf.data (), buf.size ());
162179 int64_t byteCount = buf.size ();
163180 avro::encode (*encoderPtr_, byteCount);
164181 encoderPtr_->flush ();
@@ -167,35 +184,28 @@ void DataFileWriterBase::sync() {
167184 } else if (codec_ == SNAPPY_CODEC) {
168185 std::vector<char > temp;
169186 std::string compressed;
170- boost::crc_32_type crc;
171- {
172- boost::iostreams::filtering_ostream os;
173- os.push (boost::iostreams::back_inserter (temp));
174- const uint8_t *data;
175- size_t len;
176187
177- std::unique_ptr<InputStream> input = memoryInputStream (*buffer_);
178- while (input->next (&data, &len)) {
179- boost::iostreams::write (os, reinterpret_cast <const char *>(data),
180- len);
181- }
182- } // make sure all is flushed
188+ const uint8_t *data;
189+ size_t len;
190+ std::unique_ptr<InputStream> input = memoryInputStream (*buffer_);
191+ while (input->next (&data, &len)) {
192+ temp.insert (temp.end (), reinterpret_cast <const char *>(data),
193+ reinterpret_cast <const char *>(data) + len);
194+ }
183195
184- crc.process_bytes (reinterpret_cast <const char *>(temp.data ()),
185- temp.size ());
186196 // For Snappy, add the CRC32 checksum
187- auto checksum = crc ();
197+ auto checksum = crc32 (0 , reinterpret_cast <const Bytef *>(temp.data ()),
198+ static_cast <uInt>(temp.size ()));
188199
189200 // Now compress
190201 size_t compressed_size = snappy::Compress (
191202 reinterpret_cast <const char *>(temp.data ()), temp.size (),
192203 &compressed);
204+
193205 temp.clear ();
194- {
195- boost::iostreams::filtering_ostream os;
196- os.push (boost::iostreams::back_inserter (temp));
197- boost::iostreams::write (os, compressed.c_str (), compressed_size);
198- }
206+ temp.insert (temp.end (), compressed.c_str (),
207+ compressed.c_str () + compressed_size);
208+
199209 temp.push_back (static_cast <char >((checksum >> 24 ) & 0xFF ));
200210 temp.push_back (static_cast <char >((checksum >> 16 ) & 0xFF ));
201211 temp.push_back (static_cast <char >((checksum >> 8 ) & 0xFF ));
@@ -285,8 +295,7 @@ void DataFileReaderBase::init(const ValidSchema &readerSchema) {
285295static void drain (InputStream &in) {
286296 const uint8_t *p = nullptr ;
287297 size_t n = 0 ;
288- while (in.next (&p, &n))
289- ;
298+ while (in.next (&p, &n));
290299}
291300
292301char hex (unsigned int x) {
@@ -384,7 +393,6 @@ void DataFileReaderBase::readDataBlock() {
384393 dataStream_ = std::move (st);
385394#ifdef SNAPPY_CODEC_AVAILABLE
386395 } else if (codec_ == SNAPPY_CODEC) {
387- boost::crc_32_type crc;
388396 uint32_t checksum = 0 ;
389397 compressed_.clear ();
390398 uncompressed.clear ();
@@ -408,35 +416,67 @@ void DataFileReaderBase::readDataBlock() {
408416 throw Exception (
409417 " Snappy Compression reported an error when decompressing" );
410418 }
411- crc. process_bytes (uncompressed. c_str (), uncompressed.size ());
412- auto c = crc ( );
419+ auto c = crc32 ( 0 , reinterpret_cast < const Bytef *>( uncompressed.c_str ()),
420+ static_cast <uInt>(uncompressed. size ()) );
413421 if (checksum != c) {
414422 throw Exception (
415423 " Checksum did not match for Snappy compression: Expected: {}, computed: {}" ,
416424 checksum, c);
417425 }
418- os_.reset (new boost::iostreams::filtering_istream ());
419- os_->push (
420- boost::iostreams::basic_array_source<char >(uncompressed.c_str (),
421- uncompressed.size ()));
422- std::unique_ptr<InputStream> in = istreamInputStream (*os_);
426+
427+ std::unique_ptr<InputStream> in = memoryInputStream (
428+ reinterpret_cast <const uint8_t *>(uncompressed.c_str ()),
429+ uncompressed.size ());
423430
424431 dataDecoder_->init (*in);
425432 dataStream_ = std::move (in);
426433#endif
427434 } else {
428435 compressed_.clear ();
429- const uint8_t *data;
430- size_t len;
431- while (st->next (&data, &len)) {
432- compressed_.insert (compressed_.end (), data, data + len);
436+ uncompressed.clear ();
437+
438+ {
439+ z_stream zs;
440+ zs.zalloc = Z_NULL;
441+ zs.zfree = Z_NULL;
442+ zs.opaque = Z_NULL;
443+ zs.avail_in = 0 ;
444+ zs.next_in = Z_NULL;
445+
446+ int ret = inflateInit2 (&zs, /* windowBits=*/ -15 );
447+ if (ret != Z_OK) {
448+ throw Exception (" Failed to initialize inflate, error: {}" , ret);
449+ }
450+
451+ const uint8_t *data;
452+ size_t len;
453+ while (ret != Z_STREAM_END && st->next (&data, &len)) {
454+ zs.avail_in = static_cast <uInt>(len);
455+ zs.next_in = const_cast <Bytef *>(data);
456+ do {
457+ if (zs.total_out == uncompressed.size ()) {
458+ uncompressed.resize (uncompressed.size () + zlibBufGrowSize);
459+ }
460+ zs.avail_out = static_cast <uInt>(uncompressed.size () - zs.total_out );
461+ zs.next_out = reinterpret_cast <Bytef *>(uncompressed.data () + zs.total_out );
462+ ret = inflate (&zs, Z_NO_FLUSH);
463+ if (ret == Z_STREAM_END) {
464+ break ;
465+ }
466+ if (ret != Z_OK) {
467+ throw Exception (" Failed to inflate, error: {}" , ret);
468+ }
469+ } while (zs.avail_out == 0 );
470+ }
471+
472+ uncompressed.resize (zs.total_out );
473+ (void ) inflateEnd (&zs);
433474 }
434- os_.reset (new boost::iostreams::filtering_istream ());
435- os_->push (boost::iostreams::zlib_decompressor (get_zlib_params ()));
436- os_->push (boost::iostreams::basic_array_source<char >(
437- compressed_.data (), compressed_.size ()));
438475
439- std::unique_ptr<InputStream> in = nonSeekableIstreamInputStream (*os_);
476+ std::unique_ptr<InputStream> in = memoryInputStream (
477+ reinterpret_cast <const uint8_t *>(uncompressed.c_str ()),
478+ uncompressed.size ());
479+
440480 dataDecoder_->init (*in);
441481 dataStream_ = std::move (in);
442482 }
0 commit comments