From a714fcdb52db4ce31fb53fa1e812ed30bf156a11 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 26 Aug 2015 20:41:26 -0400 Subject: [PATCH] adding a work-around for when a block of input data does not compress enough to fit into the output block. Typically the input and output blocks are of the same size, but the output block has to contain some extra information, like GZIP header and footer, so the input data must compress enough for that extra info to fit. If the input is already gzipped (or block-gzipped), or too random, it may not compress well enough. We could simply just compress fewer bytes of input data, as bgzf.c does, but this is not implemented. Instead, we choose to reduce the input block size uniformly. This may hurt compression performance. --- pbgzf.c | 8 ++++---- pbgzf.h | 2 +- pbgzip.c | 21 +++++++++++++++++---- reader.c | 10 +++++----- reader.h | 3 ++- 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pbgzf.c b/pbgzf.c index 4f9a8bb..74e2e53 100644 --- a/pbgzf.c +++ b/pbgzf.c @@ -296,12 +296,12 @@ pbgzf_init(int fd, const char* __restrict mode) } else { // read from a compressed file if(strchr(mode, 'u')) {// hidden functionality - fp->r = reader_init(fd, fp->input, 1, fp->pool); // read the uncompressed file + fp->r = reader_init(fd, fp->input, 1, fp->pool, -1); // read the uncompressed file fp->p = producer_init(fp->r); fp->c = consumers_init(fp->num_threads, fp->input, fp->output, fp->r, 2, compress_level, compress_type); // do nothing } else { - fp->r = reader_init(fd, fp->input, 0, fp->pool); // read the compressed file + fp->r = reader_init(fd, fp->input, 0, fp->pool, -1); // read the compressed file fp->p = producer_init(fp->r); fp->c = consumers_init(fp->num_threads, fp->input, fp->output, fp->r, 0, compress_level, compress_type); // inflate fp->eof_ok = bgzf_check_EOF(fp->r->fp_bgzf); @@ -696,7 +696,7 @@ void pbgzf_set_cache_size(PBGZF *fp, int cache_size) } void -pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads) +pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads, int uncompressed_block_size) { // NB: this gives us greater control over queue size and the like queue_t *input = NULL; @@ -712,7 +712,7 @@ pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_ input = queue_init(queue_size, 0, 1, num_threads); output = queue_init(queue_size, 1, num_threads, 1); - r = reader_init(f_src, input, compress, pool); + r = reader_init(f_src, input, compress, pool, uncompressed_block_size); w = writer_init(f_dst, output, compress, compress_level, compress_type, pool); c = consumers_init(num_threads, input, output, r, compress, compress_level, compress_type); p = producer_init(r); diff --git a/pbgzf.h b/pbgzf.h index 71b2477..d8a620a 100644 --- a/pbgzf.h +++ b/pbgzf.h @@ -139,7 +139,7 @@ void pbgzf_set_cache_size(PBGZF *fp, int cache_size); #endif void -pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads); +pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads, int uncompressed_block_size); #endif diff --git a/pbgzip.c b/pbgzip.c index 0619ee6..1bfccd9 100644 --- a/pbgzip.c +++ b/pbgzip.c @@ -52,6 +52,9 @@ pbgzip_main_usage() fprintf(stderr, " -t INT the compress type (0 - gz, 1 - bz2) [%d]\n", 0); #endif fprintf(stderr, " -1 .. -9 the compression level [%d]\n", Z_DEFAULT_COMPRESSION); + fprintf(stderr, " -S the block size when reading uncompressed data (must be less than or equal to %d; -1 is auto) [%d]\n", + MAX_BLOCK_SIZE, + -1); fprintf(stderr, " -h give this help\n"); fprintf(stderr, "\n"); return 1; @@ -62,14 +65,15 @@ int main(int argc, char *argv[]) { int opt, f_src, f_dst; - int32_t compress, compress_level, compress_type, pstdout, is_forced, queue_size, n_threads; + int32_t compress, compress_level, compress_type, pstdout, is_forced, queue_size, n_threads, uncompressed_block_size; compress = 1; compress_level = -1; compress_type = 0; pstdout = 0; is_forced = 0; queue_size = 1000; n_threads = detect_cpus(); + uncompressed_block_size = -1; #ifndef DISABLE_BZ2 - while((opt = getopt(argc, argv, "cdhfn:t:q:0123456789")) >= 0){ + while((opt = getopt(argc, argv, "cdhfn:t:q:S:0123456789")) >= 0){ #else - while((opt = getopt(argc, argv, "cdhfn:q:0123456789")) >= 0){ + while((opt = getopt(argc, argv, "cdhfn:q:S:0123456789")) >= 0){ #endif if('0' <= opt && opt <= '9') { compress_level = opt - '0'; @@ -84,6 +88,7 @@ main(int argc, char *argv[]) #ifndef DISABLE_BZ2 case 't': compress_type = atoi(optarg); break; #endif + case 'S': uncompressed_block_size = atoi(optarg); break; case 'h': default: return pbgzip_main_usage(); @@ -92,6 +97,14 @@ main(int argc, char *argv[]) if(argc <= 1) return pbgzip_main_usage(); + if(MAX_BLOCK_SIZE < uncompressed_block_size) { + fprintf(stderr, "[pbgzip] -S (%d) was too big; must be less than or equal to %d.\n", + uncompressed_block_size, + MAX_BLOCK_SIZE); + return 1; + } + + if(pstdout) { f_dst = fileno(stdout); } @@ -143,7 +156,7 @@ main(int argc, char *argv[]) return 1; } - pbgzf_main(f_src, f_dst, compress, compress_level, compress_type, queue_size, n_threads); + pbgzf_main(f_src, f_dst, compress, compress_level, compress_type, queue_size, n_threads, uncompressed_block_size); if(!pstdout) unlink(argv[optind]); diff --git a/reader.c b/reader.c index 2460435..6430352 100644 --- a/reader.c +++ b/reader.c @@ -11,10 +11,8 @@ #include "pbgzf.h" #include "reader.h" -static const int WINDOW_SIZE = MAX_BLOCK_SIZE; - reader_t* -reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool) +reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool, int32_t uncompressed_block_size) { reader_t *r = calloc(1, sizeof(reader_t)); @@ -23,6 +21,8 @@ reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool) } else { r->fd_file = fd; + if (-1 == uncompressed_block_size) r->uncompressed_block_size = MAX_BLOCK_SIZE; + else r->uncompressed_block_size = uncompressed_block_size; } r->input = input; r->compress = compress; @@ -117,7 +117,7 @@ reader_run(void *arg) } } else { - if((b->block_length = read(r->fd_file, b->buffer, WINDOW_SIZE)) < 0) { + if((b->block_length = read(r->fd_file, b->buffer, r->uncompressed_block_size)) < 0) { fprintf(stderr, "reader read: bug encountered\n"); exit(1); } @@ -183,7 +183,7 @@ reader_run(void *arg) } } else { - if((b->block_length = read(r->fd_file, b->buffer, WINDOW_SIZE)) < 0) { + if((b->block_length = read(r->fd_file, b->buffer, r->uncompressed_block_size)) < 0) { fprintf(stderr, "reader read: bug encountered\n"); exit(1); } diff --git a/reader.h b/reader.h index c79b17b..b990ddf 100644 --- a/reader.h +++ b/reader.h @@ -9,10 +9,11 @@ typedef struct { uint8_t is_closed; uint8_t compress; block_pool_t *pool; + int32_t uncompressed_block_size; // when read uncompressed data } reader_t; reader_t* -reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool); +reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool, int32_t uncompressed_block_size); void* reader_run(void *arg);