Skip to content

Commit

Permalink
adding a work-around for when a block of input data does not compress
Browse files Browse the repository at this point in the history
enough to fit into the output block.  Typically the input and output
blocks are the same size, but the output block must also hold some
extra information, such as the GZIP header and footer, so the input data
must compress enough for that extra information to fit.  If the input is
already gzipped (or block-gzipped), or too random, it may not compress
well enough.  We could simply compress fewer bytes of input data, as
bgzf.c does, but this is not implemented.  Instead, we choose to reduce
the input block size uniformly.  This may hurt compression performance.
  • Loading branch information
nh13 committed Aug 27, 2015
1 parent 6b7b3ae commit a714fcd
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 15 deletions.
8 changes: 4 additions & 4 deletions pbgzf.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,12 +296,12 @@ pbgzf_init(int fd, const char* __restrict mode)
}
else { // read from a compressed file
if(strchr(mode, 'u')) {// hidden functionality
fp->r = reader_init(fd, fp->input, 1, fp->pool); // read the uncompressed file
fp->r = reader_init(fd, fp->input, 1, fp->pool, -1); // read the uncompressed file
fp->p = producer_init(fp->r);
fp->c = consumers_init(fp->num_threads, fp->input, fp->output, fp->r, 2, compress_level, compress_type); // do nothing
}
else {
fp->r = reader_init(fd, fp->input, 0, fp->pool); // read the compressed file
fp->r = reader_init(fd, fp->input, 0, fp->pool, -1); // read the compressed file
fp->p = producer_init(fp->r);
fp->c = consumers_init(fp->num_threads, fp->input, fp->output, fp->r, 0, compress_level, compress_type); // inflate
fp->eof_ok = bgzf_check_EOF(fp->r->fp_bgzf);
Expand Down Expand Up @@ -696,7 +696,7 @@ void pbgzf_set_cache_size(PBGZF *fp, int cache_size)
}

void
pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads)
pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads, int uncompressed_block_size)
{
// NB: this gives us greater control over queue size and the like
queue_t *input = NULL;
Expand All @@ -712,7 +712,7 @@ pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_
input = queue_init(queue_size, 0, 1, num_threads);
output = queue_init(queue_size, 1, num_threads, 1);

r = reader_init(f_src, input, compress, pool);
r = reader_init(f_src, input, compress, pool, uncompressed_block_size);
w = writer_init(f_dst, output, compress, compress_level, compress_type, pool);
c = consumers_init(num_threads, input, output, r, compress, compress_level, compress_type);
p = producer_init(r);
Expand Down
2 changes: 1 addition & 1 deletion pbgzf.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void pbgzf_set_cache_size(PBGZF *fp, int cache_size);
#endif

void
pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads);
pbgzf_main(int f_src, int f_dst, int compress, int compress_level, int compress_type, int queue_size, int num_threads, int uncompressed_block_size);


#endif
21 changes: 17 additions & 4 deletions pbgzip.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ pbgzip_main_usage()
fprintf(stderr, " -t INT the compress type (0 - gz, 1 - bz2) [%d]\n", 0);
#endif
fprintf(stderr, " -1 .. -9 the compression level [%d]\n", Z_DEFAULT_COMPRESSION);
fprintf(stderr, " -S the block size when reading uncompressed data (must be less than or equal to %d; -1 is auto) [%d]\n",
MAX_BLOCK_SIZE,
-1);
fprintf(stderr, " -h give this help\n");
fprintf(stderr, "\n");
return 1;
Expand All @@ -62,14 +65,15 @@ int
main(int argc, char *argv[])
{
int opt, f_src, f_dst;
int32_t compress, compress_level, compress_type, pstdout, is_forced, queue_size, n_threads;
int32_t compress, compress_level, compress_type, pstdout, is_forced, queue_size, n_threads, uncompressed_block_size;

compress = 1; compress_level = -1; compress_type = 0;
pstdout = 0; is_forced = 0; queue_size = 1000; n_threads = detect_cpus();
uncompressed_block_size = -1;
#ifndef DISABLE_BZ2
while((opt = getopt(argc, argv, "cdhfn:t:q:0123456789")) >= 0){
while((opt = getopt(argc, argv, "cdhfn:t:q:S:0123456789")) >= 0){
#else
while((opt = getopt(argc, argv, "cdhfn:q:0123456789")) >= 0){
while((opt = getopt(argc, argv, "cdhfn:q:S:0123456789")) >= 0){
#endif
if('0' <= opt && opt <= '9') {
compress_level = opt - '0';
Expand All @@ -84,6 +88,7 @@ main(int argc, char *argv[])
#ifndef DISABLE_BZ2
case 't': compress_type = atoi(optarg); break;
#endif
case 'S': uncompressed_block_size = atoi(optarg); break;
case 'h':
default:
return pbgzip_main_usage();
Expand All @@ -92,6 +97,14 @@ main(int argc, char *argv[])

if(argc <= 1) return pbgzip_main_usage();

if(MAX_BLOCK_SIZE < uncompressed_block_size) {
fprintf(stderr, "[pbgzip] -S (%d) was too big; must be less than or equal to %d.\n",
uncompressed_block_size,
MAX_BLOCK_SIZE);
return 1;
}


if(pstdout) {
f_dst = fileno(stdout);
}
Expand Down Expand Up @@ -143,7 +156,7 @@ main(int argc, char *argv[])
return 1;
}

pbgzf_main(f_src, f_dst, compress, compress_level, compress_type, queue_size, n_threads);
pbgzf_main(f_src, f_dst, compress, compress_level, compress_type, queue_size, n_threads, uncompressed_block_size);

if(!pstdout) unlink(argv[optind]);

Expand Down
10 changes: 5 additions & 5 deletions reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@
#include "pbgzf.h"
#include "reader.h"

static const int WINDOW_SIZE = MAX_BLOCK_SIZE;

reader_t*
reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool)
reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool, int32_t uncompressed_block_size)
{
reader_t *r = calloc(1, sizeof(reader_t));

Expand All @@ -23,6 +21,8 @@ reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool)
}
else {
r->fd_file = fd;
if (-1 == uncompressed_block_size) r->uncompressed_block_size = MAX_BLOCK_SIZE;
else r->uncompressed_block_size = uncompressed_block_size;
}
r->input = input;
r->compress = compress;
Expand Down Expand Up @@ -117,7 +117,7 @@ reader_run(void *arg)
}
}
else {
if((b->block_length = read(r->fd_file, b->buffer, WINDOW_SIZE)) < 0) {
if((b->block_length = read(r->fd_file, b->buffer, r->uncompressed_block_size)) < 0) {
fprintf(stderr, "reader read: bug encountered\n");
exit(1);
}
Expand Down Expand Up @@ -183,7 +183,7 @@ reader_run(void *arg)
}
}
else {
if((b->block_length = read(r->fd_file, b->buffer, WINDOW_SIZE)) < 0) {
if((b->block_length = read(r->fd_file, b->buffer, r->uncompressed_block_size)) < 0) {
fprintf(stderr, "reader read: bug encountered\n");
exit(1);
}
Expand Down
3 changes: 2 additions & 1 deletion reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ typedef struct {
uint8_t is_closed;
uint8_t compress;
block_pool_t *pool;
int32_t uncompressed_block_size; // when read uncompressed data
} reader_t;

reader_t*
reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool);
reader_init(int fd, queue_t *input, uint8_t compress, block_pool_t *pool, int32_t uncompressed_block_size);

void*
reader_run(void *arg);
Expand Down

0 comments on commit a714fcd

Please sign in to comment.