diff --git a/README.md b/README.md index b87e74a..2ddee5e 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ You can find more information about mannual editing with Juicebox here [Issue 4] ## Other tools * ***juicer*** is a tool used to quickly generate HiC alignment file required for HiC contact map generation with tools like [Juicebox](https://github.com/aidenlab/Juicebox), [PretextMap](https://github.com/wtsi-hpag/PretextMap) and [Higlass](https://github.com/higlass/higlass) (`juicer pre`). It can be also used to generate AGP and FASTA files after manual editing with Juicebox JBAT (`juicer post`). -* ***agp_to_fasta*** creates a FASTA file from a AGP file. It takes two positional parameters: the AGP file and the contig FASTA file. By default, the output will be directed to `stdout`. You can write to a file with `-o` option. It also allows changing the FASTA line width with `-l` option, which by default is 60. +* ***agp_to_fasta*** creates a FASTA file from a AGP file. It takes two positional parameters: the AGP file and the contig FASTA file. By default, the output will be directed to `stdout`. You can write to a file with `-o` option. It allows changing the FASTA line width with `-l` option, which by default is 60. If the AGP file contains sequence components of unknown orientations ('?', '0' or 'na' identifiers, see [AGP format](https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/)), you will need `-u` option, with which components with unknown orientation are treated as if they had '+' orientation. ## Limitations * In rare cases, YaHS has been seen making telomere-to-telomere false joins. diff --git a/asset.c b/asset.c index 3ac957d..e1bb9fa 100644 --- a/asset.c +++ b/asset.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "asset.h" @@ -47,8 +46,6 @@ double cputime(void) } #ifdef __linux__ -#include -#include void liftrlimit() { struct rlimit r; @@ -78,6 +75,10 @@ double realtime(void) return tp.tv_sec + tp.tv_usec * 1e-6; } +#if defined CTL_HW && defined HW_USERMEM +#include +#endif + long physmem_total(void) { #if defined _SC_PHYS_PAGES && defined _SC_PAGESIZE diff --git a/kseq.h b/kseq.h index 4de1acb..c7cfcd5 100644 --- a/kseq.h +++ b/kseq.h @@ -31,6 +31,7 @@ #include #include #include +#include #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' ' @@ -84,7 +85,7 @@ } \ return (int)ks->buf[ks->begin++]; \ } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static inline int64_t ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { return ks_getuntil2(ks, delimiter, str, dret, 0); } \ static inline int ks_seek(kstream_t *ks, long int offset, int whence) \ { \ @@ -104,10 +105,12 @@ typedef struct __kstring_t { #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif - +#ifndef kroundup64 +#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x)) +#endif #define __KS_GETUNTIL(SCOPE, __read) \ - SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + SCOPE int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ str->l = append? str->l : 0; \ @@ -137,7 +140,7 @@ typedef struct __kstring_t { } else i = 0; /* never come to here! */ \ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ + kroundup64(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ @@ -168,7 +171,7 @@ typedef struct __kstring_t { #define KSTREAM_DECLARE(type_t, __read) \ __KS_TYPE(type_t) \ - extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ + extern int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ extern kstream_t *ks_init(type_t f); \ extern void ks_destroy(kstream_t *ks); \ __KS_INLINED(__read) @@ -200,7 +203,7 @@ typedef struct __kstring_t { -2 truncated quality string */ #define __KSEQ_READ(SCOPE) \ - SCOPE int kseq_read(kseq_t *seq) \ + SCOPE int64_t kseq_read(kseq_t *seq) \ { \ int c; \ kstream_t *ks = seq->f; \ @@ -224,7 +227,7 @@ typedef struct __kstring_t { if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + kroundup64(seq->seq.m); /* rounded to the next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ @@ -261,6 +264,6 @@ typedef struct __kstring_t { __KSEQ_TYPE(type_t) \ extern kseq_t *kseq_init(type_t fd); \ void kseq_destroy(kseq_t *ks); \ - int kseq_read(kseq_t *seq); + int64_t kseq_read(kseq_t *seq); #endif diff --git a/sdict.c b/sdict.c index 3f84ee7..cf980e1 100644 --- a/sdict.c +++ b/sdict.c @@ -166,7 +166,8 @@ void sd_hash(sdict_t *d) sdict_t *make_sdict_from_fa(const char *f, uint32_t min_len) { - int fd, l; + int fd; + int64_t l; gzFile fp; kseq_t *ks; void *ko = 0; @@ -181,11 +182,14 @@ sdict_t *make_sdict_from_fa(const char *f, uint32_t min_len) sdict_t *d; d = sd_init(); - // WARNING: might introduce bug here - // l is int type by definition in kseq.h - while ((l = kseq_read(ks)) >= 0) + while ((l = kseq_read(ks)) >= 0) { + if (l > UINT32_MAX) { + fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, ks->name.s, l); + exit(EXIT_FAILURE); + } if (strlen(ks->seq.s) >= min_len) sd_put1(d, ks->name.s, ks->seq.s, strlen(ks->seq.s)); + } kseq_destroy(ks); gzclose(fp); @@ -201,7 +205,7 @@ sdict_t *make_sdict_from_index(const char *f, uint32_t min_len) size_t ln = 0; ssize_t read; char name[4096]; - uint32_t len; + int64_t len; fp = fopen(f, "r"); if (fp == NULL) { @@ -212,7 +216,11 @@ sdict_t *make_sdict_from_index(const char *f, uint32_t min_len) sdict_t *d; d = sd_init(); while ((read = getline(&line, &ln, fp)) != -1) { - sscanf(line, "%s %u", name, &len); + sscanf(line, "%s %ld", name, &len); + if (len > UINT32_MAX) { + fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, name, len); + exit(EXIT_FAILURE); + } if (len >= min_len) sd_put(d, name, len); } @@ -227,7 +235,7 @@ sdict_t *make_sdict_from_gfa(const char *f, uint32_t min_len) size_t ln = 0; ssize_t read; char name[4096], lens[4096]; - uint32_t len; + uint64_t len; fp = fopen(f, "r"); if (fp == NULL) { @@ -241,6 +249,10 @@ sdict_t *make_sdict_from_gfa(const char *f, uint32_t min_len) if (line[0] == 'S') { sscanf(line, "%*s %s %*s %s", name, lens); len = strtoul(lens + 5, NULL, 10); + if (len > UINT32_MAX) { + fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, name, len); + exit(EXIT_FAILURE); + } if (len >= min_len) sd_put(d, name, len); } @@ -593,7 +605,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li char sname[256], type[4], cname[256], cstarts[16], cends[16], oris[16]; char *name = NULL; uint32_t c; - uint64_t i, l, cstart, cend; + int64_t i, l, cstart, cend, ns, L; agp_in = fopen(agp, "r"); if (agp_in == NULL) { @@ -602,12 +614,12 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li } dict = make_sdict_from_fa(fa, 0); - l = 0; + l = L = ns = 0; while ((read = getline(&line, &ln, agp_in)) != -1) { - sscanf(line, "%s %*s %*s %*s %s %s %s %s %s", sname, type, cname, cstarts, cends, oris); - if (!strncmp(sname, "#", 1)) + if (!strncmp(line, "#", 1)) // header lines continue; + sscanf(line, "%s %*s %*s %*s %s %s %s %s %s", sname, type, cname, cstarts, cends, oris); if (!strncmp(type, "N", 1) || !strncmp(type, "U", 1)) { cend = strtoul(cname, NULL, 10); for (i = 0; i < cend; ++i) { @@ -623,6 +635,8 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li l = 0; } if (strcmp(sname, name)) { + ++ns; + L += l; free(name); name = strdup(sname); if (l % line_wd != 0) @@ -639,6 +653,12 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li exit(EXIT_FAILURE); } s = dict->s[c]; + if (cstart < 1 || cstart > cend || cend > s.len) { + fprintf(stderr, "[E::%s] invalid sequence component %s:%ld-%ld on %s\n", __func__, cname, cstart, cend, sname); + if (cend > s.len) + fprintf(stderr, "[E::%s] sequence end position (%ld) greater than sequence length (%u)\n", __func__, cend, s.len); + exit(EXIT_FAILURE); + } if (!strncmp(oris, "+", 1) || (un_oris && (!strncmp(oris, "?", 1) || !strncmp(oris, "0", 1) || !strncmp(oris, "na", 2)))) { // forward for (i = cstart - 1; i < cend; ++i) { @@ -654,7 +674,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li fputc('\n', fo); } } else { - fprintf(stderr, "[E::%s] unknown orientation of sequence component %s:%lu-%lu on %s: '%s'\n", __func__, cname, cstart, cend, sname, oris); + fprintf(stderr, "[E::%s] unknown orientation of sequence component %s:%ld-%ld on %s: '%s'\n", __func__, cname, cstart, cend, sname, oris); if (un_oris) { fprintf(stderr, "[E::%s] valid identifiers for unorientated sequence include: '?', '0' and 'na'\n", __func__); fprintf(stderr, "[E::%s] see https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/#FORMAT\n", __func__); @@ -664,6 +684,11 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li } if (l % line_wd != 0) fputc('\n', fo); + ++ns; + L += l; + fprintf(stderr, "[I::%s] Number sequences: %ld\n", __func__, ns); + fprintf(stderr, "[I::%s] Number bases: %ld\n", __func__, L); + free(name); free(dict);