Skip to content

Commit

Permalink
support >4G sequence component
Browse files Browse the repository at this point in the history
  • Loading branch information
Chenxi Zhou committed Jul 5, 2022
1 parent 6332e30 commit a80b3de
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 24 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ You can find more information about mannual editing with Juicebox here [Issue 4]

## Other tools
* ***juicer*** is a tool used to quickly generate HiC alignment file required for HiC contact map generation with tools like [Juicebox](https://github.com/aidenlab/Juicebox), [PretextMap](https://github.com/wtsi-hpag/PretextMap) and [Higlass](https://github.com/higlass/higlass) (`juicer pre`). It can be also used to generate AGP and FASTA files after manual editing with Juicebox JBAT (`juicer post`).
* ***agp_to_fasta*** creates a FASTA file from a AGP file. It takes two positional parameters: the AGP file and the contig FASTA file. By default, the output will be directed to `stdout`. You can write to a file with `-o` option. It also allows changing the FASTA line width with `-l` option, which by default is 60.
* ***agp_to_fasta*** creates a FASTA file from a AGP file. It takes two positional parameters: the AGP file and the contig FASTA file. By default, the output will be directed to `stdout`. You can write to a file with `-o` option. It allows changing the FASTA line width with `-l` option, which by default is 60. If the AGP file contains sequence components of unknown orientations ('?', '0' or 'na' identifiers, see [AGP format](https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/)), you will need `-u` option, with which components with unknown orientation are treated as if they had '+' orientation.

## Limitations
* In rare cases, YaHS has been seen making telomere-to-telomere false joins.
Expand Down
7 changes: 4 additions & 3 deletions asset.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
#include <unistd.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/sysctl.h>

#include "asset.h"

Expand All @@ -47,8 +46,6 @@ double cputime(void)
}

#ifdef __linux__
#include <sys/resource.h>
#include <sys/time.h>
void liftrlimit()
{
struct rlimit r;
Expand Down Expand Up @@ -78,6 +75,10 @@ double realtime(void)
return tp.tv_sec + tp.tv_usec * 1e-6;
}

#if defined CTL_HW && defined HW_USERMEM
#include <sys/sysctl.h>
#endif

long physmem_total(void)
{
#if defined _SC_PHYS_PAGES && defined _SC_PAGESIZE
Expand Down
19 changes: 11 additions & 8 deletions kseq.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>

#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
Expand Down Expand Up @@ -84,7 +85,7 @@
} \
return (int)ks->buf[ks->begin++]; \
} \
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
static inline int64_t ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ return ks_getuntil2(ks, delimiter, str, dret, 0); } \
static inline int ks_seek(kstream_t *ks, long int offset, int whence) \
{ \
Expand All @@ -104,10 +105,12 @@ typedef struct __kstring_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif


#ifndef kroundup64
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
#endif

#define __KS_GETUNTIL(SCOPE, __read) \
SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
SCOPE int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
Expand Down Expand Up @@ -137,7 +140,7 @@ typedef struct __kstring_t {
} else i = 0; /* never come to here! */ \
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
kroundup64(str->m); \
str->s = (char*)realloc(str->s, str->m); \
} \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
Expand Down Expand Up @@ -168,7 +171,7 @@ typedef struct __kstring_t {

#define KSTREAM_DECLARE(type_t, __read) \
__KS_TYPE(type_t) \
extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
extern int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
extern kstream_t *ks_init(type_t f); \
extern void ks_destroy(kstream_t *ks); \
__KS_INLINED(__read)
Expand Down Expand Up @@ -200,7 +203,7 @@ typedef struct __kstring_t {
-2 truncated quality string
*/
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
SCOPE int64_t kseq_read(kseq_t *seq) \
{ \
int c; \
kstream_t *ks = seq->f; \
Expand All @@ -224,7 +227,7 @@ typedef struct __kstring_t {
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
kroundup64(seq->seq.m); /* rounded to the next closest 2^k */ \
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
Expand Down Expand Up @@ -261,6 +264,6 @@ typedef struct __kstring_t {
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
int64_t kseq_read(kseq_t *seq);

#endif
49 changes: 37 additions & 12 deletions sdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ void sd_hash(sdict_t *d)

sdict_t *make_sdict_from_fa(const char *f, uint32_t min_len)
{
int fd, l;
int fd;
int64_t l;
gzFile fp;
kseq_t *ks;
void *ko = 0;
Expand All @@ -181,11 +182,14 @@ sdict_t *make_sdict_from_fa(const char *f, uint32_t min_len)

sdict_t *d;
d = sd_init();
// WARNING: might introduce bug here
// l is int type by definition in kseq.h
while ((l = kseq_read(ks)) >= 0)
while ((l = kseq_read(ks)) >= 0) {
if (l > UINT32_MAX) {
fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, ks->name.s, l);
exit(EXIT_FAILURE);
}
if (strlen(ks->seq.s) >= min_len)
sd_put1(d, ks->name.s, ks->seq.s, strlen(ks->seq.s));
}

kseq_destroy(ks);
gzclose(fp);
Expand All @@ -201,7 +205,7 @@ sdict_t *make_sdict_from_index(const char *f, uint32_t min_len)
size_t ln = 0;
ssize_t read;
char name[4096];
uint32_t len;
int64_t len;

fp = fopen(f, "r");
if (fp == NULL) {
Expand All @@ -212,7 +216,11 @@ sdict_t *make_sdict_from_index(const char *f, uint32_t min_len)
sdict_t *d;
d = sd_init();
while ((read = getline(&line, &ln, fp)) != -1) {
sscanf(line, "%s %u", name, &len);
sscanf(line, "%s %ld", name, &len);
if (len > UINT32_MAX) {
fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, name, len);
exit(EXIT_FAILURE);
}
if (len >= min_len)
sd_put(d, name, len);
}
Expand All @@ -227,7 +235,7 @@ sdict_t *make_sdict_from_gfa(const char *f, uint32_t min_len)
size_t ln = 0;
ssize_t read;
char name[4096], lens[4096];
uint32_t len;
uint64_t len;

fp = fopen(f, "r");
if (fp == NULL) {
Expand All @@ -241,6 +249,10 @@ sdict_t *make_sdict_from_gfa(const char *f, uint32_t min_len)
if (line[0] == 'S') {
sscanf(line, "%*s %s %*s %s", name, lens);
len = strtoul(lens + 5, NULL, 10);
if (len > UINT32_MAX) {
fprintf(stderr, "[E::%s] >4G sequence chunks are not supported: %s [%ld]\n", __func__, name, len);
exit(EXIT_FAILURE);
}
if (len >= min_len)
sd_put(d, name, len);
}
Expand Down Expand Up @@ -593,7 +605,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
char sname[256], type[4], cname[256], cstarts[16], cends[16], oris[16];
char *name = NULL;
uint32_t c;
uint64_t i, l, cstart, cend;
int64_t i, l, cstart, cend, ns, L;

agp_in = fopen(agp, "r");
if (agp_in == NULL) {
Expand All @@ -602,12 +614,12 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
}

dict = make_sdict_from_fa(fa, 0);
l = 0;
l = L = ns = 0;
while ((read = getline(&line, &ln, agp_in)) != -1) {
sscanf(line, "%s %*s %*s %*s %s %s %s %s %s", sname, type, cname, cstarts, cends, oris);
if (!strncmp(sname, "#", 1))
if (!strncmp(line, "#", 1))
// header lines
continue;
sscanf(line, "%s %*s %*s %*s %s %s %s %s %s", sname, type, cname, cstarts, cends, oris);
if (!strncmp(type, "N", 1) || !strncmp(type, "U", 1)) {
cend = strtoul(cname, NULL, 10);
for (i = 0; i < cend; ++i) {
Expand All @@ -623,6 +635,8 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
l = 0;
}
if (strcmp(sname, name)) {
++ns;
L += l;
free(name);
name = strdup(sname);
if (l % line_wd != 0)
Expand All @@ -639,6 +653,12 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
exit(EXIT_FAILURE);
}
s = dict->s[c];
if (cstart < 1 || cstart > cend || cend > s.len) {
fprintf(stderr, "[E::%s] invalid sequence component %s:%ld-%ld on %s\n", __func__, cname, cstart, cend, sname);
if (cend > s.len)
fprintf(stderr, "[E::%s] sequence end position (%ld) greater than sequence length (%u)\n", __func__, cend, s.len);
exit(EXIT_FAILURE);
}
if (!strncmp(oris, "+", 1) || (un_oris && (!strncmp(oris, "?", 1) || !strncmp(oris, "0", 1) || !strncmp(oris, "na", 2)))) {
// forward
for (i = cstart - 1; i < cend; ++i) {
Expand All @@ -654,7 +674,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
fputc('\n', fo);
}
} else {
fprintf(stderr, "[E::%s] unknown orientation of sequence component %s:%lu-%lu on %s: '%s'\n", __func__, cname, cstart, cend, sname, oris);
fprintf(stderr, "[E::%s] unknown orientation of sequence component %s:%ld-%ld on %s: '%s'\n", __func__, cname, cstart, cend, sname, oris);
if (un_oris) {
fprintf(stderr, "[E::%s] valid identifiers for unorientated sequence include: '?', '0' and 'na'\n", __func__);
fprintf(stderr, "[E::%s] see https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/#FORMAT\n", __func__);
Expand All @@ -664,6 +684,11 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li
}
if (l % line_wd != 0)
fputc('\n', fo);
++ns;
L += l;
fprintf(stderr, "[I::%s] Number sequences: %ld\n", __func__, ns);
fprintf(stderr, "[I::%s] Number bases: %ld\n", __func__, L);

free(name);
free(dict);

Expand Down

0 comments on commit a80b3de

Please sign in to comment.