Skip to content

Commit

Permalink
prepare new release 1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
c-zhou committed Sep 24, 2024
1 parent 64cc27a commit 3cb1d28
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 57 deletions.
26 changes: 13 additions & 13 deletions cov.c
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ double calc_avg_cov(cov_t *cov, sdict_t *sdict, double q_drop)
return avg_cov;
}

cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q_drop)
cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, double q_drop)
{
size_t i, j, k, s;
int32_t c;
Expand All @@ -424,12 +424,12 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q
n = sdict->n;
m = 0;
for (i = 0; i < n; ++i)
m += div_ceil(sdict->s[i].len, window);
m += div_ceil(sdict->s[i].len, COV_NORM_WINDOW);
norm_a = (double *) calloc(m, sizeof(double));
norm = (double **) calloc(n, sizeof(double *));
norm[0] = norm_a;
for (i = 1; i < n; ++i)
norm[i] = norm[i-1] + div_ceil(sdict->s[i-1].len, window);
norm[i] = norm[i-1] + div_ceil(sdict->s[i-1].len, COV_NORM_WINDOW);

// calculate average base coverage
avg_cov = calc_avg_cov(cov, sdict, q_drop);
Expand All @@ -441,16 +441,16 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q
for (j = 0; j < s; ++j) {
p1 = (uint32_t) (a[j] >> 32);
if (p1 > p) {
if (p1 - p < window - w) {
if (p1 - p < COV_NORM_WINDOW - w) {
c1 += (int64_t) c * (p1 - p);
w += p1 - p;
p = p1;
} else {
while (p1 > p) {
if (p1 - p >= window - w) {
c1 += (int64_t) c * (window - w);
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * window / c1 : DBL_MAX);
p += window - w;
if (p1 - p >= COV_NORM_WINDOW - w) {
c1 += (int64_t) c * (COV_NORM_WINDOW - w);
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * COV_NORM_WINDOW / c1 : DBL_MAX);
p += COV_NORM_WINDOW - w;
c1 = 0;
w = 0;
} else {
Expand All @@ -466,9 +466,9 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q

p1 = sdict->s[i].len;
while (p1 > p) {
if (p1 - p >= window - w) {
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * window / c1 : DBL_MAX);
p += window - w;
if (p1 - p >= COV_NORM_WINDOW - w) {
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * COV_NORM_WINDOW / c1 : DBL_MAX);
p += COV_NORM_WINDOW - w;
} else {
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * (p1 - p + w) / c1 : DBL_MAX);
p = p1;
Expand All @@ -480,12 +480,12 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q
if (w > 0)
norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * w / c1 : DBL_MAX);

assert(k == div_ceil(p1, window));
assert(k == div_ceil(p1, COV_NORM_WINDOW));
}

cov_norm = (cov_norm_t *) malloc(sizeof(cov_norm_t));
cov_norm->n = m;
cov_norm->w = window;
cov_norm->w = COV_NORM_WINDOW;
cov_norm->norm = norm;

return cov_norm;
Expand Down
4 changes: 3 additions & 1 deletion cov.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ typedef struct {
double **norm;
} cov_norm_t;

#define COV_NORM_WINDOW 1000

#ifdef __cplusplus
extern "C" {
#endif
Expand All @@ -64,7 +66,7 @@ uint64_t pos_compression(cov_t *cov);
cov_t *bam_cstats(const char *bam, sdict_t *sdict, int match_header);
cov_t *bed_cstats(const char *bed, sdict_t *sdict);
double calc_avg_cov(cov_t *cov, sdict_t *sdict, double q_drop);
cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q_drop);
cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, double q_drop);
void print_cov_in_bed(cov_t *cov, sdict_t *sdict, FILE *fo);

#ifdef __cplusplus
Expand Down
19 changes: 10 additions & 9 deletions link.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "sdict.h"
#include "enzyme.h"
#include "link.h"
#include "cov.h"
#include "asset.h"

#undef DEBUG_NOISE
Expand Down Expand Up @@ -639,7 +640,7 @@ inter_link_mat_t *inter_link_mat_from_file(const char *f, cov_norm_t *cov_norm,
return link_mat;
}

cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window)
cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict)
{
uint32_t i;
uint64_t m, n;
Expand Down Expand Up @@ -667,13 +668,13 @@ cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window)
norm = (double **) malloc(sizeof(double *) * dict->n);
norm[0] = norm_a;
for (i = 1; i < dict->n; ++i)
norm[i] = norm[i-1] + div_ceil(dict->s[i-1].len, window);
norm[i] = norm[i-1] + div_ceil(dict->s[i-1].len, COV_NORM_WINDOW);

fclose(fp);

cov_norm = (cov_norm_t *) malloc(sizeof(cov_norm_t));
cov_norm->n = n;
cov_norm->w = window;
cov_norm->w = COV_NORM_WINDOW;
cov_norm->norm = norm;

return cov_norm;
Expand Down Expand Up @@ -1266,7 +1267,7 @@ static int parse_bam_rec1(bam1_t *b, bam_header_t *h, char **cname0, int32_t *s0
return (b->core.flag & 0x40)? 1 : 0;
}

void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out)
void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out)
{
bamFile fp;
FILE *fo;
Expand Down Expand Up @@ -1511,7 +1512,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8
fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c);

// cov_t *cov = bam_cstats(f, dict, 0);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop);
fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo);
fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo);
cov_destroy(cov);
Expand All @@ -1521,7 +1522,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8
fclose(fo);
}

void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out)
void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out)
{
FILE *fp, *fo;
int fd;
Expand Down Expand Up @@ -1674,7 +1675,7 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8
fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c);

// cov_t *cov = bed_cstats(f, dict);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop);
fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo);
fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo);
cov_destroy(cov);
Expand All @@ -1684,7 +1685,7 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8
fclose(fo);
}

void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, uint32_t wd, double q_drop, const char *out)
void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, double q_drop, const char *out)
{
FILE *fp, *fo;
int fd;
Expand Down Expand Up @@ -1817,7 +1818,7 @@ void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8
fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c);

// cov_t *cov = bed_cstats(f, dict);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop);
cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop);
fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo);
fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo);
cov_destroy(cov);
Expand Down
8 changes: 4 additions & 4 deletions link.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ intra_link_mat_t *intra_link_mat_init(void *dict, uint32_t resolution, int use_g
inter_link_mat_t *inter_link_mat_init(asm_dict_t *dict, uint32_t resolution, uint32_t radius);
intra_link_mat_t *intra_link_mat_from_file(const char *f, cov_norm_t *cov_norm, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq, uint8_t mq);
inter_link_mat_t *inter_link_mat_from_file(const char *f, cov_norm_t *cov_norm, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius, uint8_t mq);
cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window);
cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict);
intra_link_t *get_intra_link(intra_link_mat_t *link_mat, uint32_t i, uint32_t j);
inter_link_t *get_inter_link(inter_link_mat_t *link_mat, uint32_t i, uint32_t j);
norm_t *calc_norms(intra_link_mat_t *link_mat, uint32_t d_min_cell, double d_mass_frac);
Expand All @@ -107,9 +107,9 @@ void norm_destroy(norm_t *norm);
double *get_max_inter_norms(inter_link_mat_t *link_mat, asm_dict_t *dict);
int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict, uint8_t mq);
void calc_link_directs(inter_link_mat_t *link_mat, double min_norm, asm_dict_t *dict, int8_t *directs);
void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out);
void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out);
void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, uint32_t wd, double q_drop, const char *out);
void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out);
void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out);
void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, double q_drop, const char *out);
long estimate_intra_link_mat_init_rss(void *dict, uint32_t resolution, int use_gap_seq);
long estimate_inter_link_mat_init_rss(asm_dict_t *dict, uint32_t resolution, uint32_t radius);
#ifdef __cplusplus
Expand Down
65 changes: 35 additions & 30 deletions yahs.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,
fprintf(stderr, "[I::%s] RAM limit: %.3fGB\n", __func__, (double) rss_limit / GB);
fprintf(stderr, "[I::%s] RAM required: %.3fGB\n", __func__, (double) rss_intra / GB);
re = ENOMEM_ERR;
goto scaff_done;
goto scaff_failed_0;
}
rss_limit -= rss_intra;
fprintf(stderr, "[I::%s] starting norm estimation...\n", __func__);
Expand All @@ -213,9 +213,8 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,
norm_t *norm = calc_norms(intra_link_mat, d_min_cell, d_mass_frac);
if (norm == 0) {
fprintf(stderr, "[W::%s] No enough bands for norm calculation... End of scaffolding round.\n", __func__);
intra_link_mat_destroy(intra_link_mat);
re = ENOBND_ERR;
goto scaff_done;
goto scaff_failed_1;
}

rss_inter = no_mem_check? 0 : estimate_inter_link_mat_init_rss(dict, resolution, norm->r);
Expand All @@ -225,7 +224,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,
fprintf(stderr, "[I::%s] RAM limit: %.3fGB\n", __func__, (double) rss_limit / GB);
fprintf(stderr, "[I::%s] RAM required: %.3fGB\n", __func__, (double) rss_inter / GB);
re = ENOMEM_ERR;
goto scaff_done;
goto scaff_failed_1;
}
rss_limit -= rss_inter;
fprintf(stderr, "[I::%s] starting link estimation...\n", __func__);
Expand All @@ -242,8 +241,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,
double la;
re = inter_link_norms(inter_link_mat, norm, 1, max_noise_ratio, &la);
if (re) {
inter_link_mat_destroy(inter_link_mat);
goto scaff_done;
goto scaff_failed_2;
}

int8_t *directs = 0;
Expand Down Expand Up @@ -310,7 +308,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,

asm_dict_t *d = make_asm_dict_from_graph(g, g->sdict);
// write scaffolds to AGP file
FILE *agp_out;
FILE *agp_out = NULL;
if (out) {
char *agp_out_name = (char *) malloc(strlen(out) + 5);
sprintf(agp_out_name, "%s.agp", out);
Expand All @@ -321,19 +319,23 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm,
}
if (agp_out == NULL) {
fprintf(stderr, "[E::%s] fail to open file to write\n", __func__);
return 0;
re = 1;
} else {
write_asm_dict_to_agp(d, agp_out);
fclose(agp_out);
}
write_asm_dict_to_agp(d, agp_out);
fclose(agp_out);


asm_destroy(d);
graph_destroy(g);

intra_link_mat_destroy(intra_link_mat);
scaff_failed_2:
inter_link_mat_destroy(inter_link_mat);

scaff_failed_1:
norm_destroy(norm);
graph_destroy(g);
scaff_done:
intra_link_mat_destroy(intra_link_mat);

scaff_failed_0:
asm_destroy(dict);
sd_destroy(sdict);

Expand Down Expand Up @@ -446,7 +448,7 @@ static void print_asm_stats(uint64_t *n_stats, uint32_t *l_stats, int all)
#endif
}

int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uint32_t wd, char *out, int *resolutions, int nr, int rr, re_cuts_t *re_cuts, int8_t *telo_ends, uint32_t d_min_cell, double d_mass_frac, int no_contig_ec, int no_scaffold_ec, int no_mem_check)
int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, char *out, int *resolutions, int nr, int rr, re_cuts_t *re_cuts, int8_t *telo_ends, uint32_t d_min_cell, double d_mass_frac, int no_contig_ec, int no_scaffold_ec, int no_mem_check)
{
int ec_round, resolution, re, r, rn, rc, ex;
uint64_t n50;
Expand All @@ -468,8 +470,6 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uin
out_fn = (char *) malloc(strlen(out) + 35);
out_agp = (char *) malloc(strlen(out) + 35);
out_agp_break = (char *) malloc(strlen(out) + 35);

cov_norm = cov_norm_from_file(link_file, sdict, wd);

if (no_contig_ec == 0) {
#ifdef DEBUG
Expand Down Expand Up @@ -521,6 +521,8 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uin
print_asm_stats(n_stats, l_stats, 1);
asm_destroy(dict);

cov_norm = cov_norm_from_file(link_file, sdict);

r = rc = 0;
rn = rr;
ex = max_extra_try;
Expand Down Expand Up @@ -696,22 +698,27 @@ static void print_help(FILE *fp_help, int is_long_help)
fprintf(fp_help, " -e STR restriction enzyme cutting sites [none]\n");
fprintf(fp_help, " -l INT minimum length of a contig to scaffold [0]\n");
fprintf(fp_help, " -q INT minimum mapping quality [10]\n");
fprintf(fp_help, "\n");
fprintf(fp_help, " --no-contig-ec do not do contig error correction\n");
fprintf(fp_help, " --no-scaffold-ec do not do scaffold error correction\n");
fprintf(fp_help, " --no-mem-check do not do memory check at runtime\n");
fprintf(fp_help, " --file-type STR input file type BED|BAM|PA5|BIN, file name extension is ignored if set\n");
fprintf(fp_help, " --read-length read length (required for PA5 format input) [150]\n");
fprintf(fp_help, " --telo-motif STR telomeric sequence motif\n");
if (is_long_help) {
fprintf(fp_help, "\n");
fprintf(fp_help, " --D-min-cells INT minimum number of cells to calculate the distance threshold [30]\n");
fprintf(fp_help, " --D-mass-frac FLOAT fraction of HiC signals to calculate the distance threshold [0.99]\n");
fprintf(fp_help, "\n");
fprintf(fp_help, " --seq-ctype STR AGP output sequence component type [%s]\n", agp_component_type_val(DEFAULT_AGP_SEQ_COMPONENT_TYPE));
fprintf(fp_help, " --gap-ctype STR AGP output gap component type [%s]\n", agp_component_type_val(DEFAULT_AGP_GAP_COMPONENT_TYPE));
fprintf(fp_help, " --gap-link STR AGP output gap linkage evidence [%s]\n", agp_linkage_evidence_val(DEFAULT_AGP_LINKAGE_EVIDENCE));
fprintf(fp_help, " --gap-size INT AGP output gap size between sequence component [%d]\n", DEFAULT_AGP_GAP_SIZE);
fprintf(fp_help, "\n");
fprintf(fp_help, " --convert-to-binary make a binary ouput file from the input and exit\n");
fprintf(fp_help, " --print-telo-motifs print telomeric motifs in the database and exit\n");
}
fprintf(fp_help, " --no-contig-ec do not do contig error correction\n");
fprintf(fp_help, " --no-scaffold-ec do not do scaffold error correction\n");
fprintf(fp_help, " --no-mem-check do not do memory check at runtime\n");
fprintf(fp_help, " --file-type STR input file type BED|BAM|PA5|BIN, file name extension is ignored if set\n");
fprintf(fp_help, " --read-length read length (required for PA5 format input) [150]\n");
fprintf(fp_help, " --telo-motif STR telomeric sequence motif\n");
fprintf(fp_help, "\n");
fprintf(fp_help, " -o STR prefix of output files [yahs.out]\n");
fprintf(fp_help, " -v INT verbose level [%d]\n", VERBOSE);
fprintf(fp_help, " -? print long help with extra option list\n");
Expand Down Expand Up @@ -754,7 +761,6 @@ int main(int argc, char *argv[])
int *resolutions, nr, rr, mq, ml, rl;
int no_contig_ec, no_scaffold_ec, no_mem_check, d_min_cell, print_telomotifs, convert_binary;
int8_t *telo_ends;
uint32_t wd;
double q_drop, d_mass_frac;
enum fileTypes f_type;

Expand All @@ -770,7 +776,6 @@ int main(int argc, char *argv[])
ml = 0;
rl = 150;
ecstr = 0;
wd = 1000;
q_drop = 0.1;
d_min_cell = 30;
d_mass_frac = 0.99;
Expand Down Expand Up @@ -1032,17 +1037,17 @@ int main(int argc, char *argv[])
link_bin_file = malloc(strlen(out) + 5);
sprintf(link_bin_file, "%s.bin", out);
fprintf(stderr, "[I::%s] dump hic links (BAM) to binary file %s\n", __func__, link_bin_file);
dump_links_from_bam_file(link_file, fai, ml, 0, wd, q_drop, link_bin_file);
dump_links_from_bam_file(link_file, fai, ml, 0, q_drop, link_bin_file);
} else if (f_type == BED) {
link_bin_file = malloc(strlen(out) + 5);
sprintf(link_bin_file, "%s.bin", out);
fprintf(stderr, "[I::%s] dump hic links (BED) to binary file %s\n", __func__, link_bin_file);
dump_links_from_bed_file(link_file, fai, ml, 0, wd, q_drop, link_bin_file);
dump_links_from_bed_file(link_file, fai, ml, 0, q_drop, link_bin_file);
} else if (f_type == PA5) {
link_bin_file = malloc(strlen(out) + 5);
sprintf(link_bin_file, "%s.bin", out);
fprintf(stderr, "[I::%s] dump hic links (PA5) to binary file %s\n", __func__, link_bin_file);
dump_links_from_pa5_file(link_file, fai, ml, 0, rl, wd, q_drop, link_bin_file);
dump_links_from_pa5_file(link_file, fai, ml, 0, rl, q_drop, link_bin_file);
} else if (f_type == BIN) {
if (convert_binary) {
fprintf(stderr, "[E::%s] Input is already in BIN format\n", __func__);
Expand Down Expand Up @@ -1083,7 +1088,7 @@ int main(int argc, char *argv[])

if (convert_binary) goto final_clean;

ret = run_yahs(fai, agp, link_bin_file, ml, mq8, wd, out, resolutions, nr, rr, re_cuts, telo_ends, d_min_cell, d_mass_frac, no_contig_ec, no_scaffold_ec, no_mem_check);
ret = run_yahs(fai, agp, link_bin_file, ml, mq8, out, resolutions, nr, rr, re_cuts, telo_ends, d_min_cell, d_mass_frac, no_contig_ec, no_scaffold_ec, no_mem_check);

if (ret == 0) {
agp_final = (char *) malloc(strlen(out) + 35);
Expand Down

0 comments on commit 3cb1d28

Please sign in to comment.