From 3cb1d28e3aaa0592c32fdab9e369eba28d643345 Mon Sep 17 00:00:00 2001 From: Chenxi Zhou Date: Tue, 24 Sep 2024 17:20:50 +0100 Subject: [PATCH] prepare new release 1.2 --- cov.c | 26 +++++++++++------------ cov.h | 4 +++- link.c | 19 +++++++++-------- link.h | 8 ++++---- yahs.c | 65 +++++++++++++++++++++++++++++++--------------------------- 5 files changed, 65 insertions(+), 57 deletions(-) diff --git a/cov.c b/cov.c index f49f6e3..dd5e5d5 100644 --- a/cov.c +++ b/cov.c @@ -411,7 +411,7 @@ double calc_avg_cov(cov_t *cov, sdict_t *sdict, double q_drop) return avg_cov; } -cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q_drop) +cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, double q_drop) { size_t i, j, k, s; int32_t c; @@ -424,12 +424,12 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q n = sdict->n; m = 0; for (i = 0; i < n; ++i) - m += div_ceil(sdict->s[i].len, window); + m += div_ceil(sdict->s[i].len, COV_NORM_WINDOW); norm_a = (double *) calloc(m, sizeof(double)); norm = (double **) calloc(n, sizeof(double *)); norm[0] = norm_a; for (i = 1; i < n; ++i) - norm[i] = norm[i-1] + div_ceil(sdict->s[i-1].len, window); + norm[i] = norm[i-1] + div_ceil(sdict->s[i-1].len, COV_NORM_WINDOW); // calculate average base coverage avg_cov = calc_avg_cov(cov, sdict, q_drop); @@ -441,16 +441,16 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q for (j = 0; j < s; ++j) { p1 = (uint32_t) (a[j] >> 32); if (p1 > p) { - if (p1 - p < window - w) { + if (p1 - p < COV_NORM_WINDOW - w) { c1 += (int64_t) c * (p1 - p); w += p1 - p; p = p1; } else { while (p1 > p) { - if (p1 - p >= window - w) { - c1 += (int64_t) c * (window - w); - norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * window / c1 : DBL_MAX); - p += window - w; + if (p1 - p >= COV_NORM_WINDOW - w) { + c1 += (int64_t) c * (COV_NORM_WINDOW - w); + norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * COV_NORM_WINDOW / c1 : DBL_MAX); + p += COV_NORM_WINDOW - w; c1 = 0; w = 0; } else { @@ -466,9 +466,9 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q p1 = sdict->s[i].len; while (p1 > p) { - if (p1 - p >= window - w) { - norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * window / c1 : DBL_MAX); - p += window - w; + if (p1 - p >= COV_NORM_WINDOW - w) { + norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * COV_NORM_WINDOW / c1 : DBL_MAX); + p += COV_NORM_WINDOW - w; } else { norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * (p1 - p + w) / c1 : DBL_MAX); p = p1; @@ -480,12 +480,12 @@ cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q if (w > 0) norm[i][k++] = MIN(max_cov_scale, c1 > 0? avg_cov * w / c1 : DBL_MAX); - assert(k == div_ceil(p1, window)); + assert(k == div_ceil(p1, COV_NORM_WINDOW)); } cov_norm = (cov_norm_t *) malloc(sizeof(cov_norm_t)); cov_norm->n = m; - cov_norm->w = window; + cov_norm->w = COV_NORM_WINDOW; cov_norm->norm = norm; return cov_norm; diff --git a/cov.h b/cov.h index dce2dbb..aece02e 100644 --- a/cov.h +++ b/cov.h @@ -53,6 +53,8 @@ typedef struct { double **norm; } cov_norm_t; +#define COV_NORM_WINDOW 1000 + #ifdef __cplusplus extern "C" { #endif @@ -64,7 +66,7 @@ uint64_t pos_compression(cov_t *cov); cov_t *bam_cstats(const char *bam, sdict_t *sdict, int match_header); cov_t *bed_cstats(const char *bed, sdict_t *sdict); double calc_avg_cov(cov_t *cov, sdict_t *sdict, double q_drop); -cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, uint32_t window, double q_drop); +cov_norm_t *calc_cov_norms(cov_t *cov, sdict_t *sdict, double q_drop); void print_cov_in_bed(cov_t *cov, sdict_t *sdict, FILE *fo); #ifdef __cplusplus diff --git a/link.c b/link.c index 572f3c9..8d8428c 100644 --- a/link.c +++ b/link.c @@ -40,6 +40,7 @@ #include "sdict.h" #include "enzyme.h" #include "link.h" +#include "cov.h" #include "asset.h" #undef DEBUG_NOISE @@ -639,7 +640,7 @@ inter_link_mat_t *inter_link_mat_from_file(const char *f, cov_norm_t *cov_norm, return link_mat; } -cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window) +cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict) { uint32_t i; uint64_t m, n; @@ -667,13 +668,13 @@ cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window) norm = (double **) malloc(sizeof(double *) * dict->n); norm[0] = norm_a; for (i = 1; i < dict->n; ++i) - norm[i] = norm[i-1] + div_ceil(dict->s[i-1].len, window); + norm[i] = norm[i-1] + div_ceil(dict->s[i-1].len, COV_NORM_WINDOW); fclose(fp); cov_norm = (cov_norm_t *) malloc(sizeof(cov_norm_t)); cov_norm->n = n; - cov_norm->w = window; + cov_norm->w = COV_NORM_WINDOW; cov_norm->norm = norm; return cov_norm; @@ -1266,7 +1267,7 @@ static int parse_bam_rec1(bam1_t *b, bam_header_t *h, char **cname0, int32_t *s0 return (b->core.flag & 0x40)? 1 : 0; } -void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out) +void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out) { bamFile fp; FILE *fo; @@ -1511,7 +1512,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c); // cov_t *cov = bam_cstats(f, dict, 0); - cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop); + cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop); fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo); fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo); cov_destroy(cov); @@ -1521,7 +1522,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fclose(fo); } -void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out) +void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out) { FILE *fp, *fo; int fd; @@ -1674,7 +1675,7 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c); // cov_t *cov = bed_cstats(f, dict); - cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop); + cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop); fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo); fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo); cov_destroy(cov); @@ -1684,7 +1685,7 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 fclose(fo); } -void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, uint32_t wd, double q_drop, const char *out) +void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, double q_drop, const char *out) { FILE *fp, *fo; int fd; @@ -1817,7 +1818,7 @@ void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[I::%s] dumped %lu read pairs from %lu records: %lu intra links + %lu inter links \n", __func__, pair_c, rec_c, intra_c, inter_c); // cov_t *cov = bed_cstats(f, dict); - cov_norm_t *cov_norm = calc_cov_norms(cov, dict, wd, q_drop); + cov_norm_t *cov_norm = calc_cov_norms(cov, dict, q_drop); fwrite(&cov_norm->n, sizeof(uint64_t), 1, fo); fwrite(cov_norm->norm[0], sizeof(double), cov_norm->n, fo); cov_destroy(cov); diff --git a/link.h b/link.h index fa65756..dac4cf5 100644 --- a/link.h +++ b/link.h @@ -90,7 +90,7 @@ intra_link_mat_t *intra_link_mat_init(void *dict, uint32_t resolution, int use_g inter_link_mat_t *inter_link_mat_init(asm_dict_t *dict, uint32_t resolution, uint32_t radius); intra_link_mat_t *intra_link_mat_from_file(const char *f, cov_norm_t *cov_norm, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq, uint8_t mq); inter_link_mat_t *inter_link_mat_from_file(const char *f, cov_norm_t *cov_norm, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius, uint8_t mq); -cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict, uint32_t window); +cov_norm_t *cov_norm_from_file(const char *f, sdict_t *dict); intra_link_t *get_intra_link(intra_link_mat_t *link_mat, uint32_t i, uint32_t j); inter_link_t *get_inter_link(inter_link_mat_t *link_mat, uint32_t i, uint32_t j); norm_t *calc_norms(intra_link_mat_t *link_mat, uint32_t d_min_cell, double d_mass_frac); @@ -107,9 +107,9 @@ void norm_destroy(norm_t *norm); double *get_max_inter_norms(inter_link_mat_t *link_mat, asm_dict_t *dict); int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict, uint8_t mq); void calc_link_directs(inter_link_mat_t *link_mat, double min_norm, asm_dict_t *dict, int8_t *directs); -void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out); -void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t wd, double q_drop, const char *out); -void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, uint32_t wd, double q_drop, const char *out); +void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out); +void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, double q_drop, const char *out); +void dump_links_from_pa5_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, uint32_t rl, double q_drop, const char *out); long estimate_intra_link_mat_init_rss(void *dict, uint32_t resolution, int use_gap_seq); long estimate_inter_link_mat_init_rss(asm_dict_t *dict, uint32_t resolution, uint32_t radius); #ifdef __cplusplus diff --git a/yahs.c b/yahs.c index b21b025..253a3ae 100644 --- a/yahs.c +++ b/yahs.c @@ -198,7 +198,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, fprintf(stderr, "[I::%s] RAM limit: %.3fGB\n", __func__, (double) rss_limit / GB); fprintf(stderr, "[I::%s] RAM required: %.3fGB\n", __func__, (double) rss_intra / GB); re = ENOMEM_ERR; - goto scaff_done; + goto scaff_failed_0; } rss_limit -= rss_intra; fprintf(stderr, "[I::%s] starting norm estimation...\n", __func__); @@ -213,9 +213,8 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, norm_t *norm = calc_norms(intra_link_mat, d_min_cell, d_mass_frac); if (norm == 0) { fprintf(stderr, "[W::%s] No enough bands for norm calculation... End of scaffolding round.\n", __func__); - intra_link_mat_destroy(intra_link_mat); re = ENOBND_ERR; - goto scaff_done; + goto scaff_failed_1; } rss_inter = no_mem_check? 0 : estimate_inter_link_mat_init_rss(dict, resolution, norm->r); @@ -225,7 +224,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, fprintf(stderr, "[I::%s] RAM limit: %.3fGB\n", __func__, (double) rss_limit / GB); fprintf(stderr, "[I::%s] RAM required: %.3fGB\n", __func__, (double) rss_inter / GB); re = ENOMEM_ERR; - goto scaff_done; + goto scaff_failed_1; } rss_limit -= rss_inter; fprintf(stderr, "[I::%s] starting link estimation...\n", __func__); @@ -242,8 +241,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, double la; re = inter_link_norms(inter_link_mat, norm, 1, max_noise_ratio, &la); if (re) { - inter_link_mat_destroy(inter_link_mat); - goto scaff_done; + goto scaff_failed_2; } int8_t *directs = 0; @@ -310,7 +308,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, asm_dict_t *d = make_asm_dict_from_graph(g, g->sdict); // write scaffolds to AGP file - FILE *agp_out; + FILE *agp_out = NULL; if (out) { char *agp_out_name = (char *) malloc(strlen(out) + 5); sprintf(agp_out_name, "%s.agp", out); @@ -321,19 +319,23 @@ int run_scaffolding(char *fai, char *agp, char *link_file, cov_norm_t *cov_norm, } if (agp_out == NULL) { fprintf(stderr, "[E::%s] fail to open file to write\n", __func__); - return 0; + re = 1; + } else { + write_asm_dict_to_agp(d, agp_out); + fclose(agp_out); } - write_asm_dict_to_agp(d, agp_out); - fclose(agp_out); - + asm_destroy(d); + graph_destroy(g); - intra_link_mat_destroy(intra_link_mat); +scaff_failed_2: inter_link_mat_destroy(inter_link_mat); + +scaff_failed_1: norm_destroy(norm); - graph_destroy(g); - -scaff_done: + intra_link_mat_destroy(intra_link_mat); + +scaff_failed_0: asm_destroy(dict); sd_destroy(sdict); @@ -446,7 +448,7 @@ static void print_asm_stats(uint64_t *n_stats, uint32_t *l_stats, int all) #endif } -int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uint32_t wd, char *out, int *resolutions, int nr, int rr, re_cuts_t *re_cuts, int8_t *telo_ends, uint32_t d_min_cell, double d_mass_frac, int no_contig_ec, int no_scaffold_ec, int no_mem_check) +int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, char *out, int *resolutions, int nr, int rr, re_cuts_t *re_cuts, int8_t *telo_ends, uint32_t d_min_cell, double d_mass_frac, int no_contig_ec, int no_scaffold_ec, int no_mem_check) { int ec_round, resolution, re, r, rn, rc, ex; uint64_t n50; @@ -468,8 +470,6 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uin out_fn = (char *) malloc(strlen(out) + 35); out_agp = (char *) malloc(strlen(out) + 35); out_agp_break = (char *) malloc(strlen(out) + 35); - - cov_norm = cov_norm_from_file(link_file, sdict, wd); if (no_contig_ec == 0) { #ifdef DEBUG @@ -521,6 +521,8 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, uin print_asm_stats(n_stats, l_stats, 1); asm_destroy(dict); + cov_norm = cov_norm_from_file(link_file, sdict); + r = rc = 0; rn = rr; ex = max_extra_try; @@ -696,22 +698,27 @@ static void print_help(FILE *fp_help, int is_long_help) fprintf(fp_help, " -e STR restriction enzyme cutting sites [none]\n"); fprintf(fp_help, " -l INT minimum length of a contig to scaffold [0]\n"); fprintf(fp_help, " -q INT minimum mapping quality [10]\n"); + fprintf(fp_help, "\n"); + fprintf(fp_help, " --no-contig-ec do not do contig error correction\n"); + fprintf(fp_help, " --no-scaffold-ec do not do scaffold error correction\n"); + fprintf(fp_help, " --no-mem-check do not do memory check at runtime\n"); + fprintf(fp_help, " --file-type STR input file type BED|BAM|PA5|BIN, file name extension is ignored if set\n"); + fprintf(fp_help, " --read-length read length (required for PA5 format input) [150]\n"); + fprintf(fp_help, " --telo-motif STR telomeric sequence motif\n"); if (is_long_help) { + fprintf(fp_help, "\n"); fprintf(fp_help, " --D-min-cells INT minimum number of cells to calculate the distance threshold [30]\n"); fprintf(fp_help, " --D-mass-frac FLOAT fraction of HiC signals to calculate the distance threshold [0.99]\n"); + fprintf(fp_help, "\n"); fprintf(fp_help, " --seq-ctype STR AGP output sequence component type [%s]\n", agp_component_type_val(DEFAULT_AGP_SEQ_COMPONENT_TYPE)); fprintf(fp_help, " --gap-ctype STR AGP output gap component type [%s]\n", agp_component_type_val(DEFAULT_AGP_GAP_COMPONENT_TYPE)); fprintf(fp_help, " --gap-link STR AGP output gap linkage evidence [%s]\n", agp_linkage_evidence_val(DEFAULT_AGP_LINKAGE_EVIDENCE)); fprintf(fp_help, " --gap-size INT AGP output gap size between sequence component [%d]\n", DEFAULT_AGP_GAP_SIZE); + fprintf(fp_help, "\n"); fprintf(fp_help, " --convert-to-binary make a binary ouput file from the input and exit\n"); fprintf(fp_help, " --print-telo-motifs print telomeric motifs in the database and exit\n"); } - fprintf(fp_help, " --no-contig-ec do not do contig error correction\n"); - fprintf(fp_help, " --no-scaffold-ec do not do scaffold error correction\n"); - fprintf(fp_help, " --no-mem-check do not do memory check at runtime\n"); - fprintf(fp_help, " --file-type STR input file type BED|BAM|PA5|BIN, file name extension is ignored if set\n"); - fprintf(fp_help, " --read-length read length (required for PA5 format input) [150]\n"); - fprintf(fp_help, " --telo-motif STR telomeric sequence motif\n"); + fprintf(fp_help, "\n"); fprintf(fp_help, " -o STR prefix of output files [yahs.out]\n"); fprintf(fp_help, " -v INT verbose level [%d]\n", VERBOSE); fprintf(fp_help, " -? print long help with extra option list\n"); @@ -754,7 +761,6 @@ int main(int argc, char *argv[]) int *resolutions, nr, rr, mq, ml, rl; int no_contig_ec, no_scaffold_ec, no_mem_check, d_min_cell, print_telomotifs, convert_binary; int8_t *telo_ends; - uint32_t wd; double q_drop, d_mass_frac; enum fileTypes f_type; @@ -770,7 +776,6 @@ int main(int argc, char *argv[]) ml = 0; rl = 150; ecstr = 0; - wd = 1000; q_drop = 0.1; d_min_cell = 30; d_mass_frac = 0.99; @@ -1032,17 +1037,17 @@ int main(int argc, char *argv[]) link_bin_file = malloc(strlen(out) + 5); sprintf(link_bin_file, "%s.bin", out); fprintf(stderr, "[I::%s] dump hic links (BAM) to binary file %s\n", __func__, link_bin_file); - dump_links_from_bam_file(link_file, fai, ml, 0, wd, q_drop, link_bin_file); + dump_links_from_bam_file(link_file, fai, ml, 0, q_drop, link_bin_file); } else if (f_type == BED) { link_bin_file = malloc(strlen(out) + 5); sprintf(link_bin_file, "%s.bin", out); fprintf(stderr, "[I::%s] dump hic links (BED) to binary file %s\n", __func__, link_bin_file); - dump_links_from_bed_file(link_file, fai, ml, 0, wd, q_drop, link_bin_file); + dump_links_from_bed_file(link_file, fai, ml, 0, q_drop, link_bin_file); } else if (f_type == PA5) { link_bin_file = malloc(strlen(out) + 5); sprintf(link_bin_file, "%s.bin", out); fprintf(stderr, "[I::%s] dump hic links (PA5) to binary file %s\n", __func__, link_bin_file); - dump_links_from_pa5_file(link_file, fai, ml, 0, rl, wd, q_drop, link_bin_file); + dump_links_from_pa5_file(link_file, fai, ml, 0, rl, q_drop, link_bin_file); } else if (f_type == BIN) { if (convert_binary) { fprintf(stderr, "[E::%s] Input is already in BIN format\n", __func__); @@ -1083,7 +1088,7 @@ int main(int argc, char *argv[]) if (convert_binary) goto final_clean; - ret = run_yahs(fai, agp, link_bin_file, ml, mq8, wd, out, resolutions, nr, rr, re_cuts, telo_ends, d_min_cell, d_mass_frac, no_contig_ec, no_scaffold_ec, no_mem_check); + ret = run_yahs(fai, agp, link_bin_file, ml, mq8, out, resolutions, nr, rr, re_cuts, telo_ends, d_min_cell, d_mass_frac, no_contig_ec, no_scaffold_ec, no_mem_check); if (ret == 0) { agp_final = (char *) malloc(strlen(out) + 35);