Skip to content

Commit

Permalink
add mapping quality score to binary file
Browse files Browse the repository at this point in the history
  • Loading branch information
Chenxi Zhou committed Jul 8, 2022
1 parent a80b3de commit 4814e99
Show file tree
Hide file tree
Showing 11 changed files with 323 additions and 168 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ With `-o` option, you can specify the prefix of the output files. It is `./yash.

With `-a` option, you can specify a AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the start point.

With `-r` option, you can specify a range of resultions. It is `50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000` by default and the upper limit is automatically adjusted with the genome size.
With `-r` option, you can specify a range of resultions (in ascending order). It is `10000,20000,50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000,50000000,100000000,200000000,500000000` by default and the upper limit is automatically adjusted by the genome size. For highly fragmented genome assemblies, you can try to start with higher resultions by adding smaller `-r` values.

With `-e` option, you can specify the restriction enzyme(s) used by the Hi-C experiment. For example, `GATC` for the DpnII restriction enzyme used by the Dovetail Hi-C Kit; `GATC,GANT` and `CGATC,GANTC,CTNAG,TTAA` for Arima genomics 2-enzyme and 4-enzyme protocol, respectively. Sometimes, the specification of enzymes may not change the scaffolding result very much if not make it worse, especically when the base quality of the assembly is not very good, e.g., assembies constructed from noisy long reads.
With `-e` option, you can specify the restriction enzyme(s) used by the Hi-C experiment. For example, `GATC` for the DpnII restriction enzyme used by the Dovetail Hi-C Kit; `GATC,GANT` and `CGATC,GANTC,CTNAG,TTAA` for Arima genomics 2-enzyme and 4-enzyme protocol, respectively. Sometimes, the specification of enzymes may not change the scaffolding result very much if not make it worse, especially when the base quality of the assembly is not very good, e.g., assembies constructed from noisy long reads.

With `-l` option, you can specify the minimum contig length included for scaffolding.

Expand Down
16 changes: 16 additions & 0 deletions asset.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,19 @@ uint64_t linear_scale(uint64_t g, int *scale, uint64_t max_g)
return g;
}

void write_bin_header(FILE *fo)
{
int64_t magic_number = BIN_H;
int64_t bin_version = BIN_V;
magic_number |= bin_version;
fwrite(&magic_number, sizeof(int64_t), 1, fo);
}

int is_valid_bin_header(int64_t n)
{
int64_t magic_number = BIN_H;
int64_t bin_version = BIN_V;
magic_number |= bin_version;
return n == magic_number;
}

5 changes: 5 additions & 0 deletions asset.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define BUFF_SIZE 4096
#define GAP_SZ 200
#define BIN_H 0x5941485342494E56
#define BIN_V 0x1
#define LINK_EVIDENCE "proximity_ligation"

#ifdef __cplusplus
extern "C" {
Expand All @@ -51,6 +54,8 @@ int file_copy(char *fin, char *fout);
int8_t is_read_pair(const char *rname0, const char *rname1);
uint32_t div_ceil(uint64_t x, uint32_t y);
uint64_t linear_scale(uint64_t g, int *scale, uint64_t max_g);
void write_bin_header(FILE *fo);
int is_valid_bin_header(int64_t magic_number);
#ifdef __cplusplus
}
#endif
Expand Down
71 changes: 47 additions & 24 deletions break.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,14 @@ void link_mat_destroy(link_mat_t *link_mat)
free(link_mat);
}

uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution)
uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution, uint8_t mq)
{
uint32_t i;
FILE *fp;
uint32_t i, m, i0, i1, nb, *link_c;
uint8_t buffer[BUFF_SIZE * 17];
uint64_t max_len, p0, p1;
int64_t magic_number;
long pair_c, intra_c, cum_c;
uint64_t max_len;
uint32_t nb, *link_c;
uint32_t buffer[BUFF_SIZE], m, i0, i1;
uint64_t p0, p1;

max_len = 0;
for (i = 0; i < dict->n; ++i)
Expand All @@ -101,21 +100,33 @@ uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double m
exit(EXIT_FAILURE);
}

m = fread(&magic_number, sizeof(int64_t), 1, fp);
if (!m || !is_valid_bin_header(magic_number)) {
fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__);
exit(EXIT_FAILURE);
}

pair_c = 0;
intra_c = 0;
while (1) {
m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp);
for (i = 0; i < m; i += 4) {
sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0);
sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0);
m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp);

for (i = 0; i < m; i += 17) {
if (*(uint8_t *) (buffer + i + 16) < mq)
continue;

sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0);
sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0);

if (i0 == i1) {
++link_c[labs((long) p0 - p1) / resolution];
++intra_c;
}

++pair_c;
}
pair_c += m / 4;

if (m < BUFF_SIZE) {
if (m < BUFF_SIZE * 17) {
if (ferror(fp))
return 0;
break;
Expand All @@ -130,7 +141,7 @@ uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double m
cum_c += link_c[i++];
free(link_c);

#ifdef REMOVE_NOISE
#ifdef DEBUG
printf("[I::%s] %ld read pairs processed, intra links: %ld \n", __func__, pair_c, intra_c);
#endif
return i * resolution;
Expand Down Expand Up @@ -176,12 +187,13 @@ static void calc_moving_average(int64_t *arr, int32_t n, int32_t a)
free(buff);
}

link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg)
link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg, uint8_t mq)
{
FILE *fp;
uint32_t i, j, n;
uint32_t buffer[BUFF_SIZE], m, i0, i1;
uint32_t i, j, m, n, i0, i1;
uint64_t p0, p1;
int64_t magic_number;
uint8_t buffer[BUFF_SIZE * 17];
long pair_c, intra_c;

fp = fopen(f, "r");
Expand All @@ -190,6 +202,12 @@ link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_th
exit(EXIT_FAILURE);
}

m = fread(&magic_number, sizeof(int64_t), 1, fp);
if (!m || !is_valid_bin_header(magic_number)) {
fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__);
exit(EXIT_FAILURE);
}

link_mat_t *link_mat = (link_mat_t *) malloc(sizeof(link_mat_t));
link_mat->b = resolution;
link_mat->n = dict->n;
Expand All @@ -203,22 +221,27 @@ link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_th

pair_c = intra_c = 0;
while (1) {
m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp);
for (i = 0; i < m; i += 4) {
sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0);
sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0);
m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp);

for (i = 0; i < m; i += 17) {
if (*(uint8_t *) (buffer + i + 16) < mq)
continue;

sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0);
sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0);

if (p0 > p1)
SWAP(uint64_t, p0, p1);
if (i0 == i1 && p1 - p0 <= dist_thres) {
++intra_c;
link_mat->link[i0].link[(MAX(p0, 1) - 1) / resolution] += 1;
link_mat->link[i1].link[(MAX(p1, 1) - 1) / resolution] -= 1;
++intra_c;
}

++pair_c;
}
pair_c += m / 4;

if (m < BUFF_SIZE) {

if (m < BUFF_SIZE * 17) {
if (ferror(fp))
return 0;
break;
Expand Down
4 changes: 2 additions & 2 deletions break.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ extern "C" {

link_t *link_init(uint32_t s, uint32_t n);
link_mat_t *link_mat_init(asm_dict_t *dict, uint32_t b);
link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg);
uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution);
link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg, uint8_t mq);
uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution, uint8_t mq);
void link_mat_destroy(link_mat_t *link_mat);
void print_link_mat(link_mat_t *link_mat, asm_dict_t *dict, FILE *fp);
bp_t *detect_break_points(link_mat_t *link_mat, uint32_t bin_size, uint32_t merge_size, double fold_thres, uint32_t dual_break_thres, uint32_t *bp_n);
Expand Down
2 changes: 1 addition & 1 deletion graph.c
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,7 @@ int search_graph_path(graph_t *g, asm_dict_t *dict, char *out)
fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tW\t%s\t%u\t%u\t%c\n", s, len + 1, len + cseg.y, ++t, sd->s[cseg.c >> 1].name, cseg.x + 1, cseg.x + cseg.y, "+-"[(cseg.c & 1) ^ ori]);
len += cseg.y;
if (k != nseg - 1 || j != qs - 1) {
fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tN\t%d\tscaffold\tyes\tna\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ);
fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tN\t%d\tscaffold\tyes\t%s\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ, LINK_EVIDENCE);
len += GAP_SZ;
}
}
Expand Down
Loading

0 comments on commit 4814e99

Please sign in to comment.