diff --git a/src/lofreq/lofreq_call.c b/src/lofreq/lofreq_call.c index 81c0c7b..48ca6ea 100644 --- a/src/lofreq/lofreq_call.c +++ b/src/lofreq/lofreq_call.c @@ -127,8 +127,10 @@ report_var(vcf_file_t *vcf_file, const plp_col_t *p, const char *ref, &sb_left_pv, &sb_right_pv, &sb_two_pv); sb_qual = PROB_TO_PHREDQUAL_SAFE(sb_two_pv); } + + vcf_var_sprintf_info(var, is_indel? p->coverage_plp - p->num_tails : p->coverage_plp, - af, sb_qual, dp4, is_indel, p->hrun, is_consvar); + af, sb_qual, dp4, is_indel, p->hrun, is_consvar, p->num_alt_bases); vcf_write_var(vcf_file, var); vcf_free_var(&var); @@ -730,7 +732,7 @@ call_indels(const plp_col_t *p, varcall_conf_t *conf) * */ void -call_snvs(const plp_col_t *p, varcall_conf_t *conf) +call_snvs(plp_col_t *p, varcall_conf_t *conf) { double *bc_err_probs; /* error probs (qualities) passed down to snpcaller */ int bc_num_err_probs; /* #elements in bc_err_probs */ @@ -854,6 +856,9 @@ call_snvs(const plp_col_t *p, varcall_conf_t *conf) dp4.alt_fw = p->fw_counts[alt_nt4]; dp4.alt_rv = p->rv_counts[alt_nt4]; + // remember the filtered alt count to report it later + p->num_alt_bases = alt_count; + report_var(& conf->vcf_out, p, report_ref, report_alt, af, PROB_TO_PHREDQUAL(pvalue), is_indel, is_consvar, &dp4); diff --git a/src/lofreq/plp.h b/src/lofreq/plp.h index 5fa35e0..c652c31 100644 --- a/src/lofreq/plp.h +++ b/src/lofreq/plp.h @@ -77,6 +77,7 @@ typedef struct { char cons_base[MAX_INDELSIZE]; /* uppercase consensus base according to base-counts, after read-level filtering. */ int coverage_plp; /* original samtools value. upper count limit for all kept values */ int num_bases; /* number of bases after base filtering */ + int num_alt_bases; /* number of alt bases after base filtering (for a snp call) */ /* num_ins and num_dels gives 'num_indels' */ int num_ign_indels; /* a hack: indels often get filtered because of low quality of missing qualities in bam file. we need to know nevertheless they are present. this is the count of all "ignored" indels */ diff --git a/src/lofreq/snpcaller.c b/src/lofreq/snpcaller.c index f3a8e44..b0b6b22 100644 --- a/src/lofreq/snpcaller.c +++ b/src/lofreq/snpcaller.c @@ -408,6 +408,14 @@ plp_to_errprobs(double **err_probs, int *num_err_probs, if (p->base_quals[i].n) { bq = p->base_quals[i].data[j]; + /* + * count alt bases before applying bq filter + * AF will be calculated based on raw counts + */ + if (is_alt_base) { + alt_raw_counts[alt_idx] += 1; + } + /* bq filtering for all * FIXME min_bq was meant to just influence quality calculations, not af etc * but the filtering leaks out later, because alt_counts is computed and returned after filtering @@ -418,7 +426,6 @@ plp_to_errprobs(double **err_probs, int *num_err_probs, /* alt bq threshold and overwrite if needed */ if (is_alt_base) { - alt_raw_counts[alt_idx] += 1; /* ignore altogether if below alt bq threshold */ if (bq < conf->min_alt_bq) { continue; diff --git a/src/lofreq/vcf.c b/src/lofreq/vcf.c index e97eb83..2db66d4 100644 --- a/src/lofreq/vcf.c +++ b/src/lofreq/vcf.c @@ -50,7 +50,7 @@ #include "vcf.h" #include "defaults.h" -#define LINE_BUF_SIZE 1<<12 +#define LINE_BUF_SIZE 1<<13 /* this is the actual header. all the other stuff is actually called meta-info @@ -587,7 +587,7 @@ void vcf_var_sprintf_info(var_t *var, const int dp, const float af, const int sb, const dp4_counts_t *dp4, const int indel, const int hrun, - const int consvar) + const int consvar, const int num_alt_bases) { char buf[LINE_BUF_SIZE]; snprintf(buf, sizeof(buf), @@ -596,6 +596,8 @@ void vcf_var_sprintf_info(var_t *var, if (indel) { strcat(buf, ";INDEL"); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ";HRUN=%d", hrun); + } else { + snprintf(buf +strlen(buf), sizeof(buf) - strlen(buf), ";HQA=%d", num_alt_bases); } if (consvar) { strcat(buf, ";CONSVAR"); @@ -644,6 +646,7 @@ void vcf_write_new_header(vcf_file_t *vcf_file, const char *src, const char *ref vcf_printf(vcf_file, "##INFO=\n"); vcf_printf(vcf_file, "##INFO=\n"); vcf_printf(vcf_file, "##INFO=\n"); + vcf_printf(vcf_file, "##INFO=\n"); vcf_printf(vcf_file, "##INFO=\n"); vcf_printf(vcf_file, "##INFO=\n"); vcf_printf(vcf_file, "##INFO=\n"); diff --git a/src/lofreq/vcf.h b/src/lofreq/vcf.h index e2f36e5..f3216b9 100644 --- a/src/lofreq/vcf.h +++ b/src/lofreq/vcf.h @@ -124,7 +124,7 @@ char *vcf_var_add_to_info(var_t *var, const char *info_str); void vcf_var_sprintf_info(var_t *var, const int dp, const float af, const int sb, const dp4_counts_t *dp4, - const int is_indel, const int hrun, const int is_consvar); + const int is_indel, const int hrun, const int is_consvar, const int num_alt_bases); void vcf_write_var(vcf_file_t *vcf_file, const var_t *var); void vcf_write_header(vcf_file_t *vcf_file, const char *header); void vcf_write_new_header(vcf_file_t *vcf_file, const char *srcprog, const char *reffa);