From 8db90f38ff383897ad2adf33e38403f44e7f09ea Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 8 Jan 2025 14:09:09 +0000 Subject: [PATCH] Print the number of removed duplicate sites in the final statistics Resolves #2346 --- NEWS | 4 ++++ vcfnorm.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index edf981c5..62b09a6d 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,10 @@ Changes affecting specific commands: - The option `--merge none` is expected to create no new multiallelic sites, but it should allow to merge, say, A>C with A>C,AT (#2333) +* bcftools norm + + - Print the number of removed duplicate sites in the final statistics (#2346) + * bcftools query - The functions used in -i/-e filtering expressions (such as SUM, MEDIAN, etc) can be diff --git a/vcfnorm.c b/vcfnorm.c index 2e5a97ef..a2576d71 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -105,7 +105,7 @@ typedef struct struct { int tot, set, swap; } nref; char **argv, *output_fname, *ref_fname, *vcf_fname, *region, *targets; int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels, clevel; - int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, mrows_op, mrows_collapse, parsimonious; + int nchanged, nskipped, nsplit, njoined, ntotal, nfilter, nrmdup, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; @@ -2138,10 +2138,10 @@ static void flush_buffer(args_t *args, htsFile *file, int n) int line_type = bcf_get_variant_types(args->lines[k]); if ( prev_rid>=0 && prev_rid==args->lines[k]->rid && prev_pos==args->lines[k]->pos ) { - if ( args->rmdup & BCF_SR_PAIR_ANY ) continue; // rmdup by position only - if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) continue; - if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) continue; - if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) continue; + if ( args->rmdup & BCF_SR_PAIR_ANY ) { args->nrmdup++; continue; } // rmdup by position only + if ( args->rmdup & BCF_SR_PAIR_SNPS && line_type&(VCF_SNP|VCF_MNP) && prev_type&(VCF_SNP|VCF_MNP) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_INDELS && line_type&(VCF_INDEL) && prev_type&(VCF_INDEL) ) { args->nrmdup++; continue; } + if ( args->rmdup & BCF_SR_PAIR_EXACT && cmpals_match(args, &args->cmpals_out, args->lines[k]) ) { args->nrmdup++; continue; } } else { @@ -2425,8 +2425,8 @@ static void normalize_vcf(args_t *args) } if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); - fprintf(stderr,"Lines total/split/joined/realigned/removed/skipped:\t%d/%d/%d/%d/%d/%d\n", - args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nfilter); + fprintf(stderr,"Lines total/split/joined/realigned/mismatch_removed/dup_removed/skipped:\t%d/%d/%d/%d/%d/%d/%d\n", + args->ntotal,args->nsplit,args->njoined,args->nchanged,args->nskipped,args->nrmdup,args->nfilter); if ( args->check_ref & CHECK_REF_FIX ) fprintf(stderr,"REF/ALT total/modified/added: \t%d/%d/%d\n", args->nref.tot,args->nref.swap,args->nref.set); }