From 827b2cd35be9c9e7ed76ea3f1c85c43bb8b63601 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 29 Jan 2025 09:21:02 +0100 Subject: [PATCH] Make the regex negation work correctly with missing values for example, -i 'TAG[*]~"\."' -i 'TAG[*]!~"\."' Resolves #2355 --- NEWS | 5 +- doc/bcftools.1 | 132 ++++++++++++++++++++----------------- doc/bcftools.html | 20 ++++-- filter.c | 10 ++- test/query.filter.15.1.out | 3 + test/query.filter.15.2.out | 3 + test/query.filter.15.vcf | 9 +++ test/test.pl | 4 ++ 8 files changed, 113 insertions(+), 73 deletions(-) create mode 100644 test/query.filter.15.1.out create mode 100644 test/query.filter.15.2.out create mode 100644 test/query.filter.15.vcf diff --git a/NEWS b/NEWS index af82a1f3..c3e7b42d 100644 --- a/NEWS +++ b/NEWS @@ -2,7 +2,10 @@ Changes affecting the whole of bcftools, or multiple commands: -* Add support for matching lines by ID (#1739) +* Add support for matching lines by ID via the --pair-logic and --collapse options (#1739) + +* The -i/-e filtering expressions now properly match the regex negation of missing + values, e.g. -i 'TAG!~"\."' (#2355) Changes affecting specific commands: diff --git a/doc/bcftools.1 b/doc/bcftools.1 index ceb6bd7e..750944eb 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -1,13 +1,13 @@ '\" t .\" Title: bcftools .\" Author: [see the "AUTHOR(S)" section] -.\" Generator: Asciidoctor 2.0.15.dev -.\" Date: 2024-12-28 +.\" Generator: Asciidoctor 2.0.20 +.\" Date: 2025-01-29 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2024-12-28" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2025-01-29" "\ \&" "\ \&" .ie \n(.g .ds Aq \(aq .el .ds Aq ' .ss \n[.ss] 0 @@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several commands can thus be combined with Unix pipes. .SS "VERSION" .sp -This manual page was last updated \fB2024\-12\-28 22:19 GMT\fP and refers to bcftools git version \fB1.21\-61\-g68f13f2e+\fP. 
+This manual page was last updated \fB2025\-01\-29 08:37 CET\fP and refers to bcftools git version \fB1.21\-72\-g724713f+\fP. .SS "BCF1" .sp The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP @@ -606,7 +606,7 @@ Such a file can be easily created from a VCF using: .if n .RS 4 .nf .fam C - bcftools query \-f\(aq%CHROM\(rst%POS\(rst%REF,%ALT\(rsn\(aq file.vcf | bgzip \-c > als.tsv.gz && tabix \-s1 \-b2 \-e2 als.tsv.gz + bcftools query \-f\*(Aq%CHROM\(rst%POS\(rst%REF,%ALT\(rsn\*(Aq file.vcf | bgzip \-c > als.tsv.gz && tabix \-s1 \-b2 \-e2 als.tsv.gz .fam .fi .if n .RE @@ -815,7 +815,7 @@ one can use .if n .RS 4 .nf .fam C - bcftools annotate \-\-set\-id +\(aq%CHROM\(rs_%POS\(rs_%REF\(rs_%FIRST_ALT\(aq file.vcf + bcftools annotate \-\-set\-id +\*(Aq%CHROM\(rs_%POS\(rs_%REF\(rs_%FIRST_ALT\*(Aq file.vcf .fam .fi .if n .RE @@ -835,13 +835,13 @@ file dynamically for each record: .if n .RS 4 .nf .fam C - # The field \(aqSTR\(aq from the \-a file is required to match INFO/TAG in VCF. In the first example + # The field \*(AqSTR\*(Aq from the \-a file is required to match INFO/TAG in VCF. In the first example # the alleles REF,ALT must match, in the second example they are ignored. The option \-k is required # to output also records that are not annotated. The third example shows the same concept with # a numerical expression. 
- bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,REF,ALT,SCORE,~STR \-i\(aqTAG={STR}\(aq \-k input.vcf - bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~STR \-i\(aqTAG={STR}\(aq \-k input.vcf - bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~INT \-i\(aqTAG>{INT}\(aq \-k input.vcf + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,REF,ALT,SCORE,~STR \-i\*(AqTAG={STR}\*(Aq \-k input.vcf + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~STR \-i\*(AqTAG={STR}\*(Aq \-k input.vcf + bcftools annotate \-a annots.tsv.gz \-c CHROM,POS,\-,\-,SCORE,~INT \-i\*(AqTAG>{INT}\*(Aq \-k input.vcf .fam .fi .if n .RE @@ -872,7 +872,7 @@ This is an experimental feature. annotate sites which are present ("+") or absent ("\-") in the \fB\-a\fP file with a new INFO/TAG flag .RE .sp -\fB\-\-min\-overlap\fP \fIANN\fP:\(aqVCF\(aq +\fB\-\-min\-overlap\fP \fIANN\fP:\*(AqVCF\*(Aq .RS 4 minimum overlap required as a fraction of the variant in the annotation \fB\-a\fP file (\fIANN\fP), in the target VCF file (\fI:VCF\fP), or both for reciprocal overlap (\fIANN:VCF\fP). 
@@ -1149,10 +1149,10 @@ workflow looks like this: .nf .fam C # Extract AN,AC values from an existing VCF, such 1000Genomes - bcftools query \-f\(aq%CHROM\(rst%POS\(rst%REF\(rst%ALT\(rst%AN\(rst%AC\(rsn\(aq 1000Genomes.bcf | bgzip \-c > AFs.tab.gz + bcftools query \-f\*(Aq%CHROM\(rst%POS\(rst%REF\(rst%ALT\(rst%AN\(rst%AC\(rsn\*(Aq 1000Genomes.bcf | bgzip \-c > AFs.tab.gz # If the tags AN,AC are not already present, use the +fill\-tags plugin - bcftools +fill\-tags 1000Genomes.bcf | bcftools query \-f\(aq%CHROM\(rst%POS\(rst%REF\(rst%ALT\(rst%AN\(rst%AC\(rsn\(aq | bgzip \-c > AFs.tab.gz + bcftools +fill\-tags 1000Genomes.bcf | bcftools query \-f\*(Aq%CHROM\(rst%POS\(rst%REF\(rst%ALT\(rst%AN\(rst%AC\(rsn\*(Aq | bgzip \-c > AFs.tab.gz tabix \-s1 \-b2 \-e2 AFs.tab.gz # Create a VCF header description, here we name the tags REF_AN,REF_AC @@ -2164,7 +2164,7 @@ An example of a minimal working GFF file: .fam C # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines, # looks up their parent transcript (determined from the "Parent=transcript:" attribute), - # the gene (determined from the transcript\(aqs "Parent=gene:" attribute), and the biotype + # the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype # (the most interesting is "protein_coding"). # # Empty and commented lines are skipped, the following GFF columns are required @@ -2349,7 +2349,7 @@ one of "tbi" or "csi" depending on output file format. # %TBCSQ{0} .. print the first haplotype only # %TBCSQ{1} .. print the second haplotype only # %TBCSQ{*} .. print a list of unique consequences present in either haplotype - bcftools query \-f\(aq[%CHROM\(rst%POS\(rst%SAMPLE\(rst%TBCSQ\(rsn]\(aq out.bcf + bcftools query \-f\*(Aq[%CHROM\(rst%POS\(rst%SAMPLE\(rst%TBCSQ\(rsn]\*(Aq out.bcf .fam .fi .if n .RE @@ -2428,7 +2428,7 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. 
.RE .sp -\fB\-g, \-\-SnpGap\fP \fIINT\fP[:\(aqindel\(aq,\fImnp\fP,\fIbnd\fP,\fIother\fP,\fIoverlap\fP] +\fB\-g, \-\-SnpGap\fP \fIINT\fP[:\*(Aqindel\*(Aq,\fImnp\fP,\fIbnd\fP,\fIother\fP,\fIoverlap\fP] .RS 4 filter SNPs within \fIINT\fP base pairs of an indel or other other variant type. The following example demonstrates the logic of \fB\-\-SnpGap\fP \fI3\fP applied on a deletion and @@ -2594,7 +2594,7 @@ in\-memory sorting and DIR is the temporary directory for external sorting. This Stop after first record to estimate required time. .RE .sp -\fB\-e, \-\-exclude\fP [\fIqry\fP|\fIgt\fP]:\(aqEXPRESSION\(aq +\fB\-e, \-\-exclude\fP [\fIqry\fP|\fIgt\fP]:\*(AqEXPRESSION\*(Aq .RS 4 Exclude sites from query file (\fIqry:\fP) or genotype file (\fIgt:\fP) for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. @@ -2636,7 +2636,7 @@ VCF/BCF file with reference genotypes to compare against Homozygous genotypes only, useful with low coverage data (requires \fB\-g, \-\-genotypes\fP) .RE .sp -\fB\-i, \-\-include\fP [\fIqry\fP|\fIgt\fP]:\(aqEXPRESSION\(aq +\fB\-i, \-\-include\fP [\fIqry\fP|\fIgt\fP]:\*(AqEXPRESSION\*(Aq .RS 4 Include sites from query file (\fIqry:\fP) or genotype file (\fIgt:\fP) for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. @@ -2684,7 +2684,7 @@ from the query file, the second from the genotypes file when \fB\-g\fP is given Restrict to comma\-separated list of regions, see \fBCommon Options\fP .RE .sp -*\-R, \-\-regions\-file\(aq \fIFILE\fP +*\-R, \-\-regions\-file\*(Aq \fIFILE\fP .RS 4 Restrict to regions listed in a file, see \fBCommon Options\fP .RE @@ -2694,11 +2694,11 @@ Restrict to regions listed in a file, see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp -\fB\-s, \-\-samples\fP [\fIqry\fP|\fIgt\fP]:\(aqLIST\(aq: +\fB\-s, \-\-samples\fP [\fIqry\fP|\fIgt\fP]:\*(AqLIST\*(Aq: List of query samples or \fB\-g\fP samples. 
If neither \fB\-s\fP nor \fB\-S\fP are given, all possible sample pair combinations are compared .sp -\fB\-S, \-\-samples\-file\fP [\fIqry\fP|\fIgt\fP]:\(aqFILE\(aq +\fB\-S, \-\-samples\-file\fP [\fIqry\fP|\fIgt\fP]:\*(AqFILE\*(Aq File with the query or \fB\-g\fP samples to compare. If neither \fB\-s\fP nor \fB\-S\fP are given, all possible sample pair combinations are compared .sp @@ -2966,7 +2966,7 @@ the files after filters have been applied .if n .RS 4 .nf .fam C - bcftools isec \-e\(aqMAF<0.01\(aq \-i\(aqdbSNP=1\(aq \-e\- A.vcf.gz B.vcf.gz C.vcf.gz \-n +2 \-p dir + bcftools isec \-e\*(AqMAF<0.01\*(Aq \-i\*(AqdbSNP=1\*(Aq \-e\- A.vcf.gz B.vcf.gz C.vcf.gz \-n +2 \-p dir .fam .fi .if n .RE @@ -3115,7 +3115,7 @@ if two asterisks \fI**\fP are appended, the unobserved allele will be removed al \-m both,* .. same as above but remove <*> (or ) from variant sites \-m both,** .. same as above but remove <*> (or ) at all sites \-m all .. SNP records can be merged with indel records -\-m snp\-ins\-del .. allow multiallelic SNVs, insertions, deletions, but don\(aqt mix them +\-m snp\-ins\-del .. allow multiallelic SNVs, insertions, deletions, but don\*(Aqt mix them \-m id .. merge by ID .fam .fi @@ -3579,7 +3579,7 @@ see \fBCommon Options\fP \fB\-o, \-\-output\fP \fIFILE\fP .RS 4 Write output to \fIFILE\fP, rather than the default of standard output. -(The same short option is used for both \fB\-\-open\-prob\fP and \fB\-\-output\fP. If \fB\-o\fP\(aqs +(The same short option is used for both \fB\-\-open\-prob\fP and \fB\-\-output\fP. If \fB\-o\fP\*(Aqs argument contains any non\-digit characters other than a leading + or \- sign, it is interpreted as \fB\-\-output\fP. Usually the filename extension will take care of this, but to write to an entirely numeric filename use \fB\-o @@ -3860,7 +3860,7 @@ but may not other aligners. 
.fam C bcftools mpileup \-Ou \-f ref.fa aln.bam | \(rs bcftools call \-Ou \-mv | \(rs - bcftools filter \-s LowQual \-e \(aqQUAL<20 || DP>100\(aq > var.flt.vcf + bcftools filter \-s LowQual \-e \*(AqQUAL<20 || DP>100\*(Aq > var.flt.vcf .fam .fi .if n .RE @@ -3898,7 +3898,7 @@ by shell and must be put in quotes or escaped by a backslash: 101 C G ./1 # After: - # bcftools norm \-a \-\-atom\-overlaps \(aq*\(aq + # bcftools norm \-a \-\-atom\-overlaps \*(Aq*\*(Aq # bcftools norm \-a \-\-atom\-overlaps \(rs* 100 C G,* 2/1 100 CC C,* 1/2 @@ -3918,8 +3918,7 @@ cannot be stressed enough, that \fIs\fP will NOT fix strand issues in your VCF, do NOT use it for that purpose!!! (Instead see .URL "http://samtools.github.io/bcftools/howtos/plugin.af\-dist.html" "" "" and -\c -.URL "http://samtools.github.io/bcftools/howtos/plugin.fixref.html." "" ")" +.URL "http://samtools.github.io/bcftools/howtos/plugin.fixref.html" "" ".)" .RE .sp \fB\-d, \-\-rm\-dup\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fIexact\fP @@ -4568,7 +4567,7 @@ determine parental origin of a CNV region \fBprune\fP .RS 4 prune sites by missingness, allele frequency or linkage disequilibrium. -Alternatively, annotate sites with r2, Lewontin\(cqs D\(aq (PMID:19433632), Ragsdale\(cqs D (PMID:31697386). +Alternatively, annotate sites with r2, Lewontin\(cqs D\*(Aq (PMID:19433632), Ragsdale\(cqs D (PMID:31697386). .RE .sp \fBremove\-overlaps\fP @@ -4702,10 +4701,10 @@ Does the environment variable BCFTOOLS_PLUGINS include the correct path? .if n .RS 4 .nf .fam C -// Short description used by \(aqbcftools plugin \-l\(aq +// Short description used by \*(Aqbcftools plugin \-l\*(Aq const char *about(void); -// Longer description used by \(aqbcftools +name \-h\(aq +// Longer description used by \*(Aqbcftools +name \-h\*(Aq const char *usage(void); // Called once at startup, allows initialization of local variables. @@ -4967,7 +4966,7 @@ Everything else is printed verbatim. 
.nf .fam C # Print chromosome, position, ref allele and the first alternate allele -bcftools query \-f \(aq%CHROM %POS %REF %ALT{0}\(rsn\(aq file.vcf.gz +bcftools query \-f \*(Aq%CHROM %POS %REF %ALT{0}\(rsn\*(Aq file.vcf.gz .fam .fi .if n .RE @@ -4976,7 +4975,7 @@ bcftools query \-f \(aq%CHROM %POS %REF %ALT{0}\(rsn\(aq file.vcf.gz .nf .fam C # Similar to above, but use tabs instead of spaces, add sample name and genotype -bcftools query \-f \(aq%CHROM\(rst%POS\(rst%REF\(rst%ALT[\(rst%SAMPLE=%GT]\(rsn\(aq file.vcf.gz +bcftools query \-f \*(Aq%CHROM\(rst%POS\(rst%REF\(rst%ALT[\(rst%SAMPLE=%GT]\(rsn\*(Aq file.vcf.gz .fam .fi .if n .RE @@ -4985,7 +4984,7 @@ bcftools query \-f \(aq%CHROM\(rst%POS\(rst%REF\(rst%ALT[\(rst%SAMPLE=%GT]\(rsn\ .nf .fam C # Print FORMAT/GT fields followed by FORMAT/GT fields -bcftools query \-f \(aqGQ:[ %GQ] \(rst GT:[ %GT]\(rsn\(aq file.vcf +bcftools query \-f \*(AqGQ:[ %GQ] \(rst GT:[ %GT]\(rsn\*(Aq file.vcf .fam .fi .if n .RE @@ -4994,7 +4993,7 @@ bcftools query \-f \(aqGQ:[ %GQ] \(rst GT:[ %GT]\(rsn\(aq file.vcf .nf .fam C # Make a BED file: chr, pos (0\-based), end pos (1\-based), id -bcftools query \-f\(aq%CHROM\(rst%POS0\(rst%END\(rst%ID\(rsn\(aq file.bcf +bcftools query \-f\*(Aq%CHROM\(rst%POS0\(rst%END\(rst%ID\(rsn\*(Aq file.bcf .fam .fi .if n .RE @@ -5003,7 +5002,7 @@ bcftools query \-f\(aq%CHROM\(rst%POS0\(rst%END\(rst%ID\(rsn\(aq file.bcf .nf .fam C # Print only samples with alternate (non\-reference) genotypes -bcftools query \-f\(aq[%CHROM:%POS %SAMPLE %GT\(rsn]\(aq \-i\(aqGT="alt"\(aq file.bcf +bcftools query \-f\*(Aq[%CHROM:%POS %SAMPLE %GT\(rsn]\*(Aq \-i\*(AqGT="alt"\*(Aq file.bcf .fam .fi .if n .RE @@ -5012,7 +5011,7 @@ bcftools query \-f\(aq[%CHROM:%POS %SAMPLE %GT\(rsn]\(aq \-i\(aqGT="alt"\(aq fil .nf .fam C # Print all samples at sites with at least one alternate genotype -bcftools view \-i\(aqGT="alt"\(aq file.bcf \-Ou | bcftools query \-f\(aq[%CHROM:%POS %SAMPLE %GT\(rsn]\(aq +bcftools view \-i\*(AqGT="alt"\*(Aq 
file.bcf \-Ou | bcftools query \-f\*(Aq[%CHROM:%POS %SAMPLE %GT\(rsn]\*(Aq .fam .fi .if n .RE @@ -5021,7 +5020,7 @@ bcftools view \-i\(aqGT="alt"\(aq file.bcf \-Ou | bcftools query \-f\(aq[%CHROM: .nf .fam C # Print phred\-scaled binomial probability from FORMAT/AD tag for all heterozygous genotypes -bcftools query \-i\(aqGT="het"\(aq \-f\(aq[%CHROM:%POS %SAMPLE %GT %PBINOM(AD)\(rsn]\(aq file.vcf +bcftools query \-i\*(AqGT="het"\*(Aq \-f\*(Aq[%CHROM:%POS %SAMPLE %GT %PBINOM(AD)\(rsn]\*(Aq file.vcf .fam .fi .if n .RE @@ -5032,7 +5031,7 @@ bcftools query \-i\(aqGT="het"\(aq \-f\(aq[%CHROM:%POS %SAMPLE %GT %PBINOM(AD)\( # Print the second value of AC field if bigger than 10. Note the (unfortunate) difference in # index subscript notation: formatting expressions (\-f) uses "{}" while filtering expressions # (\-i) use "[]". This is for historic reasons and backward\-compatibility. -bcftools query \-f \(aq%AC{1}\(rsn\(aq \-i \(aqAC[1]>10\(aq file.vcf.gz +bcftools query \-f \*(Aq%AC{1}\(rsn\*(Aq \-i \*(AqAC[1]>10\*(Aq file.vcf.gz .fam .fi .if n .RE @@ -5042,8 +5041,17 @@ bcftools query \-f \(aq%AC{1}\(rsn\(aq \-i \(aqAC[1]>10\(aq file.vcf.gz .fam C # Print all samples at sites where at least one sample has DP=1 or DP=2. In the second case # print only samples with DP=1 or DP=2, the difference is in the logical operator used, || vs |. 
-bcftools query \-f \(aq[%SAMPLE %GT %DP\(rsn]\(aq \-i \(aqFMT/DP=1 || FMT/DP=2\(aq file.vcf -bcftools query \-f \(aq[%SAMPLE %GT %DP\(rsn]\(aq \-i \(aqFMT/DP=1 | FMT/DP=2\(aq file.vcf +bcftools query \-f \*(Aq[%SAMPLE %GT %DP\(rsn]\*(Aq \-i \*(AqFMT/DP=1 || FMT/DP=2\*(Aq file.vcf +bcftools query \-f \*(Aq[%SAMPLE %GT %DP\(rsn]\*(Aq \-i \*(AqFMT/DP=1 | FMT/DP=2\*(Aq file.vcf +.fam +.fi +.if n .RE +.sp +.if n .RS 4 +.nf +.fam C +# Refer to ID column vs INFO/ID tag vs FORMAT/ID tag +bcftools query \-f \*(AqcolumnID=%ID infoID=%INFO/ID [fmtID=%ID ] [columnID=%/ID]\*(Aq .fam .fi .if n .RE @@ -5145,7 +5153,7 @@ Note that such a file can be easily created from a VCF using: .if n .RS 4 .nf .fam C - bcftools query \-f\(aq%CHROM\(rst%POS\(rst%REF,%ALT\(rst%INFO/TAG\(rsn\(aq file.vcf | bgzip \-c > freqs.tab.gz + bcftools query \-f\*(Aq%CHROM\(rst%POS\(rst%REF,%ALT\(rst%INFO/TAG\(rsn\*(Aq file.vcf | bgzip \-c > freqs.tab.gz .fam .fi .if n .RE @@ -5601,7 +5609,7 @@ multiple subsets simultaneously using the \fBsplit\fP plugin. Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means -that filters like \fI\-\-uncalled\fP, \-\-exclude\-uncalled\(aq, or \fI\-\-min\-af 0.1\fP will be calculated from INFO/AC and +that filters like \fI\-\-uncalled\fP, \-\-exclude\-uncalled\*(Aq, or \fI\-\-min\-af 0.1\fP will be calculated from INFO/AC and INFO/AN when available or FORMAT/GT otherwise. However, it will not attempt to use any other existing field, like INFO/AF for example. For that, use \fI\-\-exclude AF<0.1\fP instead. .sp @@ -5613,7 +5621,7 @@ column when present but calculated on the fly when absent. Therefore it is stron required order explicitly by separating such commands into two steps. (Make sure to use the \fB\-O u\fP option when piping!) 
.sp -\fB\-c, \-\-min\-ac\fP \fIINT\fP[\fI:nref\fP|\fI:alt1\fP|\fI:minor\fP|\fI:major\fP|:\(aqnonmajor\(aq] +\fB\-c, \-\-min\-ac\fP \fIINT\fP[\fI:nref\fP|\fI:alt1\fP|\fI:minor\fP|\fI:major\fP|:\*(Aqnonmajor\*(Aq] .RS 4 minimum allele count (INFO/AC) of sites to be printed. Specifying the type of allele is optional and can be set to @@ -5622,7 +5630,7 @@ frequent (\fIminor\fP), the most frequent (\fImajor\fP) or sum of all but the most frequent (\fInonmajor\fP) alleles. .RE .sp -\fB\-C, \-\-max\-ac\fP \fIINT\fP[\fI:nref\fP|\fI:alt1\fP|\fI:minor\fP|:\(aqmajor\(aq|:\(aqnonmajor\(aq] +\fB\-C, \-\-max\-ac\fP \fIINT\fP[\fI:nref\fP|\fI:alt1\fP|\fI:minor\fP|:\*(Aqmajor\*(Aq|:\*(Aqnonmajor\*(Aq] .RS 4 maximum allele count (INFO/AC) of sites to be printed. Specifying the type of allele is optional and can be set to @@ -5843,7 +5851,7 @@ plot\-vcfstats \-p outdir file.vchk .nf .fam C # The final looks can be customized by editing the generated -# \(aqoutdir/plot.py\(aq script and re\-running manually +# \*(Aqoutdir/plot.py\*(Aq script and re\-running manually cd outdir && python plot.py && pdflatex summary.tex .fam .fi @@ -6445,9 +6453,9 @@ Consequently, the following two expressions are equivalent but not the third: .if n .RS 4 .nf .fam C -\-i \(aqTAG="hello,world"\(aq -\-i \(aqTAG="hello" || TAG="world"\(aq -\-i \(aqTAG="hello" && TAG="world"\(aq +\-i \*(AqTAG="hello,world"\*(Aq +\-i \*(AqTAG="hello" || TAG="world"\*(Aq +\-i \*(AqTAG="hello" && TAG="world"\*(Aq .fam .fi .if n .RE @@ -6480,14 +6488,14 @@ used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluat .if n .RS 4 .nf .fam C -\-i \(aqTAG[*]=1\(aq .. true, the record will be printed -\-i \(aqTAG[*]!=1\(aq .. true -\-e \(aqTAG[*]=1\(aq .. false, the record will be discarded -\-e \(aqTAG[*]!=1\(aq .. false -\-i \(aqTAG[0]=1\(aq .. true -\-i \(aqTAG[0]!=1\(aq .. false -\-e \(aqTAG[0]=1\(aq .. false -\-e \(aqTAG[0]!=1\(aq .. true +\-i \*(AqTAG[*]=1\*(Aq .. 
true, the record will be printed +\-i \*(AqTAG[*]!=1\*(Aq .. true +\-e \*(AqTAG[*]=1\*(Aq .. false, the record will be discarded +\-e \*(AqTAG[*]!=1\*(Aq .. false +\-i \*(AqTAG[0]=1\*(Aq .. true +\-i \*(AqTAG[0]!=1\*(Aq .. false +\-e \*(AqTAG[0]=1\*(Aq .. false +\-e \*(AqTAG[0]!=1\*(Aq .. true .fam .fi .if n .RE @@ -6512,7 +6520,7 @@ compute the resulting vector C: . sp -1 . IP \(bu 2.3 .\} -C_i = A_i + B_i when length(A)==B(A) and sets length\(co=length(A) +C_i = A_i + B_i when length(A)==B(A) and sets length(C)=length(A) .RE .sp .RS 4 @@ -6523,7 +6531,7 @@ C_i = A_i + B_i when length(A)==B(A) and sets length\(co=length(A) . sp -1 . IP \(bu 2.3 .\} -C_i = A_i + B_0 when length(B)=1 and sets length\(co=length(A) +C_i = A_i + B_0 when length(B)=1 and sets length(C)=length(A) .RE .sp .RS 4 @@ -6534,7 +6542,7 @@ C_i = A_i + B_0 when length(B)=1 and sets length\(co=length(A) . sp -1 . IP \(bu 2.3 .\} -C_i = A_0 + B_i when length(A)=1 and sets length\(co=length(B) +C_i = A_0 + B_i when length(A)=1 and sets length(C)=length(B) .RE .sp .RS 4 @@ -6689,7 +6697,7 @@ that the whole expression is passed to the program as intended: .if n .RS 4 .nf .fam C -bcftools view \-i \(aqID!="." & MAF[0]<0.01\(aq +bcftools view \-i \*(AqID!="." & MAF[0]<0.01\*(Aq .fam .fi .if n .RE @@ -6713,7 +6721,7 @@ C CAAA .. indel, insertion (regardless of length) C <*> .. gVCF block, the allele <*> is a placeholder for alternate allele possibly missed because of low coverage C .. synonymous to <*> C * .. overlapping deletion -C .. symbolic allele, known also as \(aqother [than above]\(aq +C .. symbolic allele, known also as \*(Aqother [than above]\*(Aq .fam .fi .if n .RE diff --git a/doc/bcftools.html b/doc/bcftools.html index eb8b1cbe..41d78439 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -4,7 +4,7 @@ - + bcftools(1) @@ -50,7 +50,7 @@

DESCRIPTION

VERSION

-

This manual page was last updated 2024-12-28 22:19 GMT and refers to bcftools git version 1.21-61-g68f13f2e+.

+

This manual page was last updated 2025-01-29 08:37 CET and refers to bcftools git version 1.21-72-g724713f+.

@@ -3481,7 +3481,7 @@

bcftools norm [OPTIONS] file.vcf.gz

cannot be stressed enough, that s will NOT fix strand issues in your VCF, do NOT use it for that purpose!!! (Instead see http://samtools.github.io/bcftools/howtos/plugin.af-dist.html and -<http://samtools.github.io/bcftools/howtos/plugin.fixref.html>.)

+http://samtools.github.io/bcftools/howtos/plugin.fixref.html.)

-d, --rm-dup snps|indels|both|all|exact
@@ -4336,6 +4336,12 @@

Examples:

bcftools query -f '[%SAMPLE %GT %DP\n]' -i 'FMT/DP=1 | FMT/DP=2' file.vcf
+
+
+
# Refer to ID column vs INFO/ID tag vs FORMAT/ID tag
+bcftools query -f 'columnID=%ID   infoID=%INFO/ID  [fmtID=%ID ]  [columnID=%/ID]'
+
+
@@ -5493,13 +5499,13 @@

FILTERING EXPRESSIONS

  • -

    C_i = A_i + B_i when length(A)==B(A) and sets length©=length(A)

    +

    C_i = A_i + B_i when length(A)==length(B) and sets length(C)=length(A)

  • -

    C_i = A_i + B_0 when length(B)=1 and sets length©=length(A)

    +

    C_i = A_i + B_0 when length(B)=1 and sets length(C)=length(A)

  • -

    C_i = A_0 + B_i when length(A)=1 and sets length©=length(B)

    +

    C_i = A_0 + B_i when length(A)=1 and sets length(C)=length(B)

  • throw an error when length(A)!=length(B) AND length(A)!=1 AND length(B)!=1

    @@ -5717,7 +5723,7 @@

    COPYING

diff --git a/filter.c b/filter.c index cce97ee5..1f1f04ea 100644 --- a/filter.c +++ b/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2024 Genome Research Ltd. + Copyright (C) 2013-2025 Genome Research Ltd. Author: Petr Danecek @@ -2609,9 +2609,11 @@ static int _regex_vector_strings(regex_t *regex, char *str, size_t len, int logi char *mid = str; while ( mid < end && *mid && *mid!=',' ) mid++; int miss = mid - str == 1 && str[0]=='.' ? 1 : 0; - if ( miss && missing_logic[miss] ) return 1; + int match = ( miss && missing_logic[miss] ) ? 1 : 0; + if ( logic==TOK_NLIKE ) match = match ? 0 : 1; + if ( match ) return 1; char tmp = *mid; *mid = 0; - int match = regexec(regex, str, 0,NULL,0) ? 0 : 1; + match = regexec(regex, str, 0,NULL,0) ? 0 : 1; *mid = tmp; if ( logic==TOK_NLIKE ) match = match ? 0 : 1; if ( match ) return 1; @@ -2698,6 +2700,7 @@ static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok) { token_t *tok = atok->regex ? btok : atok; rtok->pass_site = _regex_vector_strings(regex, tok->str_value.s, tok->str_value.l, logic, missing_logic); + fprintf(stderr,"pass=%d [%s]\n",rtok->pass_site,tok->str_value.s); } return; } @@ -3742,6 +3745,7 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error if ( type==BCF_HT_INT ) set_missing = 1; else if ( type==BCF_HT_REAL ) set_missing = 1; } + else if ( !out[k].tag ) error("Error: could not parse the expression\n"); // e.g. =~ else if ( !strcmp("QUAL",out[k].tag) ) set_missing = 1; if ( set_missing ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); } } diff --git a/test/query.filter.15.1.out b/test/query.filter.15.1.out new file mode 100644 index 00000000..f8116883 --- /dev/null +++ b/test/query.filter.15.1.out @@ -0,0 +1,3 @@ +. +.,. 
+a,.,c diff --git a/test/query.filter.15.2.out b/test/query.filter.15.2.out new file mode 100644 index 00000000..341f9ae3 --- /dev/null +++ b/test/query.filter.15.2.out @@ -0,0 +1,3 @@ +a,b,c +a +a,.,c diff --git a/test/query.filter.15.vcf b/test/query.filter.15.vcf new file mode 100644 index 00000000..a574086f --- /dev/null +++ b/test/query.filter.15.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 1 . * * . . TAG=a,b,c +chr1 2 . * * . . TAG=a +chr1 3 . * * . . TAG=. +chr1 4 . * * . . TAG=.,. +chr1 5 . * * . . TAG=a,.,c diff --git a/test/test.pl b/test/test.pl index e968cb51..240e1983 100755 --- a/test/test.pl +++ b/test/test.pl @@ -115,6 +115,10 @@ run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf - --merge none'); run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.11.a','merge.gvcf.11.b','merge.gvcf.11.c'],out=>'merge.gvcf.11.1.out',args=>'--gvcf -'); # run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed +run_test(\&test_vcf_query,$opts,in=>'query.filter.15',out=>'query.filter.15.1.out',args=>q[-f '%TAG' -i 'TAG[*]="."']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.15',out=>'query.filter.15.1.out',args=>q[-f '%TAG' -i 'TAG[*]~"\."']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.15',out=>'query.filter.15.2.out',args=>q[-f '%TAG' -i 'TAG[*]!="."']); +run_test(\&test_vcf_query,$opts,in=>'query.filter.15',out=>'query.filter.15.2.out',args=>q[-f '%TAG' -i 'TAG[*]!~"\."']); run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.1.out',args=>q[-f '%CHROM %POS %ID %REF %ALT %QUAL %FILTER \\t %INFO/CHROM %INFO/POS %INFO/ID %INFO/REF %INFO/ALT %INFO/QUAL %INFO/FILTER']); run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.2.out',args=>q[-f '[ %CHROM] \\t [ %POS] \\t [ %ID] \\t [ %REF] \\t [ %ALT] \\t [ %QUAL] \\t [ %FILTER]']); 
run_test(\&test_vcf_query,$opts,in=>'query.3',out=>'query.3.3.out',args=>q[-f '[ %/CHROM] \\t [ %/POS] \\t [ %/ID] \\t [ %/REF] \\t [ %/ALT] \\t [ %/QUAL] \\t [ %/FILTER]']);