Skip to content

Commit

Permalink
Merge pull request #26 from martinghunt/handle_non_acgt
Browse files Browse the repository at this point in the history
Handle non acgt characters
  • Loading branch information
martinghunt authored Mar 18, 2021
2 parents 7913adb + ef50e57 commit d6cdb75
Show file tree
Hide file tree
Showing 16 changed files with 56 additions and 14 deletions.
8 changes: 4 additions & 4 deletions .ci/install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ cp -s vt-git/vt .

#________________________ mummer ____________________________#
cd $install_root
wget https://github.com/mummer4/mummer/releases/download/v4.0.0beta2/mummer-4.0.0beta2.tar.gz
tar xf mummer-4.0.0beta2.tar.gz
rm mummer-4.0.0beta2.tar.gz
cd mummer-4.0.0beta2
wget https://github.com/mummer4/mummer/releases/download/v4.0.0rc1/mummer-4.0.0rc1.tar.gz
tar xf mummer-4.0.0rc1.tar.gz
rm mummer-4.0.0rc1.tar.gz
cd mummer-4.0.0rc1
./configure
make
make install
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM continuumio/miniconda3
ENV MINIMAP_VERSION 2.17
ENV MUMMER_VERSION "4.0.0beta2"
ENV MUMMER_VERSION "4.0.0rc1"
ENV BIOPY_VERSION 1.77
ENV CLUSTER_VERSION 0.13.1
ENV PANDAS_VERSION 1.1.0
Expand Down
1 change: 1 addition & 0 deletions tests/data/tasks/vcf_eval.expect.masked.summary_stats.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"filter_fail": 1,
"heterozygous": 0,
"no_genotype": 0,
"non_acgt": 0,
"other": 0,
"ref_call": 1
},
Expand Down
1 change: 1 addition & 0 deletions tests/data/tasks/vcf_eval.expect.summary_stats.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"filter_fail": 2,
"heterozygous": 0,
"no_genotype": 0,
"non_acgt": 0,
"other": 0,
"ref_call": 1
},
Expand Down
2 changes: 1 addition & 1 deletion tests/data/truth_variant_finding/make_truth_vcf.ref.fa
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
>ref
AAGCCCAATAAACCACTAGTGATATAGGCAACGACATGTACAGTGCGGCGACCCTTGCAA
AGACAGTGACGCTTTCGCCTCCGTTGCCTAAACCTATTTGAAGGAGTCGCATAGCAGCCG
AGACAGTGACGCTTTCGCCTCCGTTGCCTAAACCTATTTGAAGNAGTCGCATAGCAGCCG
CAGTAAGGCACAATACCTCGTGTCCGTGTTACCAGACCAAAACAAGACGTCCTCTTCAAT
GTTTAAATGACCCTCTCGTCATAAAACCTTTCTACTATGTGTTCCGCAAGAATCAACAAC
TACAATGGCGCGTCGTGAATAACGCGACGGCTGAGACGAACGGCGCGTGAATGAAGCGCA
Expand Down
2 changes: 1 addition & 1 deletion tests/data/truth_variant_finding/make_truth_vcf.truth.fa
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
>truth
CTGACTGGCCGAATAGGTCAGATATAGGCAACGACATGTGCAGTGCGGCGACCCTTGCAG
AGACAGTGACGCTTTCGCCCCCGTTGCCTAAACCTATTTGAAGGAGTCCTGTAGCAGCCG
AGACAGTGANNGCTTTCGCCCCCGTTGCCTAAACCTATTTGAAGAGTCCTGTAGCAGCCG
CAGTAAGGCACAATACCTCGGTCCGTGTTACCAGACCAATAACAAGACGTCCTCTTCAAT
GTTTAAATGACCCTCTCGTCATAAAACCTTTCTACTATGTGTTCCGCAAGAATCAACAAC
TACAATGGCCCGTCGTGAATAACGCGACGGCTGAGACGAACGGCGCGTGAATGAAGCGCA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"filter_fail": 1,
"heterozygous": 0,
"no_genotype": 0,
"non_acgt": 0,
"other": 0,
"ref_call": 1
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"filter_fail": 2,
"heterozygous": 0,
"no_genotype": 0,
"non_acgt": 0,
"other": 0,
"ref_call": 1
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ ref 5 . A . . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1:Should always get removed bec
ref 6 . T C . PASS . NOTES:VFR_EXCLUDE_REASON Should always get removed because no GT:no_genotype
ref 7 . . A . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1:Should always get removed because REF is .:other
ref 8 . A T . PASS . GT:NOTES:VFR_EXCLUDE_REASON 0/1:Should always get removed because heterozygous:heterozygous
ref 12 . C N . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in alt:non_acgt
ref 13 . ANNN A . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in ref:non_acgt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ ref 8 . A T . PASS . GT:NOTES:VFR_EXCLUDE_REASON 0/1:Should always get removed b
ref 9 . C G . . . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Removal depends on filter_pass option:filter_fail
ref 10 . T G . FILTER_1 . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Removal depends on filter_pass option:filter_fail
ref 11 . G G . FILTER_1;FILTER_2 . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Removal depends on filter_pass option:filter_fail
ref 12 . C N . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in alt:non_acgt
ref 13 . ANNN A . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in ref:non_acgt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ ref 6 . T C . PASS . NOTES:VFR_EXCLUDE_REASON Should always get removed because
ref 7 . . A . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1:Should always get removed because REF is .:other
ref 8 . A T . PASS . GT:NOTES:VFR_EXCLUDE_REASON 0/1:Should always get removed because heterozygous:heterozygous
ref 10 . T G . FILTER_1 . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Removal depends on filter_pass option:filter_fail
ref 12 . C N . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in alt:non_acgt
ref 13 . ANNN A . PASS . GT:NOTES:VFR_EXCLUDE_REASON 1/1:Should always get removed because N in ref:non_acgt
2 changes: 2 additions & 0 deletions tests/data/vcf_evaluate/filter_vcf.in.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ ref 8 . A T . PASS . GT:NOTES 0/1:Should always get removed because heterozygous
ref 9 . C G . . . GT:NOTES 1/1:Removal depends on filter_pass option
ref 10 . T G . FILTER_1 . GT:NOTES 1/1:Removal depends on filter_pass option
ref 11 . G G . FILTER_1;FILTER_2 . GT:NOTES 1/1:Removal depends on filter_pass option
ref 12 . C N . PASS . GT:NOTES 1/1:Should always get removed because N in alt
ref 13 . ANNN A . PASS . GT:NOTES 1/1:Should always get removed because N in ref
17 changes: 10 additions & 7 deletions tests/vcf_evaluate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_filter_vcf():
got_keep = "tmp.filter_vcf.keep.vcf"
got_exclude = "tmp.filter_vcf.exclude.vcf"
subprocess.check_output(f"rm -rf {got_keep} {got_exclude}", shell=True)
ref_seqs = {"ref": "ATGCATGACTGCATTACTCATCATCGAATG"}
ref_seqs = {"ref": "ATGCATGACTGCANNNCTCATCATCGAATG"}
got_counts = vcf_evaluate._filter_vcf(
infile, got_keep, got_exclude, ref_seqs, filter_pass=None, keep_ref_calls=True
)
Expand All @@ -49,6 +49,7 @@ def test_filter_vcf():
"heterozygous": 1,
"no_genotype": 1,
"ref_call": 0,
"non_acgt": 2,
"other": 3,
}
assert got_counts == expect_counts
Expand All @@ -58,8 +59,8 @@ def test_filter_vcf():
expect_exclude = os.path.join(
data_dir, "filter_vcf.expect.no_filter_pass_exclude_ref_calls.exclude.vcf"
)
utils.vcf_records_are_the_same(got_keep, expect_keep)
utils.vcf_records_are_the_same(got_exclude, expect_exclude)
assert utils.vcf_records_are_the_same(got_keep, expect_keep)
assert utils.vcf_records_are_the_same(got_exclude, expect_exclude)
os.unlink(got_keep)
os.unlink(got_exclude)

Expand All @@ -76,15 +77,16 @@ def test_filter_vcf():
"heterozygous": 1,
"no_genotype": 1,
"ref_call": 1,
"non_acgt": 2,
"other": 3,
}
assert got_counts == expect_counts
expect_keep = os.path.join(data_dir, "filter_vcf.expect.with_filtering.1.keep.vcf")
expect_exclude = os.path.join(
data_dir, "filter_vcf.expect.with_filtering.1.exclude.vcf"
)
utils.vcf_records_are_the_same(got_keep, expect_keep)
utils.vcf_records_are_the_same(got_exclude, expect_exclude)
assert utils.vcf_records_are_the_same(got_keep, expect_keep)
assert utils.vcf_records_are_the_same(got_exclude, expect_exclude)
os.unlink(got_keep)
os.unlink(got_exclude)

Expand All @@ -101,15 +103,16 @@ def test_filter_vcf():
"heterozygous": 1,
"no_genotype": 1,
"ref_call": 1,
"non_acgt": 2,
"other": 3,
}
assert got_counts == expect_counts
expect_keep = os.path.join(data_dir, "filter_vcf.expect.with_filtering.2.keep.vcf")
expect_exclude = os.path.join(
data_dir, "filter_vcf.expect.with_filtering.2.exclude.vcf"
)
utils.vcf_records_are_the_same(got_keep, expect_keep)
utils.vcf_records_are_the_same(got_exclude, expect_exclude)
assert utils.vcf_records_are_the_same(got_keep, expect_keep)
assert utils.vcf_records_are_the_same(got_exclude, expect_exclude)
os.unlink(got_keep)
os.unlink(got_exclude)

Expand Down
11 changes: 11 additions & 0 deletions varifier/truth_variant_finding.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ def _bcftools_norm(ref_fasta, vcf_in, vcf_out):
print(vcf_string, end="", file=f)


def _remove_non_acgt_records(vcf_in, vcf_out):
header_lines, vcf_records = vcf_file_read.vcf_file_to_list(vcf_in)
with open(vcf_out, "w") as f:
print(*header_lines, sep="\n", file=f)
for record in vcf_records:
if utils.vcf_record_is_all_acgt(record):
print(record, file=f)


def make_truth_vcf(
ref_fasta,
truth_fasta,
Expand Down Expand Up @@ -159,6 +168,8 @@ def make_truth_vcf(
_truth_using_minimap2_paftools(
ref_fasta, truth_fasta, minimap2_vcf, threads=threads
)
_remove_non_acgt_records(minimap2_vcf, minimap2_vcf)
_remove_non_acgt_records(dnadiff_vcf, dnadiff_vcf)
to_merge = [dnadiff_vcf, minimap2_vcf]
_merge_vcf_files_for_probe_mapping(to_merge, ref_fasta, merged_vcf)
logging.info(f"Made merged VCF file {merged_vcf}")
Expand Down
13 changes: 13 additions & 0 deletions varifier/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
import re

import pyfastaq

from cluster_vcf_records import vcf_file_read

all_acgt_regex = re.compile("^[ACGT]+$", re.IGNORECASE)


def vcf_record_is_all_acgt(record):
if all_acgt_regex.search(record.REF) is None:
return False
for alt in record.ALT:
if all_acgt_regex.search(alt) is None:
return False
return True


def load_mask_bed_file(mask_bed_file):
"""Loads a BED file of ref seq names, and start and end postiions.
Expand Down
3 changes: 3 additions & 0 deletions varifier/vcf_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def _filter_vcf(
"heterozygous": 0,
"no_genotype": 0,
"ref_call": 0,
"non_acgt": 0,
"other": 0,
}
with vcf_file_read.open_vcf_file_for_reading(infile) as f_in, open(
Expand Down Expand Up @@ -80,6 +81,8 @@ def _filter_vcf(
exclude_reason = "filter_fail"
elif len(record.ALT) == 0 or record.ALT == ["."]:
exclude_reason = "other"
elif not utils.vcf_record_is_all_acgt(record):
exclude_reason = "non_acgt"
elif record.FORMAT is None:
exclude_reason = "no_genotype"
elif record.REF in [".", ""]:
Expand Down

0 comments on commit d6cdb75

Please sign in to comment.