Skip to content

Commit

Permalink
Merge pull request #110 from broadinstitute/dp-fasta-handling
Browse files Browse the repository at this point in the history
handle spaces in fasta headers
  • Loading branch information
dpark01 committed Sep 18, 2024
2 parents dd0d3c8 + d5de7d5 commit b15de41
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 1 deletion.
43 changes: 43 additions & 0 deletions test/input/TestToolPicard/messy-headers.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-1
CTGGAGCAAGACAAATGATGCCGGATCAGATCGAGTGATGGTATCACCCCTGGCTGTGAC
ATGGTGGAATAGGAATGGACCAACAACAAGTACAATTCACTATCCAAAGGTATACAAAAC
AAAGGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTGATGAAACG
GAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGC
CATCAATTAGTGTGGAATTGTTTAAAAACGACCTTGTTTCTACT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-2
CGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTCTTCTTGAAAGTTCC
AGCGCAAAATGCCATAAGCACCACATTCCCGTATACTGGAGATCCTCCATACAGCCATGG
ACGGATTAAGAAGGAGGAGTTTGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCT
CAGACGGCAGAAATAGTGAATTTAGCTTGTCCTTCATGAAAAAATGCCTTGTTTCTACT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-3
AAGCAGGTACTGATTCAAAATGGAAGACTTTGTGCGACAATGCTTCAATCCAATGATTGT
CGAGCTTGCGGAAAAAGCAATGAAAGAACATGGGGAAGATCCGAAAATCGAGACAAACAA
GATTAACGATCCCTGGGTTTTGCTTAATGCATCTTGGTTCAACTCCTTCCTCACACATGC
ACTGAAATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTG
TTTCTACT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-4
GCAGTGTAGCTGGATGGCTCCTCGGAAACCCAATGTGCGACGAATTCATCAGAGTGCCGG
AATGGTCTTACATAGTGGAGCGGGCTAACCCAGCTAATGACCTCTGTTACCCAGGGAGCC
TATCTTTATGGATGTGCTCCAATGGGTCGTTACAATGCAGAATTTGCATTTAGATTTATG
AGCTCAGATTGTAGTTAAAAACACCCTTGTTTCT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-5
AAGCAGGGTAGATAATCACTCACTGAGTGACATCCACATCATGGCGTCTCAAGGCACCAA
ATCTGTTGGAAGAATGGTTGGCGGAATCGGGAGATTCTACATACAGATGTGCACTGAGCT
GGACGAAAAGGCAACGAACCCGATCGTGCCTTCCTTTGACATGAACAATGAAGGATCTTA
TTTCTTCGGAGACAATGCAGAGGAGTATGACAATTAAAGAAAAATACCCTTGTTTCTACT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-6
CGGGCAATTCATCTCTTTGCCCTATTAGTGGGTGGGCAATATACAGTAAGGACAACGGTA
TAAGAATTGGATCTAAGGGGGATGTGTTTGTTATAAGAGAACCATTCATCTCATGCTCCC
GTGGTGTAAATAGTGACACTGTGGGTTGGTCTTGGCCAGACGGTGCTGAGTTGCCATTCA
CCATTGACAAGTAGTTTGTTCAAAAAACTCCTTGTTTCTACT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-7
AAGCAGGTAGATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTA
TCGTCCCGTCGGGCCCCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCAG
GGAAGAACACCGATCTTGAGGCTCTCATGGAATGGCTAAAGACAAGACCAATCCTGTCAC
CTCTGACCAAGGGGATTTTGGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCGAGGAC
CT
>Influenza A virus A/bovine_milk/USA/Broad_MC_023/2024(H5N1))-8
AGCDGGGTGACAAAAACATAATGGATTCCAACACTGTGTTAAGCTTTCAGGTAGACTGCT
TTCTTTGGCATGTCCGCAAACGATTTGCAGACCAAGAACTGGGTGATGCCCCATTCCTTG
TCGAGACGGCCACTCGTGCTGGGAAGCAGATAGTGGAGAGGATTCTGGAGGAAGAATCCG
TCGTTTCAGCTTATTTAATAATAAAAAACACCCTTGTTTCTACT
17 changes: 17 additions & 0 deletions test/unit/test_tools_picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,23 @@ def test_fasta_index(self):
actual_first3 = [re.sub(r'VN:1\.[4-6]','VN:1.4',x.strip()).split('\t')[:3] for x in inf.readlines()]
self.assertEqual(actual_first3, expected_first3)

def test_messy_fasta_index(self):
    """Index a FASTA whose headers contain spaces and check the @SQ names.

    Copies ``messy-headers.fasta`` to a temporary ``.fasta`` file, runs
    ``CreateSequenceDictionaryTool`` on it, and reads the ``.dict`` file
    that sits next to the copy. Each ``@SQ`` record must have an ``SN``
    longer than 50 characters, and the records must yield eight distinct
    sequence names.
    """
    orig_ref = os.path.join(util.file.get_test_input_path(self), 'messy-headers.fasta')
    picard_index = tools.picard.CreateSequenceDictionaryTool()
    with util.file.tempfname('.fasta') as inRef:
        shutil.copyfile(orig_ref, inRef)
        picard_index.execute(inRef, overwrite=True)

        # Swap the extension rather than slicing a fixed number of
        # characters, so the path does not depend on len('.fasta').
        dict_path = os.path.splitext(inRef)[0] + '.dict'

        seq_names = []
        with open(dict_path, 'rt') as inf:
            for line in inf:
                if not line.startswith('@SQ'):
                    continue
                # Fields after the record type are TAG:VALUE pairs; split
                # only on the first ':' so a value may itself contain ':'.
                tags = dict(
                    field.split(':', maxsplit=1)
                    for field in line.strip().split('\t')[1:]
                )
                seq_name = tags['SN']
                # old versions of code cut this off at "Influenza"
                self.assertGreater(
                    len(seq_name), 50,
                    'sequence name looks truncated: %r' % seq_name)
                seq_names.append(seq_name)

        # require that all sequence names are unique
        self.assertEqual(
            len(set(seq_names)), len(seq_names),
            'duplicate sequence names in dictionary: %r' % seq_names)
        # the input fixture contains eight records
        self.assertEqual(len(seq_names), 8)

def test_sam_downsample(self):
desired_count = 100
tolerance = 0.02
Expand Down
2 changes: 1 addition & 1 deletion util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
fasta_out = FastaIO.FastaWriter(handle, wrap=None)
fasta_out.write_header()
for record in SeqIO.parse(fasta_in, "fasta"):
record.id=sanitize_id_for_sam_rname(record.id)
record.id=sanitize_id_for_sam_rname(record.description)
fasta_out.write_record(record)
print("out_filepath",out_filepath)
print("os.path.dirname(out_filepath)",os.path.dirname(out_filepath))
Expand Down

0 comments on commit b15de41

Please sign in to comment.