-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodonAlign.py
executable file
·81 lines (73 loc) · 3.05 KB
/
CodonAlign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
import argparse
import sys
import warnings
warnings.simplefilter("ignore")
# import re
try:
from Bio import AlignIO
from Bio import SeqIO
from Bio import Seq
from Bio import Alphabet
from Bio import codonalign
except ImportError:
print("Biopython is required. Try with: pip install biopython")
sys.exit()
else:
# args
parser = argparse.ArgumentParser(prog="CodonAlign.py",
formatter_class=argparse.RawTextHelpFormatter,
description="""
Converts an amino acid alignment to nucleotides respecting codon positions\n""",
epilog="""
Examples:
python CodonAlign.py -p myAlignment.aa.fas -c myCDSseqs.nt.fas -o CDS.aln.fas
python CodonAlign.py -p myAlignment.aa.fas -c myCDSseqs.nt.fas -s
Note that amino acid and nucleotide sequences should correspondingly have the same label.\n""")
parser.add_argument(
'--prot', '-p', type=str, default="", required=True,
help='an amino acid alignment in FASTA format.')
parser.add_argument(
'--cds', '-c', type=str, default="", required=True,
help='''un aligned CDS sequences FASTA format. These must be in-frame and correspond
to the untranslated version of the amino acid sequence.''')
parser.add_argument(
'--outfile', '-o', default="aln.cds.fasta", nargs="?", type=str,
help='name for output file (default: %(default)s)')
parser.add_argument(
'--stdout', '-s', action="store_true", default=False,
help='if sequences should be printed to screen.')
args = parser.parse_args()
# code
# read data
aln = AlignIO.read(args.prot, format="fasta", alphabet=Alphabet.IUPAC.protein)
seq = list(SeqIO.parse(args.cds, format="fasta", alphabet=Alphabet.IUPAC.ambiguous_dna))
# change missing data for gap
# change lowercase for uppercase
# for s in seq:
# s.seq = Seq.Seq(re.sub("N","-",str(s.seq)).upper(), alphabet=Alphabet.IUPAC.ambiguous_dna)
# get names
names_aln = [ s.name for s in aln ]
names_seq = [ s.name for s in seq ]
# check number of seqs
if len(names_aln) != len(names_seq):
sys.exit("Number of sequences in both files does not match.")
# check names
if all([ name in names_aln for name in names_seq]) and all([ name in names_seq for name in names_aln]):
# build alignment
try:
codon_aln = codonalign.build(aln, seq)
except:
print("Could not generate alignment. Sequences probably include ambiguous or missing data.", file=sys.stderr)
else:
# delete the <unknown description> label
for i in range(len(codon_aln)): codon_aln[i].description = ""
if args.stdout:
# print to screen
print(format(codon_aln, "fasta"))
else:
# print to file
AlignIO.write(codon_aln, args.outfile, "fasta")
print(f"{len(names_aln)} aligned CDS sequences saved to {args.outfile}.", file=sys.stderr)
else:
sys.exit("Amino acid and nucleotide sequences do not have the same labels.")