""" Functions for the command-line entrypoints defined in setup.py
for the CAFA sprot targets module.
Three entrypoints are defined:
    CAFA_experimental_growth
    CAFA_print_annotation_counts
    CAFA_generate_no_exp_files
"""
import os
from pathlib import Path

import yaml
import click

from experimental_growth import (
    count_annotated_proteins_from_files,
    print_annotation_counts_table,
)
from filter_sprot_species import (
    species_filter,
    filter_sprot_by_taxonomies,
    generate_protein_ids_mapping,
)
from make_blast_predictor import generate_protein_fasta
from utils import TAXONOMY_LOOKUP


@click.command()
@click.argument("config_handle", type=click.File("r"))
def cli_generate_protein_fasta(config_handle):
    """ Generates fasta-formatted file containing protein sequences for the
    proteins IDs in the provided protein_ids_filepath file.

    CONFIG_HANDLE is a yaml file supplying the protein_ids_filepath,
    swissprot_filepath and output_filepath keys.
    """
    # NOTE: the config file is operator-supplied; yaml.load with FullLoader is
    # acceptable here, but never use it on untrusted input.
    conf = yaml.load(config_handle, Loader=yaml.FullLoader)
    protein_ids_filepath = conf.get("protein_ids_filepath")
    swissprot_filepath = conf.get("swissprot_filepath")
    output_filepath = conf.get("output_filepath")
    with open(output_filepath, "w") as output_handle:
        generate_protein_fasta(protein_ids_filepath, swissprot_filepath, output_handle)


@click.command()
@click.argument("config_handle", type=click.File("r"))
def cli_experimental_growth(config_handle):
    """Collects counts of annotations for the given taxonomy per ontology namespace
    filtered by the provided evidence codes. The counts are printed to stdout
    as a tab-separated table, one row per (taxon, input file) pair.
    """
    conf = yaml.load(config_handle, Loader=yaml.FullLoader)
    namespaces = conf.get("ontologies")
    evidence_codes = conf.get("evidence_codes")
    swissprot_file_list = conf.get("input_files")
    taxons = conf.get("taxonomies")
    if taxons is None:
        # Fall back to the singular "taxonomy" key for older config files:
        taxon = conf.get("taxonomy")
        if taxon is None:
            raise Exception("either taxon or taxonomies is required")
        taxons = (taxon,)

    growth_counts = count_annotated_proteins_from_files(
        swissprot_file_list,
        taxons,
        allowed_namespaces=namespaces,
        allowed_evidence_codes=evidence_codes,
    )
    keys = ("filename", "taxon_id", "taxon_name") + tuple(namespaces)
    print("\t".join(keys))

    # Process the results:
    # 1. Flatten the nested count dicts into
    #    [filename, taxon, namespace, count] rows
    # 2. Regroup those rows in configuration order below
    _unsorted = []
    for count_record in growth_counts:
        sprot_filename = count_record.get("filename")
        for key, counts in count_record.get("counts").items():
            taxon, namespace = key
            _unsorted.append([sprot_filename, taxon, namespace, counts])

    # Loop through the configuration taxons, filelist and ontologies in order
    # to maintain the original order of things:
    for taxon in taxons:
        for sprot_filename in swissprot_file_list:
            counts = [
                row
                for row in _unsorted
                if row[0] == sprot_filename and row[1] == str(taxon)
            ]
            # create a new list merging the individual namespace counts into a
            # single row
            combined_row = [sprot_filename, str(taxon), TAXONOMY_LOOKUP.get(str(taxon))]
            for ontology in namespaces:
                # row[-1] is the protein count, row[-2] is the ontology namespace
                combined_row += [str(row[-1]) for row in counts if row[-2] == ontology]
            print("\t".join(combined_row))
@click.command()
@click.argument("config_handle", type=click.File("r"))
def cli_print_annotation_counts(config_handle):
    """Collects counts of annotations for the given taxonomy per ontology namespace
    filtered by the provided evidence codes. The counts are printed to stdout
    via print_annotation_counts_table.
    """
    conf = yaml.load(config_handle, Loader=yaml.FullLoader)
    # Accept the plural "taxonomies" key, the singular "taxonomy" key used by
    # the sibling commands, or the legacy "taxon" key. Raise (like the
    # siblings) instead of silently proceeding with a (None,) taxon tuple.
    taxons = conf.get("taxonomies")
    if taxons is None:
        taxon = conf.get("taxonomy", conf.get("taxon"))
        if taxon is None:
            raise Exception("either taxon or taxonomies is required")
        taxons = (taxon,)
    namespaces = conf.get("ontologies")
    evidence_codes = conf.get("evidence_codes")
    swissprot_file_list = conf.get("input_files")
    data = count_annotated_proteins_from_files(
        swissprot_file_list,
        taxons,
        allowed_namespaces=namespaces,
        allowed_evidence_codes=evidence_codes,
    )
    print_annotation_counts_table(data, taxons)
@click.command()
@click.argument("config_handle", type=click.File("r"))
@click.option(
    "-q",
    "--quiet",
    "quiet",
    is_flag=True,
    default=False,
    help="suppress stdout progress messages",
)
def cli_generate_no_exp_files(config_handle, quiet=False):
    """Parses a swissprot file and extracts protein data for the given taxonomy
    where there is little/no experimental annotation for the relevant GO namespaces
    and evidence codes. That protein data is written to a series of fasta files.
    CONFIG_HANDLE is the filepath for a yaml file containing various parameters. See
    filter_sprot_species_example.yml in the git repo for more info.
    """

    def _print(message: str):
        """ simple wrapper around print() obeying the 'quiet' flag """
        if not quiet:
            print(message)

    conf = yaml.load(config_handle, Loader=yaml.FullLoader)
    taxonomies = conf.get("taxonomies")
    if taxonomies is None:
        # Fall back to the singular "taxonomy" key for older config files:
        taxon = conf.get("taxonomy")
        if taxon is None:
            raise Exception("either taxon or taxonomies is required")
        taxonomies = (taxon,)
    namespaces = conf.get("ontologies")
    allowed_evidence_codes = conf.get("allowed_evidence_codes")
    sprot_file = conf.get("sprot_file")
    output_directory = conf.get("output_directory")

    _print(f"Parsing {sprot_file} for taxonomies {taxonomies}")
    output_dir_as_path = Path(output_directory)
    if not output_dir_as_path.is_dir():
        _print(f"\tMaking output directory {output_directory}")
        os.makedirs(output_dir_as_path, exist_ok=True)

    _print(f"Opening {sprot_file}")
    with open(sprot_file, "r") as sprot_handle:
        # NOTE(review): an earlier revision called
        # species_filter(sprot_handle, taxonomies=taxonomies) here, but that
        # would consume sprot_handle before filter_sprot_by_taxonomies()
        # reads the same handle below; the call is intentionally omitted.
        _print("Filtering by GO Namespace and Evidence Code")
        if namespaces:
            _print(f"\tUsing GO namespaces: {namespaces}")
        if allowed_evidence_codes:
            _print(f"\tUsing ALLOWED evidence codes: {allowed_evidence_codes}")
        _print(f"Writing results to {output_directory}")
        filter_sprot_by_taxonomies(
            sprot_handle,
            output_dir=output_directory,
            taxonomies=taxonomies,
            namespaces=namespaces,
            allowed_evidence_codes=allowed_evidence_codes,
        )
    _print(f"Generating Swissprot ID => CAFA ID map file(s) for {output_directory}")
    generate_protein_ids_mapping(taxonomies, output_directory)
    _print("")