-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
356 lines (275 loc) · 12.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
import requests
import json
import os
import sys
import time
import subprocess
import pandas as pd
# rdkit is an optional dependency: structure enrichment (InChIKeys/formulas)
# silently degrades to empty values when it is missing.
try:
    from rdkit import Chem
    from rdkit.Chem.rdMolDescriptors import CalcMolFormula
except ImportError:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not
    # swallowed while importing.
    pass
# Master table of GNPS library names; the "library" column drives which
# libraries get downloaded/exported.
# NOTE(review): read with the default comma separator despite the .tsv
# extension -- fine for a single-column file, but confirm the delimiter.
library_df = pd.read_csv("library_names.tsv")
LIBRARY_NAMES = list(library_df["library"])
def get_inchikey(smiles, inchi):
    """Compute InChIKeys from a SMILES string and/or an InChI string.

    Inputs of length <= 5 are treated as absent. Returns a 2-tuple:
    - both conversions succeed: (key_from_smiles, key_from_inchi)
    - exactly one succeeds: (that_key, "") -- note the surviving key is
      always placed in the FIRST slot, regardless of which input produced it
    - neither succeeds: ("", "")

    Requires rdkit; if rdkit is unavailable (NameError on `Chem`) or a
    conversion fails, that key is treated as empty.
    """
    inchikey_from_smiles = ""
    inchikey_from_inchi = ""
    try:
        if len(smiles) > 5:
            inchikey_from_smiles = str(Chem.MolToInchiKey(Chem.MolFromSmiles(smiles)))
    except Exception:
        # Narrowed from bare `except:`; still covers missing rdkit and
        # unparseable SMILES.
        inchikey_from_smiles = ""
    try:
        if len(inchi) > 5:
            inchikey_from_inchi = str(Chem.InchiToInchiKey(inchi))
    except Exception:
        inchikey_from_inchi = ""
    # A real InChIKey is 27 chars; ">2" just distinguishes it from empty.
    if len(inchikey_from_smiles) > 2 and len(inchikey_from_inchi) > 2:
        return inchikey_from_smiles, inchikey_from_inchi
    if len(inchikey_from_smiles) > 2:
        return inchikey_from_smiles, ""
    if len(inchikey_from_inchi) > 2:
        return inchikey_from_inchi, ""
    return "", ""
def get_formula(smiles, inchi):
    """Compute molecular formulas from a SMILES string and/or an InChI string.

    Inputs of length <= 5 are treated as absent. Returns a 2-tuple with the
    same slot convention as get_inchikey: both present ->
    (formula_from_smiles, formula_from_inchi); only one present -> that
    formula in the FIRST slot and "" second; none -> ("", "").

    Requires rdkit; if rdkit is unavailable or a conversion fails, that
    formula is treated as empty.
    """
    formula_from_smiles = ""
    formula_from_inchi = ""
    try:
        if len(smiles) > 5:
            formula_from_smiles = str(CalcMolFormula(Chem.MolFromSmiles(smiles)))
    except Exception:
        # Narrowed from bare `except:`; still covers missing rdkit and
        # unparseable structures.
        formula_from_smiles = ""
    try:
        if len(inchi) > 5:
            formula_from_inchi = str(CalcMolFormula(Chem.MolFromInchi(inchi)))
    except Exception:
        formula_from_inchi = ""
    if len(formula_from_smiles) > 2 and len(formula_from_inchi) > 2:
        return formula_from_smiles, formula_from_inchi
    if len(formula_from_smiles) > 2:
        return formula_from_smiles, ""
    if len(formula_from_inchi) > 2:
        return formula_from_inchi, ""
    return "", ""
# Loads all GNPS libraries from GNPS servers and returns as python objects
def load_GNPS(library_names=LIBRARY_NAMES):
    """Download spectrum metadata for each named GNPS library.

    Parameters:
        library_names: iterable of GNPS library names; defaults to every
            library listed in library_names.tsv.

    Returns:
        (all_GNPS_list, library_df): the concatenated "spectra" entries of
        every library that could be fetched, plus the module-level library
        table. NOTE: always the FULL table, even when a subset of names is
        requested.

    Libraries that fail to download or parse are reported and skipped.
    """
    all_GNPS_list = []
    for library_name in library_names:
        print(library_name)
        # Build the URL outside the try so it is always bound in the handler.
        url = "https://gnps.ucsd.edu/ProteoSAFe/LibraryServlet?library=%s" % (library_name)
        try:
            all_GNPS_list += requests.get(url).json()["spectra"]
        except KeyboardInterrupt:
            raise
        except Exception:
            # Narrowed from bare `except:` so Ctrl+C aborts instead of being
            # reported as a library error.
            print("Error with", library_name, url)
            continue
    return all_GNPS_list, library_df
# Enriching GNPS Library data with structures and formulas
def gnps_format_libraries(all_GNPS_list):
    """Annotate every GNPS spectrum dict (in place) with InChIKeys, molecular
    formulas, and its library-spectrum URL, returning the annotated list."""
    from tqdm import tqdm
    annotated = []
    for entry in tqdm(all_GNPS_list):
        smiles = entry["Smiles"]
        inchi = entry["INCHI"]
        has_structure = len(smiles) >= 5 or len(inchi) >= 5
        if not has_structure:
            key_smiles = key_inchi = ""
            formula_smiles = formula_inchi = ""
        else:
            # Some entries store the InChI without its standard prefix.
            if "InChI=" not in inchi and len(inchi) > 10:
                inchi = "InChI=" + inchi
            # TODO: We gotta fix this and also just get rid of rdkit from the dependencies
            key_smiles, key_inchi = get_inchikey(smiles, inchi)
            formula_smiles, formula_inchi = get_formula(smiles, inchi)
        entry["InChIKey_smiles"] = key_smiles
        entry["InChIKey_inchi"] = key_inchi
        entry["Formula_smiles"] = formula_smiles
        entry["Formula_inchi"] = formula_inchi
        entry["url"] = "https://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?SpectrumID=%s" % entry["spectrum_id"]
        annotated.append(entry)
    return annotated
# Reformatting output for NPAtlas
def gnps_filter_for_key(formatted_spectra_list, filterKeysOut=True):
    """Reformat annotated GNPS spectra for NPAtlas.

    When filterKeysOut is True, entries whose smiles- and inchi-derived
    InChIKeys are both shorter than 5 characters are dropped. Also reports
    how many kept entries have equal-length keys whose first (skeleton)
    block disagrees.
    """
    kept = []
    for entry in formatted_spectra_list:
        no_keys = len(entry["InChIKey_inchi"]) < 5 and len(entry["InChIKey_smiles"]) < 5
        if no_keys and filterKeysOut:
            continue
        kept.append(entry)

    # Finding Discordant Entries
    def _is_discordant(entry):
        key_a = entry["InChIKey_inchi"]
        key_b = entry["InChIKey_smiles"]
        return len(key_a) == len(key_b) and key_a.split("-")[0] != key_b.split("-")[0]

    discordant_list = [entry for entry in kept if _is_discordant(entry)]
    print("Discordant List", len(discordant_list))

    # Formatting: pick the longer of the two keys as the compound key.
    output_list = []
    for entry in kept:
        best_key = entry["InChIKey_inchi"]
        if len(entry["InChIKey_smiles"]) > len(best_key):
            best_key = entry["InChIKey_smiles"]
        output_list.append({
            "GNPSID": entry["spectrum_id"],
            "COMPOUND_NAME": entry["Compound_Name"],
            "COMPOUND_INCHIKEY": best_key,
            "COMPOUND_INCHI": entry["INCHI"],
            "COMPOUND_SMILES": entry["Smiles"],
            "LIBRARY_QUALITY": entry["Library_Class"],
        })
    print("GNPS Library Entries with INCHIKEY", len(output_list))
    return output_list
# Getting all the spectrum peaks for the library spectrum
def get_gnps_peaks(all_GNPS_list):
    """Fetch the peak list and annotation history for every spectrum from the
    internal structure-proxy service. Input dicts are deep-copied rather than
    mutated; spectra whose fetch or parse fails are silently skipped."""
    import copy
    from tqdm import tqdm
    enriched = []
    for spectrum in tqdm(all_GNPS_list):
        candidate = copy.deepcopy(spectrum)
        try:
            # We first try to get it locally
            peaks_url = "http://externalstructureproxy-web:5000/gnpsspectrum?SpectrumID={}".format(spectrum["spectrum_id"])
            payload = requests.get(peaks_url).json()
            candidate["peaks_json"] = payload["spectruminfo"]["peaks_json"]
            candidate["annotation_history"] = payload["annotations"]
        except KeyboardInterrupt:
            raise
        except:
            continue
        enriched.append(candidate)
    return enriched
# Utils for outputting GNPS libraries and getting all the peaks, returns the list
def output_all_gnps_individual_libraries(all_json_list, output_folder):
    """For every known GNPS library, select its member spectra, fetch their
    peaks, write per-library MGF/MSP/JSON files into output_folder, and
    return the combined peak-enriched spectrum list."""
    combined = []
    for library_name in LIBRARY_NAMES:
        members = [s for s in all_json_list if s["library_membership"] == library_name]
        members_with_peaks = get_gnps_peaks(members)
        _output_library_files(members_with_peaks, output_folder, library_name)
        combined.extend(members_with_peaks)
    return combined
def _output_library_files(library_spectra_list_with_peaks, output_folder, library_name):
    """Write one library's spectra as <library_name>.mgf, .msp, and .json in
    output_folder. Does nothing for an empty spectrum list. MGF/MSP output is
    ASCII-encoded with non-ASCII characters dropped."""
    if not library_spectra_list_with_peaks:
        return
    base_path = os.path.join(output_folder, library_name)
    mgf_bytes = get_full_mgf_string(library_spectra_list_with_peaks).encode("ascii", "ignore")
    with open(base_path + ".mgf", "wb") as handle:
        handle.write(mgf_bytes)
    msp_bytes = get_full_msp_string(library_spectra_list_with_peaks).encode("ascii", "ignore")
    with open(base_path + ".msp", "wb") as handle:
        handle.write(msp_bytes)
    with open(base_path + ".json", "w") as handle:
        handle.write(json.dumps(library_spectra_list_with_peaks, indent=4))
def get_full_mgf_string(all_json_list):
    """Concatenate the MGF rendering of every spectrum, newline-separated.
    Spectra that render to an empty string still contribute a separator,
    matching the historical output."""
    return "\n".join(json_object_to_string(spectrum) for spectrum in all_json_list)
def get_full_msp_string(all_json_list):
    """Concatenate the MSP rendering of every spectrum, newline-separated.

    Best-effort: a spectrum that raises during rendering is skipped rather
    than aborting the whole export, and spectra whose renderer returns None
    (no positive-intensity peaks) are omitted. Ctrl+C still propagates.
    """
    msp_string_list = []
    for spectrum in all_json_list:
        try:
            msp_string = json_to_msp(spectrum)
        except KeyboardInterrupt:
            raise
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/GeneratorExit are
            # no longer swallowed; malformed spectra are still skipped.
            continue
        if msp_string is not None:
            msp_string_list.append(msp_string)
    return "\n".join(msp_string_list)
def json_object_to_string(json_spectrum):
    """Render one GNPS library spectrum dict as an MGF entry string.

    Returns "" for Library_Class above 4 (challenge/unknown classes).
    Peaks with non-positive intensity are dropped, and peak lists whose raw
    JSON text is 1,000,000 characters or longer are omitted entirely.
    """
    if int(json_spectrum["Library_Class"]) > 4:
        return ""
    lines = [
        "BEGIN IONS",
        "PEPMASS=" + json_spectrum["Precursor_MZ"],
        "CHARGE=" + json_spectrum["Charge"],
        "MSLEVEL=2",
        "SOURCE_INSTRUMENT=" + json_spectrum["Ion_Source"] + "-" + json_spectrum["Instrument"],
        "FILENAME=" + json_spectrum["source_file"],
        "SEQ=*..*",
        "IONMODE=" + json_spectrum["Ion_Mode"],
        "ORGANISM=" + json_spectrum["library_membership"],
        "NAME=" + json_spectrum["Compound_Name"] + " " + json_spectrum["Adduct"],
        "PI=" + json_spectrum["PI"],
        "DATACOLLECTOR=" + json_spectrum["Data_Collector"],
        "SMILES=" + json_spectrum["Smiles"],
        "INCHI=" + json_spectrum["INCHI"],
        "INCHIAUX=" + json_spectrum["INCHI_AUX"],
        "PUBMED=" + json_spectrum["Pubmed_ID"],
        "SUBMITUSER=" + json_spectrum["submit_user"],
        "LIBRARYQUALITY=" + json_spectrum["Library_Class"],
        "SPECTRUMID=" + json_spectrum["SpectrumID"],
        "SCANS=" + json_spectrum["scan"],
    ]
    peaks_json = json_spectrum["peaks_json"]
    # Very large peak lists are skipped to keep the file manageable.
    if len(peaks_json) < 1000000:
        for peak in json.loads(peaks_json):
            if peak[1] > 0:
                lines.append(str(peak[0]) + "\t" + str(peak[1]))
    # The two trailing empty items reproduce the historical "END IONS\n\n".
    lines.extend(["END IONS", "", ""])
    return "\n".join(lines)
#output libraries into MSDial usable msp
def json_to_msp(json_spectrum):
    """Render one GNPS library spectrum dict as an MSDial-usable MSP entry.

    Returns:
        "" for Library_Class above 3 (challenge/unknown classes), so callers
        that filter only on None still skip nothing extra;
        None when no positive-intensity peaks remain;
        otherwise the MSP entry string. Peak lists whose raw JSON is
        1,000,000+ characters are omitted (the entry then has no peak block).
    """
    if int(json_spectrum["Library_Class"]) > 3:
        return ""
    # Prefer the inchi-derived key; fall back to the smiles-derived one.
    inchi_key = json_spectrum["InChIKey_inchi"]
    if len(inchi_key) < 5 and len(json_spectrum["InChIKey_smiles"]) > 5:
        inchi_key = json_spectrum["InChIKey_smiles"]
    # Prefer the smiles-derived formula; fall back to the inchi-derived one.
    formula = json_spectrum["Formula_smiles"]
    if len(formula) < 5 and len(json_spectrum["Formula_inchi"]) > 5:
        formula = json_spectrum["Formula_inchi"]
    msp_string = "NAME: " + json_spectrum["Compound_Name"] + "\n"
    msp_string += "PRECURSORMZ: " + json_spectrum["Precursor_MZ"] + "\n"
    msp_string += "PRECURSORTYPE: " + json_spectrum["Adduct"] + "\n"
    msp_string += "FORMULA: {}\n".format(formula)
    msp_string += "Ontology: \n"
    msp_string += "INCHIKEY: {}\n".format(inchi_key)
    msp_string += "INCHI: " + json_spectrum["INCHI"] + "\n"
    msp_string += "SMILES: " + json_spectrum["Smiles"] + "\n"
    # BUG FIX: this line previously lacked its trailing newline, fusing the
    # RETENTIONTIME and CCS headers into one line ("RETENTIONTIME: CCS: ").
    msp_string += "RETENTIONTIME: \n"
    msp_string += "CCS: \n"
    msp_string += "IONMODE: " + json_spectrum["Ion_Mode"] + "\n"
    msp_string += "INSTRUMENTTYPE: " + json_spectrum["Ion_Source"] + "-" + json_spectrum["Instrument"] + "\n"
    msp_string += "INSTRUMENT: " + json_spectrum["Instrument"] + "\n"
    msp_string += "COLLISIONENERGY: \n"
    msp_string += "Comment: DB#=" + json_spectrum["SpectrumID"] + "; origin=" + "GNPS\n"
    peaks_json = json_spectrum["peaks_json"]
    if len(peaks_json) < 1000000:
        peaks_list = json.loads(peaks_json)
        # Keep only peaks with positive intensity.
        peaks_list = [peak for peak in peaks_list if peak[1] > 0]
        if len(peaks_list) == 0:
            return None
        msp_string += "Num Peaks: " + str(len(peaks_list)) + "\n"
        for peak in peaks_list:
            msp_string += str(peak[0]) + "\t" + str(peak[1]) + "\n"
    msp_string += "\n"
    return msp_string
def run_cleaning_pipeline(gnps_json_file, output_directory):
    """Run the GNPS ML-processing Nextflow workflow on a GNPS JSON dump.

    Parameters:
        gnps_json_file: path to the GNPS library JSON file to process.
        output_directory: directory that receives the workflow output and the
            conda environments it builds; created if missing.

    Returns:
        The subprocess.CompletedProcess from the nextflow invocation, with
        stdout/stderr captured as text; the whole result is also printed.
    """
    path_to_script = "/app/pipelines//gnps_ml_processing_workflow/GNPS_ML_Processing/nf_workflow.nf"
    # makedirs(exist_ok=True) already tolerates an existing directory, so the
    # previous isdir() pre-check was redundant and has been dropped.
    os.makedirs(output_directory, exist_ok=True)
    # Use subprocess to run a nextflow script to generate all everything we need
    result = subprocess.run(["/nextflow", "run", path_to_script,
                                "--GNPS_json_path", gnps_json_file,
                                "--output_dir", output_directory,
                                "--conda_path", os.path.join(output_directory, "gnps2_ml_processing_env"),
                                "--matchms_conda_path", os.path.join(output_directory, "matchms_env")],
                            capture_output=True,
                            text=True)
    print(result, flush=True)
    return result