-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcompound_gene_relationships.Rmd
314 lines (241 loc) · 10.2 KB
/
compound_gene_relationships.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
---
title: "Identifying compound & gene relationships"
author: "Fang Liu"
date: "5/10/2022"
output: html_document
---
> This file identify compound-gene target associations for the 7,170 compounds using publicly available data from Pharos and Drug Repurposing Hub (note: there were 8971 compounds in the original dataset; however, after data cleaning and keeping compounds with complete bioassay data, only 7,170 compounds are left). Of these 7,170 compounds, *7,030* has a valid SMILES identifier (i.e., 6503 from parent SMILES list `tox21_10k_cas_smiles_without_salts.txt` and 527 from the aggregated tox21 dataset `tox21_aggregated.txt`).
***
Load required libraries
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(readxl)
```
# Salt-stripped SMILES for original tox21 data
## First, use Ruili's parent smiles list
```{r}
#load original data file: 7,170 compounds from clustering
#filtered_tox21 = read.delim("./data/clusters.txt") #cas, cluster
filtered_tox21 = read.csv("results/cluster_results.csv")
#DEBUG ONLY: change the clusters.txt file if clustering result changes; keeping everything else the same
#z = read.delim("./results/cluster_results.txt") #cas, cluster
tox21_without_salts = read.delim("./data/tox21_10k_cas_smiles_without_salts.txt") %>%
janitor::clean_names() %>%
rename(cas = cas_rn,
smiles = structure_smiles_2d_qsar)
## Combine data using the cas as the "key" using smiles w/o salts data file
data_combined =
left_join(filtered_tox21, tox21_without_salts, by = "cas") %>%
distinct() #7170 compounds = 6503 with smiles + 667 without smiles
#THIS IS THE FIRST HALF
not_missing = data_combined %>% remove_missing() #6503 compounds
colnames(not_missing) # "cas" "cluster" "smiles"
#Find which compounds/drugs are actually missing the "SMILES"
missing = data_combined %>% filter(is.na(smiles)) %>%
select(-smiles) #667 compounds
```
## Second, use the Tox21 aggregated data to see if we can find more smiles for the compounds without smiles above
```{r}
#first, clean the aggregated data
aggregated_tox21_unique = read_delim("./data/tox21_aggregated.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>%
janitor::clean_names() %>%
select(cas, smiles) %>%
unique()
colnames(aggregated_tox21_unique) # "cas" "smiles"
#try find the smiles for the 667 "missing" compounds above (667 = 140 missing + 527 not missing)
data_wo_smiles =
left_join(missing, aggregated_tox21_unique, by = "cas") %>%
distinct() %>%
remove_missing() %>%
mutate(smiles_long = map_chr(str_split(smiles, "\\."), ~ .[which.max(nchar(.))])) %>%
select(-smiles) %>%
rename(smiles = smiles_long)
#remove_missing() removes all non-complete rows
```
## Last, combine the results to get a complete list of salt-stripped SMILES for the Tox21 compounds
```{r combined_Tox21_smiles}
data_final = bind_rows(not_missing, data_wo_smiles)
length(unique(data_final$cas)) # 7030 left = 7170 - 140 no smiles
#create output file (cas, cluster, smiles) --> used in Python
write_csv(data_final, "./results/cluster_and_smiles.csv")
```
## DEBUG ONLY: check the characteristics of the 140 compounds with missing smiles
```{r, eval = FALSE}
data_no_smiles =
left_join(missing, aggregated_tox21_unique, by = "cas") %>%
distinct() %>%
filter(is.na(smiles)) %>%
filter(str_detect(cas, "NOCAS"))
# 140 cmps w/o smiles = 118 "NOCAS" + 22 "oils?"
```
# Salt-stripped SMILES for the Pharos dataset
```{r}
#load in the pharos dataset
pt1 = read.csv("./data/pharos_all_target_ligands-p1.csv")
pt2 = read.csv("./data/pharos_all_target_ligands-p2.csv")
pharos_combined = bind_rows(pt1, pt2)
# data cleaning for pharos data
pharos_final = pharos_combined %>%
janitor::clean_names() %>%
rename(chembl = "ligand_ch_embl_id",
smiles = "ligand_smiles") %>%
mutate(smiles_short = map_chr(str_split(smiles, "\\."), ~ .[which.max(nchar(.))])) %>%
select(smiles_short, symbol) %>%
rename(smiles = smiles_short)
colnames(pharos_final) #"smiles" "symbol"
#save to .csv file
write.csv(pharos_final, "./results/pharos_symbol.csv")
#NOTE: This file contains salt-stripped smiles and associated gene targets for all compounds pulled from NIH's Pharos database. This file is also the input for the Rdkit python program (recall: Rdkit is used to find the corresponding inchikeys for the smiles).
```
***
# Find the InChIKey for each compound
## Load RDKit results for original Tox21 compounds
```{r}
cas_inchikey = read.csv("./data/rdkit/smi_to_inchikey.csv") %>%
mutate(short_key = str_sub(inchikey, 1, 14)) #7030 compounds
#cas_inchikey %>% filter(is.na(inchikey)) #13 cmps without inchikey
#7030 = 13 w/o inchikey + 7017 w/inchikey
inchikey = cas_inchikey %>%
select(cas, cluster, inchikey, short_key) %>%
distinct() %>%
drop_na() #7017 compounds
colnames(inchikey)
#"cas" "cluster" "inchikey" "short_key"
```
## Load RDKit result for Pharos
```{r}
pharos_inchikey = read.csv("./data/rdkit/pharos_to_inchikey.csv") %>%
mutate(short_key = str_sub(inchikey, 1, 14)) %>%
unique()
colnames(pharos_inchikey)
# "smiles" "symbol" "inchikey" "short_key"
```
## Load results for drug repurposing hub
```{r}
# data from Broad institute (this is data from Chunxu)
# note, because the broad institute's data already have the inchikey variables, we can skip the Rdkit step
broad_reformatted <- read_delim("./data/broad_reformatted.txt",
"\t", escape_double = FALSE, trim_ws = TRUE) %>%
janitor::clean_names() %>%
mutate(short_key = str_sub(in_key, 1, 14))
```
## Join tox21 and pharos dataset
```{r}
#left_join means only keep if it is in the inchikey dataset!
data_joined = left_join(inchikey, pharos_inchikey, by = "short_key")
colnames(data_joined)
data_distinct = data_joined %>%
rename(target_gene = "symbol") %>%
select(cas, cluster, inchikey.x, short_key, target_gene) %>%
distinct() %>%
drop_na() %>%
filter(!target_gene == "9-Sep") %>%
filter(!target_gene == "") %>%
rename(inchikey = inchikey.x)
#write to output
write.csv(data_distinct, "./results/inchikey/pharos_inchikey.csv")
```
**NOTE: ** data_distinct is the final data set! It contains the original cas compound, the cluster it belongs to, its inchikey (found using RDkit smiles -> inchikey conversion), and a list of gene targets from **Pharos**.
## Join tox21 and drug repurposing dataset
```{r}
#join the two data set using inchikey as identifier
data_combined = left_join(inchikey, broad_reformatted,
by = "short_key") %>%
select(cas, cluster, inchikey, short_key, target_gene) %>%
unique()
#find how many unique compounds have any gene information
data_unique = data_combined %>%
drop_na()
length(unique(data_unique$cas)) #1346 unique CAS/compounds
length(unique(data_unique$target_gene)) #1303 unique CAS/compounds
#write to file
write.csv(data_unique, "./results/inchikey/drug_hub_inchikey.csv")
```
# Merge Pharos and Drug Repurposing Hub datasets
```{r}
pharos = read.csv("./results/inchikey/pharos_inchikey.csv")
drug_hub = read.csv("./results/inchikey/drug_hub_inchikey.csv")
#length(unique(drug_hub$target_gene))
#length(unique(pharos$target_gene))
#length(unique(drug_hub$cas))
#length(unique(pharos$cas))
#stack the tables on top of each other
pharos_and_hub = rbind(drug_hub, pharos)
#data cleaning to get rid of duplicates -> FINAL data set!!
pharos_hub_clean = pharos_and_hub %>%
select(-1, -short_key) %>%
unique() %>%
arrange(cas)
#length(unique(pharos_hub_clean$target_gene)) #1629 unique genes
#length(unique(pharos_hub_clean$cas)) #1829 unique compounds
#write the combined data to output
write.csv(pharos_hub_clean, "./results/inchikey/pharos_and_drughub.csv")
```
# DEBUG ONLY: check how many of these compounds are drugs/EPA/NTP
```{r}
tox21_aggregated <- read_delim("./data/tox21_aggregated.txt",
"\t", escape_double = FALSE, trim_ws = TRUE) %>%
janitor::clean_names() %>%
select(cas, tox21_id, sample_name) %>%
unique() %>%
mutate(type = as.numeric(str_sub(tox21_id, 7, 7))) %>%
mutate(
origin = case_when(
type == 1 ~ "drug",
type == 2 ~ "NTP",
type == 3 ~ "EPA",
type == 4 ~ "toxic")
)
cmp_origin = left_join(pharos_hub_clean, tox21_aggregated, by="cas") %>% select(cas, origin) %>%
unique() %>%
filter(origin == "drug")
length(unique(cmp_origin$cas))
```
# Create `data_w_identifiers.csv' data sheet
```{r}
#7030 compounds
#Inchikey are for the parent form, NOT SALT form!
cas_inchikey = read.csv("./data/rdkit/smi_to_inchikey.csv") %>%
rename(parent_smiles = smiles)
#aggregated data has pubchem_cid and sample_name,and the salt verison of smiles
aggregated_tox21_unique = read_delim("./data/tox21_aggregated.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>%
janitor::clean_names() %>%
rename(ncgc = sample_id) %>%
select(cas, pubchem_cid, sample_name,smiles) %>%
unique()
data_combined = left_join(cas_inchikey, aggregated_tox21_unique, by = "cas") %>%
select(cluster, cas, pubchem_cid, sample_name, smiles, parent_smiles, inchikey) %>%
unique()
#length(unique(data_combined$cas))
# add the gene list for each compound (uses the combined pharos & drug hub data)
data_temp = left_join(data_combined, pharos_hub_clean %>% select(cas, target_gene), by = "cas")
# use hgnc data to find add the gene name for each compound
hgnc_data <- read_delim("./data/hgnc_data.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>%
janitor::clean_names() %>%
rename(target_gene = approved_symbol) %>%
select(-1) %>% #remove hgnc_ids
unique()
#left_join
data_w_identifiers <- left_join(data_temp, hgnc_data, by="target_gene")
#length(unique(data_w_identifiers$cas))
write.csv(data_w_identifiers, "./results/data_w_identifiers.csv")
#7030 unique compounds
```
# Debug only - Explore the data with identifiers datasheet...
```{r}
test = data_w_identifiers %>%
filter(cluster == 1)
length(unique(test$cas)) #32 unique compounds in cluster #1
length(unique(data_w_identifiers$cas))
w = data_w_identifiers %>%
filter(cluster == 94)
#see which compounds has no target...
compounds_no_target = data_w_identifiers %>%
filter(!is.na(target_gene)) %>%
unique()
length(unique(compounds_no_target$cas)) #5201 no gene + 1829 gene = 7030 compounds
```