compound_gene_relationships.Rmd

---
title: "Identifying compound & gene relationships"
author: "Fang Liu"
date: "5/10/2022"
output: html_document
---

> This file identify compound-gene target associations for the 7,170 compounds using publicly available data from Pharos and Drug Repurposing Hub (note: there were 8971 compounds in the original dataset; however, after data cleaning and keeping compounds with complete bioassay data, only 7,170 compounds are left). Of these 7,170 compounds, *7,030* has a valid SMILES identifier (i.e., 6503 from parent SMILES list `tox21_10k_cas_smiles_without_salts.txt` and 527 from the aggregated tox21 dataset `tox21_aggregated.txt`). 

***

Load required libraries
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(readxl)
```

# Salt-stripped SMILES for original tox21 data

## First, use Ruili's parent smiles list
```{r}
#load original data file: 7,170 compounds from clustering
#filtered_tox21 = read.delim("./data/clusters.txt") #cas, cluster
filtered_tox21 = read.csv("results/cluster_results.csv")

#DEBUG ONLY: change the clusters.txt file if clustering result changes; keeping everything else the same
#z = read.delim("./results/cluster_results.txt") #cas, cluster

tox21_without_salts = read.delim("./data/tox21_10k_cas_smiles_without_salts.txt") %>% 
  janitor::clean_names() %>% 
  rename(cas = cas_rn,
         smiles = structure_smiles_2d_qsar)

## Combine data using the cas as the "key" using smiles w/o salts data file
data_combined = 
  left_join(filtered_tox21, tox21_without_salts, by = "cas") %>% 
  distinct() #7170 compounds = 6503 with smiles + 667 without smiles

#THIS IS THE FIRST HALF
not_missing = data_combined %>% remove_missing() #6503 compounds
colnames(not_missing) # "cas"     "cluster" "smiles" 

#Find which compounds/drugs are actually missing the "SMILES" 
missing = data_combined %>% filter(is.na(smiles)) %>% 
  select(-smiles) #667 compounds
```


## Second, use the Tox21 aggregated data to see if we can find more smiles for the compounds without smiles above
```{r}
#first, clean the aggregated data 
aggregated_tox21_unique = read_delim("./data/tox21_aggregated.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
  janitor::clean_names() %>% 
  select(cas, smiles) %>%
  unique()

colnames(aggregated_tox21_unique) # "cas"    "smiles"

#try find the smiles for the 667 "missing" compounds above (667 = 140 missing + 527 not missing)
data_wo_smiles = 
  left_join(missing, aggregated_tox21_unique, by = "cas") %>% 
  distinct() %>% 
  remove_missing() %>% 
  mutate(smiles_long = map_chr(str_split(smiles, "\\."), ~ .[which.max(nchar(.))])) %>% 
  select(-smiles) %>% 
  rename(smiles = smiles_long)
#remove_missing() removes all non-complete rows
```


## Last, combine the results to get a complete list of salt-stripped SMILES for the Tox21 compounds
```{r combined_Tox21_smiles}
data_final = bind_rows(not_missing, data_wo_smiles)
length(unique(data_final$cas)) # 7030 left = 7170 - 140 no smiles

#create output file (cas, cluster, smiles) --> used in Python
write_csv(data_final, "./results/cluster_and_smiles.csv")
```

## DEBUG ONLY: check the characteristics of the 140 compounds with missing smiles
```{r, eval = FALSE}
data_no_smiles = 
  left_join(missing, aggregated_tox21_unique, by = "cas") %>% 
  distinct() %>% 
  filter(is.na(smiles)) %>% 
  filter(str_detect(cas, "NOCAS"))

# 140 cmps w/o smiles = 118 "NOCAS" + 22 "oils?"
```


# Salt-stripped SMILES for the Pharos dataset
```{r}
#load in the pharos dataset
pt1 = read.csv("./data/pharos_all_target_ligands-p1.csv")
pt2 = read.csv("./data/pharos_all_target_ligands-p2.csv")
pharos_combined = bind_rows(pt1, pt2)

# data cleaning for pharos data
pharos_final = pharos_combined %>% 
  janitor::clean_names() %>% 
  rename(chembl = "ligand_ch_embl_id",
         smiles = "ligand_smiles") %>% 
  mutate(smiles_short = map_chr(str_split(smiles, "\\."), ~ .[which.max(nchar(.))])) %>%
  select(smiles_short, symbol) %>% 
  rename(smiles = smiles_short)

colnames(pharos_final) #"smiles" "symbol"

#save to .csv file
write.csv(pharos_final, "./results/pharos_symbol.csv") 

#NOTE: This file contains salt-stripped smiles and associated gene targets for all compounds pulled from NIH's Pharos database. This file is also the input for the Rdkit python program (recall: Rdkit is used to find the corresponding inchikeys for the smiles). 
```

***


# Find the InChIKey for each compound
## Load RDKit results for original Tox21 compounds
```{r}
cas_inchikey = read.csv("./data/rdkit/smi_to_inchikey.csv") %>% 
  mutate(short_key = str_sub(inchikey, 1, 14)) #7030 compounds

#cas_inchikey %>% filter(is.na(inchikey)) #13 cmps without inchikey
#7030 = 13 w/o inchikey + 7017 w/inchikey

inchikey = cas_inchikey %>% 
  select(cas, cluster, inchikey, short_key) %>% 
  distinct() %>% 
  drop_na() #7017 compounds

colnames(inchikey)
#"cas"      "cluster"  "inchikey" "short_key"
```

## Load RDKit result for Pharos 
```{r}
pharos_inchikey = read.csv("./data/rdkit/pharos_to_inchikey.csv") %>% 
  mutate(short_key = str_sub(inchikey, 1, 14)) %>% 
  unique()

colnames(pharos_inchikey)
# "smiles"   "symbol"   "inchikey" "short_key"
```

## Load results for drug repurposing hub
```{r}
# data from Broad institute (this is data from Chunxu)
# note, because the broad institute's data already have the inchikey variables, we can skip the Rdkit step 

broad_reformatted <- read_delim("./data/broad_reformatted.txt",
"\t", escape_double = FALSE, trim_ws = TRUE) %>% 
  janitor::clean_names() %>% 
  mutate(short_key = str_sub(in_key, 1, 14))
```

## Join tox21 and pharos dataset
```{r}
#left_join means only keep if it is in the inchikey dataset!
data_joined = left_join(inchikey, pharos_inchikey, by = "short_key")
colnames(data_joined)

data_distinct = data_joined %>% 
  rename(target_gene = "symbol") %>% 
  select(cas, cluster, inchikey.x, short_key, target_gene) %>% 
  distinct() %>% 
  drop_na() %>% 
  filter(!target_gene == "9-Sep") %>% 
  filter(!target_gene == "") %>% 
  rename(inchikey = inchikey.x)

#write to output
write.csv(data_distinct, "./results/inchikey/pharos_inchikey.csv")
```

**NOTE: ** data_distinct is the final data set! It contains the original cas compound, the cluster it belongs to, its inchikey (found using RDkit smiles -> inchikey conversion), and a list of gene targets from **Pharos**.


## Join tox21 and drug repurposing dataset 
```{r}
#join the two data set using inchikey as identifier
data_combined = left_join(inchikey, broad_reformatted, 
                          by = "short_key") %>% 
  select(cas, cluster, inchikey, short_key, target_gene) %>%
  unique()

#find how many unique compounds have any gene information
data_unique = data_combined %>% 
  drop_na() 

length(unique(data_unique$cas)) #1346 unique CAS/compounds
length(unique(data_unique$target_gene)) #1303 unique CAS/compounds

#write to file
write.csv(data_unique, "./results/inchikey/drug_hub_inchikey.csv")
```


# Merge Pharos and Drug Repurposing Hub datasets
```{r}
pharos = read.csv("./results/inchikey/pharos_inchikey.csv") 
drug_hub = read.csv("./results/inchikey/drug_hub_inchikey.csv")

#length(unique(drug_hub$target_gene)) 
#length(unique(pharos$target_gene)) 

#length(unique(drug_hub$cas)) 
#length(unique(pharos$cas)) 

#stack the tables on top of each other 
pharos_and_hub = rbind(drug_hub, pharos)

#data cleaning to get rid of duplicates -> FINAL data set!! 
pharos_hub_clean = pharos_and_hub %>% 
  select(-1, -short_key) %>%  
  unique() %>% 
  arrange(cas)

#length(unique(pharos_hub_clean$target_gene)) #1629 unique genes
#length(unique(pharos_hub_clean$cas)) #1829 unique compounds

#write the combined data to output
write.csv(pharos_hub_clean, "./results/inchikey/pharos_and_drughub.csv")
```


# DEBUG ONLY: check how many of these compounds are drugs/EPA/NTP
```{r}
tox21_aggregated <- read_delim("./data/tox21_aggregated.txt",
"\t", escape_double = FALSE, trim_ws = TRUE) %>% 
  janitor::clean_names() %>% 
  select(cas, tox21_id, sample_name) %>% 
  unique() %>% 
  mutate(type = as.numeric(str_sub(tox21_id, 7, 7))) %>% 
  mutate(
    origin = case_when(
      type == 1 ~ "drug",
      type == 2 ~ "NTP",
      type == 3 ~ "EPA",
      type == 4 ~ "toxic")
  )

cmp_origin = left_join(pharos_hub_clean, tox21_aggregated, by="cas") %>% select(cas, origin) %>%
  unique() %>% 
  filter(origin == "drug")
  
length(unique(cmp_origin$cas))
```


# Create `data_w_identifiers.csv' data sheet
```{r}
#7030 compounds
#Inchikey are for the parent form, NOT SALT form!
cas_inchikey = read.csv("./data/rdkit/smi_to_inchikey.csv") %>% 
  rename(parent_smiles = smiles)

#aggregated data has pubchem_cid and sample_name,and the salt verison of smiles
aggregated_tox21_unique = read_delim("./data/tox21_aggregated.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
  janitor::clean_names() %>% 
  rename(ncgc = sample_id) %>% 
  select(cas, pubchem_cid, sample_name,smiles) %>%
  unique()

data_combined = left_join(cas_inchikey, aggregated_tox21_unique, by = "cas") %>% 
  select(cluster, cas, pubchem_cid, sample_name, smiles, parent_smiles, inchikey) %>% 
  unique()
#length(unique(data_combined$cas))

# add the gene list for each compound (uses the combined pharos & drug hub data)
data_temp = left_join(data_combined, pharos_hub_clean %>% select(cas, target_gene), by = "cas") 

# use hgnc data to find add the gene name for each compound
hgnc_data <- read_delim("./data/hgnc_data.txt",
delim = "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
  janitor::clean_names() %>% 
  rename(target_gene = approved_symbol) %>% 
  select(-1) %>%  #remove hgnc_ids
  unique() 

#left_join
data_w_identifiers <- left_join(data_temp, hgnc_data, by="target_gene")
#length(unique(data_w_identifiers$cas))

write.csv(data_w_identifiers, "./results/data_w_identifiers.csv") 
#7030 unique compounds

```


# Debug only - Explore the data with identifiers datasheet...
```{r}
test = data_w_identifiers %>% 
  filter(cluster == 1)

length(unique(test$cas)) #32 unique compounds in cluster #1
length(unique(data_w_identifiers$cas))

w = data_w_identifiers %>% 
  filter(cluster == 94)

#see which compounds has no target...
compounds_no_target = data_w_identifiers %>% 
  filter(!is.na(target_gene)) %>% 
  unique()

length(unique(compounds_no_target$cas)) #5201 no gene + 1829 gene = 7030 compounds 
```