-
Notifications
You must be signed in to change notification settings - Fork 0
/
TCGA_clinical.R
58 lines (52 loc) · 1.83 KB
/
TCGA_clinical.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(TCGAbiolinks)
library(xml2)
library(tidyverse)
# Relative paths used later in the script (./GDCdata/, ./{proj}.csv) resolve
# against this directory.
setwd("~/storage/data/TCGA/clinical/TCGAbiolinks")

# Download xml clinical data files -------------------------------------------
# TCGA project identifiers whose clinical XML files are queried and downloaded.
projects <- c(
  "TCGA-BRCA", "TCGA-COAD", "TCGA-HNSC", "TCGA-KIRC",
  "TCGA-KIRP", "TCGA-LUAD", "TCGA-STAD", "TCGA-THCA",
  "TCGA-BLCA", "TCGA-ESCA", "TCGA-KICH", "TCGA-LIHC", "TCGA-LUSC"
)

# One query + download per project.
for (project_id in projects) {
  clinical_query <- GDCquery(
    project = project_id,
    data.category = "Clinical",
    file.type = "xml"
  )
  GDCdownload(clinical_query)
}
# Parse xml files for each cancer type ---------------------------------------

# Read one clinical XML file into a long tibble with columns
# fileID, field and value: one row per element located under a `patient`
# element that has a text child.
#
# path:    path of the XML file to read.
# file_id: identifier stored in the fileID column for every row of this file.
parse_clinical_xml <- function(path, file_id) {
  doc <- read_xml(path)
  # Match on local-name() so the namespace prefix carried by the patient
  # element does not have to be known in advance.
  nodes <- xml_find_all(doc, "//*[local-name(.) = 'patient']//*[text()]")
  tibble(
    fileID = file_id,
    field = map_chr(nodes, xml_path),
    value = map_chr(nodes, xml_text)
  ) %>%
    # Keep only the part of the node path after its last ":" (the whole path
    # is kept when it contains no ":").
    mutate(field = str_extract(field, "[^:]+$"))
}

# Every entry found in ./GDCdata/ is treated as a downloaded project.
projects_downloaded <- list.files("./GDCdata/")

for (proj in projects_downloaded) {
  supplement_dir <- file.path(
    ".", "GDCdata", proj, "harmonized", "Clinical", "Clinical_Supplement"
  )
  # Paths relative to supplement_dir, e.g. "<subdir>/<name>.xml".
  xml_files <- list.files(supplement_dir, pattern = "xml$", recursive = TRUE)

  # Nothing to parse: skip instead of failing on an empty result below.
  if (length(xml_files) == 0) {
    warning(
      "No clinical XML files found for ", proj, "; skipping.",
      call. = FALSE
    )
    next
  }

  # The first path component of each relative path identifies the file
  # (presumably the GDC file UUID directory -- confirm against GDCdata layout).
  file_ids <- map_chr(str_split(xml_files, "/"), 1)

  # Parse every file, then bind once rather than growing a tibble per file.
  res <- map2(
    file.path(supplement_dir, xml_files),
    file_ids,
    parse_clinical_xml
  ) %>%
    bind_rows()

  # Column order of the output: fileID first, then fields in order of first
  # appearance across the parsed files.
  fields <- c("fileID", unique(res$field))

  res <- res %>%
    distinct() %>%
    group_by(fileID, field) %>%
    # A field that occurs several times in one file with different values is
    # collapsed into a single "|"-separated string.
    summarise(value = paste(value, collapse = "|"), .groups = "drop") %>%
    pivot_wider(names_from = field, values_from = value) %>%
    select(all_of(fields))

  # One wide CSV per project, written to the working directory.
  data.table::fwrite(res, str_glue("./{proj}.csv"))
}