Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WTS support #19

Merged
merged 2 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Generated by roxygen2: do not edit by hand

export(awsvault_profile)
export(datashare_um)
export(datashare_wts)
export(envvar_undefined)
export(meta_bcl_convert)
export(meta_oncoanalyser_wgs)
export(meta_oncoanalyser_wgts_existing_both)
Expand Down
223 changes: 223 additions & 0 deletions R/datasharing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#' Datashare umccrise Results
#'
#' Finds the latest successful umccrise run (and the wgs_tumor_normal run
#' that fed it) for the given subject/library, lists the relevant output
#' files on GDS, and returns presigned URLs for sharing.
#'
#' @param sid SubjectID.
#' @param lid LibraryID of WGS tumor.
#' @param token_ica ICA_ACCESS_TOKEN.
#'
#' @return Tibble with presigned URLs.
#' @export
datashare_um <- function(sid, lid, token_ica) {
  # umccrise outputs to share: file-name regex -> short label.
  umccrise_files <- dplyr::tribble(
    ~regex, ~fun,
    "multiqc_report\\.html$", "HTML_MultiQC",
    "somatic\\.pcgr\\.html$", "HTML_PCGR",
    "normal\\.cpsr\\.html$", "HTML_CPSR",
    "cancer_report\\.html$", "HTML_CanRep",
    "germline\\.predispose_genes\\.vcf\\.gz$", "VCF_Germline",
    "germline\\.predispose_genes\\.vcf\\.gz\\.tbi$", "VCFi_Germline",
    "somatic-PASS\\.vcf\\.gz$", "VCF_Somatic",
    "somatic-PASS\\.vcf\\.gz\\.tbi$", "VCFi_Somatic",
    "somatic\\.pcgr\\.snvs_indels\\.tiers\\.tsv$", "TSV_SmallVariantsSomatic",
    "manta\\.tsv$", "TSV_StructuralVariantsManta",
    "manta\\.vcf\\.gz$", "VCF_StructuralVariantsManta",
    "manta\\.vcf\\.gz\\.tbi$", "VCFi_StructuralVariantsManta",
    "purple\\.cnv\\.gene\\.tsv$", "TSV_CopyNumberVariantsPurpleGene",
    "purple\\.cnv\\.somatic\\.tsv$", "TSV_CopyNumberVariantsPurpleSegments"
  )

  # DRAGEN somatic alignments to share.
  tn_files <- dplyr::tribble(
    ~regex, ~fun,
    "tumor\\.bam$", "BAM_tumor",
    "tumor\\.bam\\.bai$", "BAMi_tumor",
    "normal\\.bam$", "BAM_normal",
    "normal\\.bam\\.bai$", "BAMi_normal"
  )
  # Latest successful umccrise workflow for this subject/library.
  query_um <- glue(
    "WHERE \"type_name\" = 'umccrise' AND \"end_status\" = 'Succeeded' AND ",
    "REGEXP_LIKE(\"wfr_name\", 'umccr__automated__umccrise__{sid}__{lid}') ",
    "ORDER BY \"start\" DESC;"
  )
  d_um_raw <- rportal::portaldb_query_workflow(query_um)
  n_um_runs <- nrow(d_um_raw)
  if (n_um_runs == 0) {
    cli::cli_abort("ERROR: No umccrise results found for {sid}__{lid}")
  } else if (n_um_runs > 1) {
    # Multiple runs: keep the most recent (query is ordered by start DESC).
    d_um_raw <- d_um_raw |> dplyr::slice_head(n = 1)
    msg <- glue(
      "There are {n_um_runs} > 1 umccrise workflows run for ",
      "{sid}__{lid};\n",
      "We use the latest run with portal_run_id=\"{d_um_raw$portal_run_id}\" ",
      "which ended at {d_um_raw$end}."
    )
    cli::cli_alert_info(msg)
  }
  d_um_tidy <- rportal::meta_umccrise(d_um_raw)
  um_dragen_input <- d_um_tidy[["gds_indir_dragen_somatic"]]
  stopifnot(!is.na(um_dragen_input))

  # Matching wgs_tumor_normal workflow (provides BAMs and FASTQ locations).
  query_tn <- glue(
    "WHERE \"type_name\" = 'wgs_tumor_normal' AND \"end_status\" = 'Succeeded' AND ",
    "REGEXP_LIKE(\"wfr_name\", 'umccr__automated__wgs_tumor_normal__{sid}__{lid}') ",
    "ORDER BY \"start\" DESC;"
  )
  d_tn_raw <- rportal::portaldb_query_workflow(query_tn)
  if (nrow(d_tn_raw) == 0) {
    cli::cli_abort("ERROR: No wgs_tumor_normal results found for {sid}__{lid}")
  }
  d_tn_tidy <- rportal::meta_wgs_tumor_normal(d_tn_raw)
  n_tn_runs <- nrow(d_tn_tidy)
  if (n_tn_runs > 1) {
    # Disambiguate via the run whose somatic output dir umccrise consumed.
    if (um_dragen_input %in% d_tn_tidy[["gds_outdir_dragen_somatic"]]) {
      msg <- glue(
        "There are {n_tn_runs} > 1 wgs_tumor_normal workflows run for ",
        "{sid}__{lid}\n",
        "We use the run which output somatic results into the following location:\n{um_dragen_input}"
      )
      cli::cli_alert_info(msg)
    } else {
      msg <- glue(
        "ERROR: No wgs_tumor_normal results found for {sid}__{lid} ",
        "with a gds_outdir_dragen_somatic of {um_dragen_input}"
      )
      cli::cli_abort(msg)
    }
  }
  d_tn_tidy <- d_tn_tidy |>
    dplyr::filter(.data$gds_outdir_dragen_somatic == um_dragen_input)
  stopifnot(nrow(d_tn_tidy) == 1)
  SampleID_tumor <- d_um_tidy[["SampleID_tumor"]]
  sbjid_sampid_dir <- glue("{sid}__{SampleID_tumor}")
  umccrise_dir <- file.path(d_um_tidy[["gds_outdir_umccrise"]], sbjid_sampid_dir)
  umccrise_work_dir <- file.path(d_um_tidy[["gds_outdir_umccrise"]], "work", sbjid_sampid_dir)
  amber_dir <- file.path(umccrise_work_dir, "purple/amber")
  cobalt_dir <- file.path(umccrise_work_dir, "purple/cobalt")
  sigs_dir <- file.path(umccrise_dir, "cancer_report_tables/sigs")
  # Presigned URLs for the main umccrise outputs.
  d_um_urls1 <- umccrise_dir |>
    dracarys::gds_files_list_filter_relevant(
      token = token_ica, include_url = TRUE, page_size = 500, regexes = umccrise_files
    )
  # Entire directories shared as-is (no regex filtering).
  d_um_urls_sigs <- sigs_dir |>
    dracarys::gds_files_list(token = token_ica, include_url = TRUE, page_size = 100) |>
    dplyr::mutate(type = "Signatures") |>
    dplyr::select("type", "bname", "size", "file_id", "path", "presigned_url")
  d_um_urls_amber <- amber_dir |>
    dracarys::gds_files_list(token = token_ica, include_url = TRUE, page_size = 100) |>
    dplyr::mutate(type = "AMBER") |>
    dplyr::select("type", "bname", "size", "file_id", "path", "presigned_url")
  d_um_urls_cobalt <- cobalt_dir |>
    dracarys::gds_files_list(token = token_ica, include_url = TRUE, page_size = 100) |>
    dplyr::mutate(type = "COBALT") |>
    dplyr::select("type", "bname", "size", "file_id", "path", "presigned_url")
  d_um_urls2 <- d_um_tidy[["gds_indir_dragen_somatic"]] |>
    dracarys::gds_files_list_filter_relevant(
      token = token_ica, include_url = TRUE, page_size = 500, regexes = tn_files
    )
  # FASTQ inputs: one regex table per directory, keyed by fastq_dir.
  fq_list <- d_tn_tidy |>
    dplyr::select("fastq_tumor", "fastq_normal") |>
    tidyr::pivot_longer(cols = c("fastq_tumor", "fastq_normal"), names_to = "fastq_tn") |>
    tidyr::unnest("value") |>
    dplyr::mutate(id = dplyr::row_number(), .by = "fastq_tn") |>
    tidyr::pivot_longer(cols = c("read1", "read2"), names_to = "read", values_to = "path") |>
    dplyr::mutate(
      fastq_id = glue("{.data$fastq_tn}_{.data$id}_{.data$read}"),
      fastq_dir = dirname(.data$path),
      path = basename(.data$path)
    ) |>
    dplyr::select(fun = "fastq_id", regex = "path", "fastq_dir") |>
    base::split(~fastq_dir)
  fq_urls <- NULL
  for (outdir in names(fq_list)) {
    fq_urls_tmp <- dracarys::gds_files_list_filter_relevant(
      gdsdir = outdir, token = token_ica,
      include_url = TRUE, page_size = 500, regexes = fq_list[[outdir]]
    )
    fq_urls <- dplyr::bind_rows(fq_urls, fq_urls_tmp)
  }
  # fq_urls stays NULL when fq_list is empty, so guard before nrow().
  if (is.null(fq_urls) || nrow(fq_urls) == 0) {
    cli::cli_alert_danger(
      "No FASTQs were found for {sid}__{lid}"
    )
  } else {
    fq_urls <- fq_urls |>
      dplyr::mutate(type = sub("fastq", "FASTQ", .data$type))
  }
  # Scalar condition: use || (short-circuit), not vectorized |.
  if ((nrow(d_um_urls2) != nrow(tn_files)) || (nrow(d_um_urls1) != nrow(umccrise_files))) {
    # files were not found? might also have files with the same pattern matching,
    # though I have not encountered any such cases.
    cli::cli_alert_danger(
      "There was not a 1-1 match between files requested and files found. ",
      "Contact the UMCCR bioinformatics team."
    )
  }
  urls_all <- dplyr::bind_rows(d_um_urls1, d_um_urls2, fq_urls) |>
    dplyr::arrange(.data$type) |>
    dplyr::bind_rows(d_um_urls_amber, d_um_urls_cobalt, d_um_urls_sigs) |>
    dplyr::mutate(
      sbjid_libid = glue("{sid}__{lid}"),
      path = sub("gds://", "", .data$path),
      size = trimws(as.character(.data$size))
    ) |>
    dplyr::relocate("sbjid_libid")
  urls_all
}

#' Datashare WTS Results
#'
#' Finds the latest successful wts_tumor_only run for the given
#' subject/library, lists the relevant DRAGEN and Arriba output files on
#' GDS, and returns presigned URLs for sharing.
#'
#' @param sid SubjectID.
#' @param lid LibraryID of WTS tumor.
#' @param token_ica ICA_ACCESS_TOKEN.
#'
#' @return Tibble with presigned URLs.
#' @export
datashare_wts <- function(sid, lid, token_ica) {
  # DRAGEN WTS outputs to share: file-name regex -> short label.
  wts_files <- dplyr::tribble(
    ~regex, ~fun,
    "\\.bam$", "BAM_WTS_tumor",
    "\\.bam\\.bai$", "BAMi_WTS_tumor",
    "fusion_candidates\\.final$", "TSV_WTS_FusionCandidatesDragen",
    "quant\\.genes\\.sf$", "TSV_WTS_QuantificationGenes",
    "quant\\.sf$", "TSV_WTS_Quantification"
  )
  wts_arriba_files <- dplyr::tribble(
    ~regex, ~fun,
    "fusions\\.pdf$", "PDF_WTS_FusionsArriba",
    "fusions\\.tsv$", "TSV_WTS_FusionsArriba"
  )
  # Latest successful wts_tumor_only workflow for this subject/library.
  query_wts <- glue(
    "WHERE \"type_name\" = 'wts_tumor_only' AND \"end_status\" = 'Succeeded' AND ",
    "REGEXP_LIKE(\"wfr_name\", 'umccr__automated__wts_tumor_only__{sid}__{lid}') ",
    "ORDER BY \"start\" DESC;"
  )
  d_wts_raw <- rportal::portaldb_query_workflow(query_wts)
  n_wts_runs <- nrow(d_wts_raw)
  if (n_wts_runs == 0) {
    cli::cli_abort("ERROR: No WTS results found for {sid}__{lid}")
  } else if (n_wts_runs > 1) {
    # Multiple runs: keep the most recent (query is ordered by start DESC).
    d_wts_raw <- d_wts_raw |> dplyr::slice_head(n = 1)
    msg <- glue(
      "There are {n_wts_runs} > 1 WTS workflows run for ",
      "{sid}__{lid};\n",
      "We use the latest run with portal_run_id=\"{d_wts_raw$portal_run_id}\" ",
      "which ended at {d_wts_raw$end}."
    )
    cli::cli_alert_info(msg)
  }
  d_wts_tidy <- rportal::meta_wts_tumor_only(d_wts_raw)
  d_wts_urls1 <- d_wts_tidy[["gds_outdir_dragen"]] |>
    dracarys::gds_files_list_filter_relevant(
      token = token_ica, include_url = TRUE, page_size = 100, regexes = wts_files
    )
  d_wts_urls2 <- d_wts_tidy[["gds_outdir_arriba"]] |>
    dracarys::gds_files_list_filter_relevant(
      token = token_ica, include_url = TRUE, page_size = 10, regexes = wts_arriba_files
    )
  d_wts_urls <- dplyr::bind_rows(d_wts_urls1, d_wts_urls2) |>
    dplyr::arrange(.data$type) |>
    dplyr::mutate(
      # was glue("{SubjectID}__{LibraryID_tumor}") - undefined in this scope
      sbjid_libid = glue("{sid}__{lid}"),
      path = sub("gds://", "", .data$path),
      size = trimws(as.character(.data$size))
    ) |>
    dplyr::relocate("sbjid_libid")
  d_wts_urls
}
12 changes: 6 additions & 6 deletions R/meta_rnasum.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,26 @@ meta_rnasum <- function(pmeta, status = "Succeeded") {
# renamed in v1.1.0
gds_indir_dragen = purrr::map_chr(.data$input, list("dragen_transcriptome_directory", "location"), .default = NA),
gds_indir_dragen = ifelse(
is.na(gds_indir_dragen),
is.na(.data$gds_indir_dragen),
purrr::map_chr(.data$input, list("dragen_wts_dir", "location"), .default = NA),
gds_indir_dragen
.data$gds_indir_dragen
),
gds_indir_umccrise = purrr::map_chr(.data$input, list("umccrise_directory", "location"), .default = NA),
# renamed in v1.1.0
gds_indir_arriba = purrr::map_chr(.data$input, list("arriba_directory", "location"), .default = NA),
gds_indir_arriba = ifelse(
is.na(gds_indir_arriba),
is.na(.data$gds_indir_arriba),
purrr::map_chr(.data$input, list("arriba_dir", "location"), .default = NA),
gds_indir_arriba
.data$gds_indir_arriba
),
rnasum_sample_name = purrr::map_chr(.data$input, "sample_name", .default = NA),
rnasum_dataset = purrr::map_chr(.data$input, "dataset", .default = NA),
rnasum_report_dir = purrr::map_chr(.data$input, "report_directory", .default = NA),
# renamed in v1.1.0
rnasum_report_dir = ifelse(
is.na(rnasum_report_dir),
is.na(.data$rnasum_report_dir),
purrr::map_chr(.data$input, "report_dir", .default = NA),
rnasum_report_dir
.data$rnasum_report_dir
),
sbjid1 = sub("(SBJ.*)__L.*", "\\1", .data$rnasum_report_dir),
libid1 = sub("(SBJ.*)__(L.*)", "\\2", .data$rnasum_report_dir),
Expand Down
19 changes: 19 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,22 @@ dummy1 <- function() {
optparse::make_option
fs::dir_create
}

#' Are AWS/ICA EnvVars Undefined?
#'
#' Checks which of the AWS/ICA environment variables required for data
#' sharing are not set (empty) in the current session.
#'
#' @return Character vector of undefined environment variable names
#'   (zero-length if all are defined).
#' @export
envvar_undefined <- function() {
  dplyr::tibble(
    var = c(
      "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION", "ICA_ACCESS_TOKEN"
    )
  ) |>
    dplyr::mutate(
      # Sys.getenv() is vectorized; unset vars come back as "".
      value = Sys.getenv(.data$var),
      defined = nchar(.data$value) > 0
    ) |>
    dplyr::filter(!.data$defined) |>
    dplyr::pull("var")
}
Loading
Loading