Skip to content

Commit

Permalink
More efficient parallelisation for biosample xmls
Browse files Browse the repository at this point in the history
  • Loading branch information
stitam committed Feb 1, 2024
1 parent 07d6bf6 commit 2ba8318
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 63 deletions.
9 changes: 8 additions & 1 deletion R/ncbi_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#' @param db character; the NCBI database from which the data was retrieved.
#' @param format character; the format of the data set. Currently only
#' \code{"xml"} is supported.
#' @param mc_cores integer; number of cores to use for parallel processing.
#' @param verbose logical; Should verbose messages be printed to console?
#' @return a tibble.
#' @details This function is integrated into \code{ncbi_get_meta()} and is
Expand Down Expand Up @@ -54,6 +55,7 @@ ncbi_parse <- function(
meta,
db = NULL,
format = "xml",
mc_cores = NULL,
verbose = getOption("verbose")
) {
if ("ncbi_meta" %in% class(meta)) {
Expand All @@ -71,11 +73,16 @@ ncbi_parse <- function(
}
db <- match.arg(db, choices = c("assembly", "biosample"))
format <- match.arg(format, choices = c("xml"))
if (is.null(mc_cores)) {
mc_cores <- max(parallel::detectCores() - 1, 1)
} else {
mc_cores <- as.integer(mc_cores)
}
f <- get(paste("ncbi_parse", db, format, sep = "_"))
if (db == "assembly" && format == "xml") {
out <- f(meta, verbose = verbose)
} else if (db == "biosample" && format == "xml") {
out <- f(meta, verbose = verbose)
out <- f(meta, mc_cores = mc_cores, verbose = verbose)
} else {
if (verbose) message("Parsing is not supported.")
out <- NA_character_
Expand Down
72 changes: 55 additions & 17 deletions R/ncbi_parse_biosample_xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,67 @@ ncbi_parse_biosample_xml <- function(
return(NA_character_)
}
if (is.null(mc_cores)) {
mc_cores <- parallel::detectCores()
mc_cores <- max(parallel::detectCores() - 1, 1)
} else {
mc_cores <- as.integer(mc_cores)
}
parsed_xml <- ncbi_xml_to_list(
xml = biosample_xml,
mc_cores = mc_cores,
verbose = verbose
)
if (length(parsed_xml) == 1 && is.na(parsed_xml)) return(NA_character_)
out <- try(parallel::mclapply(parsed_xml, function(x) {
ncbi_parse_biosample_xml_entry(x, verbose = verbose)
}, mc.cores = mc_cores), silent = TRUE)
if (inherits(out, "try-error")) {
return(NA_character_)
biosample_df <- data.frame()
if (verbose) message("Attempting to parse BioSample XMLs.")
for (i in seq_along(biosample_xml)) {
if (verbose) message(
"BioSample XML ", i, " of ", length(biosample_xml),
" to a list.. ", appendLF = FALSE
)
parsed_xml <- try(
xml2::as_list(xml2::read_xml(biosample_xml[[i]]))[[1]],
silent = TRUE
)
if (inherits(parsed_xml, "try-error")) {
if (verbose) message("Failed.")
next()
}
if (verbose) message("Successful. ", appendLF = FALSE)
names(parsed_xml) <- sapply(parsed_xml, function(x) {
attributes(x)$accession
})
if (verbose) message(
"List to data frame.. ", appendLF = FALSE
)
out <- parallel::mclapply(seq_along(parsed_xml), function(x) {
entry <- try(
ncbi_parse_biosample_xml_entry(parsed_xml[[x]], verbose = verbose),
silent = TRUE
)
if (inherits(entry, "try-error")) {
entry <- names(parsed_xml)[x]
}
return(entry)
}, mc.cores = mc_cores)
index_failed <- which(unlist(parallel::mclapply(out, function(x) {
"character" %in% class(x)
}, mc.cores = mc_cores)))
if (length(index_failed) == length(out)) {
if (verbose) message("Failed.")
next()
}
if (length(index_failed > 0)) {
if (verbose) {
ids_parsed <- paste(unlist(out[index_failed]), collapse = ", ")
message("Failed for some NCBI BioSample IDs: ", ids_parsed, ".")
}
out <- out[-index_failed]
}
if (verbose) message("Successful.")
out <- dplyr::bind_rows(out)
out <- dplyr::relocate(out, biosample_uid)
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
biosample_df <- dplyr::bind_rows(biosample_df, out)
}
out <- dplyr::bind_rows(out)
out <- dplyr::relocate(out, biosample_uid)
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
out <- tibble::as_tibble(out)
return(out)
biosample_tbl <- tibble::as_tibble(biosample_df)
return(biosample_tbl)
}


ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
# attributes(x)$names contains all fields!
# TODO have an extractor for each field and then bind them together
Expand Down
44 changes: 0 additions & 44 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,47 +146,3 @@ validate_webseq_class <- function(x) {
testthat::expect_true(attr(x, "db") %in% ncbi_dbs())
}
}

#' Convert an xml retrieved from NCBI to a list
#'
#' @param xml character, a vector of xmls
#' @param mc_cores integer, number of cores to use for parallel processing. If
#' \code{NULL} use all available cores.
#' @param verbose logical; Should verbose messages be printed to console?
#' @return If conversion is successful, returns a vector of lists, otherwise NA.
#' @details
#' The function takes a vector of xmls and attempts to convert it to a vector of
#' lists. The function will only return a vector of lists if all xmls can be
#' converted, otherwise it will return NA.
#' @noRd
ncbi_xml_to_list <- function(xml, mc_cores = NULL, verbose) {
if (is.null(mc_cores)) {
mc_cores <- parallel::detectCores()
} else {
mc_cores <- as.integer(mc_cores)
}
pxml <- try(parallel::mclapply(seq_along(xml), function(i) {
if (verbose) message(
"Attempting to convert xml ", i, " to list. ",
appendLF = FALSE
)
res <- try(
xml2::as_list(xml2::read_xml(xml[[i]]))[[1]],
silent = TRUE
)
if (inherits(res, "try-error")) {
if (verbose) message("Failed.")
stop()
} else {
if (verbose) message("Successful.")
return(res)
}
}, mc.cores = mc_cores), silent = TRUE)
if (inherits(pxml, "try-error")) {
return(NA_character_)
} else {
pxml <- unlist(pxml, recursive = FALSE)
names(pxml) <- sapply(pxml, function(x) attributes(x)$accession)
return(pxml)
}
}
10 changes: 9 additions & 1 deletion man/ncbi_parse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2ba8318

Please sign in to comment.