diff --git a/R/ncbi_parse.R b/R/ncbi_parse.R index 15b1781..df5e684 100644 --- a/R/ncbi_parse.R +++ b/R/ncbi_parse.R @@ -9,6 +9,7 @@ #' @param db character; the NCBI database from which the data was retrieved. #' @param format character; the format of the data set. Currently only #' \code{"xml"} is supported. +#' @param mc_cores integer; number of cores to use for parallel processing. #' @param verbose logical; Should verbose messages be printed to console? #' @return a tibble. #' @details This function is integrated into \code{ncbi_get_meta()} and is @@ -54,6 +55,7 @@ ncbi_parse <- function( meta, db = NULL, format = "xml", + mc_cores = NULL, verbose = getOption("verbose") ) { if ("ncbi_meta" %in% class(meta)) { @@ -71,11 +73,16 @@ ncbi_parse <- function( } db <- match.arg(db, choices = c("assembly", "biosample")) format <- match.arg(format, choices = c("xml")) + if (is.null(mc_cores)) { + mc_cores <- max(parallel::detectCores() - 1, 1) + } else { + mc_cores <- as.integer(mc_cores) + } f <- get(paste("ncbi_parse", db, format, sep = "_")) if (db == "assembly" && format == "xml") { out <- f(meta, verbose = verbose) } else if (db == "biosample" && format == "xml") { - out <- f(meta, verbose = verbose) + out <- f(meta, mc_cores = mc_cores, verbose = verbose) } else { if (verbose) message("Parsing is not supported.") out <- NA_character_ diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index 6c9c561..d0d401e 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -4,30 +4,80 @@ #' function parses metadata retrieved in XML format. #' @param biosample_xml character; unparsed XML metadata either returned by #' \code{ncbi_get_meta()} or the path to a file that was downloaded from NCBI. +#' @param mc_cores integer; number of cores to use for parallel processing. If +#' \code{NULL} use all available cores. #' @param verbose logical; Should verbose messages be printed to console? ncbi_parse_biosample_xml <- function( biosample_xml, + mc_cores = NULL, verbose = getOption("verbose") ) { if (length(biosample_xml) == 1 && is.na(biosample_xml)) { if (verbose) message("No BioSample metadata to parse.") return(NA_character_) } - parsed_xml <- ncbi_xml_to_list(xml = biosample_xml, verbose = verbose) - if (length(parsed_xml) == 1 && is.na(parsed_xml)) return(NA_character_) - out <- try(lapply(parsed_xml, function(x) { - ncbi_parse_biosample_xml_entry(x, verbose = verbose) - }),silent = TRUE) - if (inherits(out, "try-error")) { - return(NA_character_) + if (is.null(mc_cores)) { + mc_cores <- max(parallel::detectCores() - 1, 1) + } else { + mc_cores <- as.integer(mc_cores) } - out <- dplyr::bind_rows(out) - out <- dplyr::relocate(out, biosample_uid) - out <- dplyr::relocate(out, biosample, .after = biosample_uid) - out <- tibble::as_tibble(out) - return(out) + biosample_df <- data.frame() + if (verbose) message("Attempting to parse BioSample XMLs.") + for (i in seq_along(biosample_xml)) { + if (verbose) message( + "BioSample XML ", i, " of ", length(biosample_xml), + " to a list.. ", appendLF = FALSE + ) + parsed_xml <- try( + xml2::as_list(xml2::read_xml(biosample_xml[[i]]))[[1]], + silent = TRUE + ) + if (inherits(parsed_xml, "try-error")) { + if (verbose) message("Failed.") + next() + } + if (verbose) message("Successful. ", appendLF = FALSE) + names(parsed_xml) <- sapply(parsed_xml, function(x) { + attributes(x)$accession + }) + if (verbose) message( + "List to data frame.. ", appendLF = FALSE + ) + out <- parallel::mclapply(seq_along(parsed_xml), function(x) { + entry <- try( + ncbi_parse_biosample_xml_entry(parsed_xml[[x]], verbose = verbose), + silent = TRUE + ) + if (inherits(entry, "try-error")) { + entry <- names(parsed_xml)[x] + } + return(entry) + }, mc.cores = mc_cores) + index_failed <- which(unlist(parallel::mclapply(out, function(x) { + "character" %in% class(x) + }, mc.cores = mc_cores))) + if (length(index_failed) == length(out)) { + if (verbose) message("Failed.") + next() + } + if (length(index_failed > 0)) { + if (verbose) { + ids_parsed <- paste(unlist(out[index_failed]), collapse = ", ") + message("Failed for some NCBI BioSample IDs: ", ids_parsed, ".") + } + out <- out[-index_failed] + } + if (verbose) message("Successful.") + out <- dplyr::bind_rows(out) + out <- dplyr::relocate(out, biosample_uid) + out <- dplyr::relocate(out, biosample, .after = biosample_uid) + biosample_df <- dplyr::bind_rows(biosample_df, out) + } + biosample_tbl <- tibble::as_tibble(biosample_df) + return(biosample_tbl) } + ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { # attributes(x)$names contains all fields! # TODO have an extractor for each field and then bind them together diff --git a/R/utils.R b/R/utils.R index a9608cc..df77b15 100644 --- a/R/utils.R +++ b/R/utils.R @@ -146,40 +146,3 @@ validate_webseq_class <- function(x) { testthat::expect_true(attr(x, "db") %in% ncbi_dbs()) } } - -#' Convert an xml retrieved from NCBI to a list -#' -#' @param xml character, a vector of xmls -#' @param verbose logical; Should verbose messages be printed to console? -#' @return If conversion is successful, returns a vector of lists, otherwise NA. -#' @details -#' The function takes a vector of xmls and attempts to convert it to a vector of -#' lists. The function will only return a vector of lists if all xmls can be -#' converted, otherwise it will return NA. -#' @noRd -ncbi_xml_to_list <- function(xml, verbose) { - pxml <- try(lapply(seq_along(xml), function(i) { - if (verbose) message( - "Attempting to convert xml ", i, " to list. ", - appendLF = FALSE - ) - res <- try( - xml2::as_list(xml2::read_xml(xml[[i]]))[[1]], - silent = TRUE - ) - if (inherits(res, "try-error")) { - if (verbose) message("Failed.") - stop() - } else { - if (verbose) message("Successful.") - return(res) - } - }), silent = TRUE) - if (inherits(pxml, "try-error")) { - return(NA_character_) - } else { - pxml <- unlist(pxml, recursive = FALSE) - names(pxml) <- sapply(pxml, function(x) attributes(x)$accession) - return(pxml) - } -} diff --git a/man/ncbi_parse.Rd b/man/ncbi_parse.Rd index 5c04706..a5db726 100644 --- a/man/ncbi_parse.Rd +++ b/man/ncbi_parse.Rd @@ -4,7 +4,13 @@ \alias{ncbi_parse} \title{Parse NCBI sequence metadata} \usage{ -ncbi_parse(meta, db = NULL, format = "xml", verbose = getOption("verbose")) +ncbi_parse( + meta, + db = NULL, + format = "xml", + mc_cores = NULL, + verbose = getOption("verbose") +) } \arguments{ \item{meta}{character; either an unparsed metadata object returned by @@ -15,6 +21,8 @@ ncbi_parse(meta, db = NULL, format = "xml", verbose = getOption("verbose")) \item{format}{character; the format of the data set. Currently only \code{"xml"} is supported.} +\item{mc_cores}{integer; number of cores to use for parallel processing.} + \item{verbose}{logical; Should verbose messages be printed to console?} } \value{ diff --git a/man/ncbi_parse_biosample_xml.Rd b/man/ncbi_parse_biosample_xml.Rd index 59bd21f..9a0d21d 100644 --- a/man/ncbi_parse_biosample_xml.Rd +++ b/man/ncbi_parse_biosample_xml.Rd @@ -4,12 +4,19 @@ \alias{ncbi_parse_biosample_xml} \title{Parse NCBI BioSample metadata} \usage{ -ncbi_parse_biosample_xml(biosample_xml, verbose = getOption("verbose")) +ncbi_parse_biosample_xml( + biosample_xml, + mc_cores = NULL, + verbose = getOption("verbose") +) } \arguments{ \item{biosample_xml}{character; unparsed XML metadata either returned by \code{ncbi_get_meta()} or the path to a file that was downloaded from NCBI.} +\item{mc_cores}{integer; number of cores to use for parallel processing. If +\code{NULL} use all available cores.} + \item{verbose}{logical; Should verbose messages be printed to console?} } \description{