Skip to content

Commit

Permalink
Merge pull request #58 from stitam/speed
Browse files Browse the repository at this point in the history
Speed up parsing for NCBI BioSample XMLs
  • Loading branch information
stitam authored Feb 1, 2024
2 parents 10d3bb0 + 2ba8318 commit 6615f23
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 52 deletions.
9 changes: 8 additions & 1 deletion R/ncbi_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#' @param db character; the NCBI database from which the data was retrieved.
#' @param format character; the format of the data set. Currently only
#' \code{"xml"} is supported.
#' @param mc_cores integer; number of cores to use for parallel processing.
#' @param verbose logical; Should verbose messages be printed to console?
#' @return a tibble.
#' @details This function is integrated into \code{ncbi_get_meta()} and is
Expand Down Expand Up @@ -54,6 +55,7 @@ ncbi_parse <- function(
meta,
db = NULL,
format = "xml",
mc_cores = NULL,
verbose = getOption("verbose")
) {
if ("ncbi_meta" %in% class(meta)) {
Expand All @@ -71,11 +73,16 @@ ncbi_parse <- function(
}
db <- match.arg(db, choices = c("assembly", "biosample"))
format <- match.arg(format, choices = c("xml"))
if (is.null(mc_cores)) {
mc_cores <- max(parallel::detectCores() - 1, 1)
} else {
mc_cores <- as.integer(mc_cores)
}
f <- get(paste("ncbi_parse", db, format, sep = "_"))
if (db == "assembly" && format == "xml") {
out <- f(meta, verbose = verbose)
} else if (db == "biosample" && format == "xml") {
out <- f(meta, verbose = verbose)
out <- f(meta, mc_cores = mc_cores, verbose = verbose)
} else {
if (verbose) message("Parsing is not supported.")
out <- NA_character_
Expand Down
74 changes: 62 additions & 12 deletions R/ncbi_parse_biosample_xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,80 @@
#' function parses metadata retrieved in XML format.
#' @param biosample_xml character; unparsed XML metadata either returned by
#' \code{ncbi_get_meta()} or the path to a file that was downloaded from NCBI.
#' @param mc_cores integer; number of cores to use for parallel processing. If
#' \code{NULL} use all available cores.
#' @param verbose logical; Should verbose messages be printed to console?
ncbi_parse_biosample_xml <- function(
biosample_xml,
mc_cores = NULL,
verbose = getOption("verbose")
) {
if (length(biosample_xml) == 1 && is.na(biosample_xml)) {
if (verbose) message("No BioSample metadata to parse.")
return(NA_character_)
}
parsed_xml <- ncbi_xml_to_list(xml = biosample_xml, verbose = verbose)
if (length(parsed_xml) == 1 && is.na(parsed_xml)) return(NA_character_)
out <- try(lapply(parsed_xml, function(x) {
ncbi_parse_biosample_xml_entry(x, verbose = verbose)
}),silent = TRUE)
if (inherits(out, "try-error")) {
return(NA_character_)
if (is.null(mc_cores)) {
mc_cores <- max(parallel::detectCores() - 1, 1)
} else {
mc_cores <- as.integer(mc_cores)
}
out <- dplyr::bind_rows(out)
out <- dplyr::relocate(out, biosample_uid)
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
out <- tibble::as_tibble(out)
return(out)
biosample_df <- data.frame()
if (verbose) message("Attempting to parse BioSample XMLs.")
for (i in seq_along(biosample_xml)) {
if (verbose) message(
"BioSample XML ", i, " of ", length(biosample_xml),
" to a list.. ", appendLF = FALSE
)
parsed_xml <- try(
xml2::as_list(xml2::read_xml(biosample_xml[[i]]))[[1]],
silent = TRUE
)
if (inherits(parsed_xml, "try-error")) {
if (verbose) message("Failed.")
next()
}
if (verbose) message("Successful. ", appendLF = FALSE)
names(parsed_xml) <- sapply(parsed_xml, function(x) {
attributes(x)$accession
})
if (verbose) message(
"List to data frame.. ", appendLF = FALSE
)
out <- parallel::mclapply(seq_along(parsed_xml), function(x) {
entry <- try(
ncbi_parse_biosample_xml_entry(parsed_xml[[x]], verbose = verbose),
silent = TRUE
)
if (inherits(entry, "try-error")) {
entry <- names(parsed_xml)[x]
}
return(entry)
}, mc.cores = mc_cores)
index_failed <- which(unlist(parallel::mclapply(out, function(x) {
"character" %in% class(x)
}, mc.cores = mc_cores)))
if (length(index_failed) == length(out)) {
if (verbose) message("Failed.")
next()
}
if (length(index_failed > 0)) {
if (verbose) {
ids_parsed <- paste(unlist(out[index_failed]), collapse = ", ")
message("Failed for some NCBI BioSample IDs: ", ids_parsed, ".")
}
out <- out[-index_failed]
}
if (verbose) message("Successful.")
out <- dplyr::bind_rows(out)
out <- dplyr::relocate(out, biosample_uid)
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
biosample_df <- dplyr::bind_rows(biosample_df, out)
}
biosample_tbl <- tibble::as_tibble(biosample_df)
return(biosample_tbl)
}


ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
# attributes(x)$names contains all fields!
# TODO have an extractor for each field and then bind them together
Expand Down
37 changes: 0 additions & 37 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -146,40 +146,3 @@ validate_webseq_class <- function(x) {
testthat::expect_true(attr(x, "db") %in% ncbi_dbs())
}
}

#' Convert an xml retrieved from NCBI to a list
#'
#' @param xml character, a vector of xmls
#' @param verbose logical; Should verbose messages be printed to console?
#' @return If conversion is successful, returns a vector of lists, otherwise NA.
#' @details
#' The function takes a vector of xmls and attempts to convert it to a vector of
#' lists. The function will only return a vector of lists if all xmls can be
#' converted, otherwise it will return NA.
#' @noRd
ncbi_xml_to_list <- function(xml, verbose) {
pxml <- try(lapply(seq_along(xml), function(i) {
if (verbose) message(
"Attempting to convert xml ", i, " to list. ",
appendLF = FALSE
)
res <- try(
xml2::as_list(xml2::read_xml(xml[[i]]))[[1]],
silent = TRUE
)
if (inherits(res, "try-error")) {
if (verbose) message("Failed.")
stop()
} else {
if (verbose) message("Successful.")
return(res)
}
}), silent = TRUE)
if (inherits(pxml, "try-error")) {
return(NA_character_)
} else {
pxml <- unlist(pxml, recursive = FALSE)
names(pxml) <- sapply(pxml, function(x) attributes(x)$accession)
return(pxml)
}
}
10 changes: 9 additions & 1 deletion man/ncbi_parse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion man/ncbi_parse_biosample_xml.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 6615f23

Please sign in to comment.