Skip to content

Commit

Permalink
Merge pull request #90 from stitam/abgram
Browse files Browse the repository at this point in the history
Extract antibiograms from NCBI biosample metadata
  • Loading branch information
stitam authored Dec 26, 2024
2 parents 99ef318 + 37c9d8f commit 147dfc8
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 30 deletions.
34 changes: 34 additions & 0 deletions R/flatten.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#' Bind rows in a list of list of data frames
#'
#' This is a utility function that converts a list of lists, each containing
#' a names list of data frames into a single flat list of data frames.
#' @param x list; a list which contains a named list of data frames
#' @return a flat list of data frames where lists of data frames with the same
#' name are merged.
flatten <- function(x) {
is_named_list <- function(x) {
!is.null(names(x)) && all(names(x) != "")
}
#if (!is_named_list(x)) {
# stop("'x' must be a named list.")
#}
if (any(!sapply(x, is_named_list))) {
stop("Each element of 'x' must be a named list.")
}
df_names <- lapply(x, names) |> unlist() |> unique()
out <- setNames(lapply(df_names, function(x) tibble::tibble()), df_names)
for (i in seq_along(out)) {
df_name <- names(out)[i]
for (j in seq_along(x)) {
if (df_name %in% names(x[[j]])) {
out[[i]] <- dplyr::bind_rows(
out[[i]],
x[[j]][[names(out)[i]]]
)
} else {
next()
}
}
}
return(out)
}
9 changes: 1 addition & 8 deletions R/ncbi_get_meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,7 @@ ncbi_get_meta <- function(
if (verbose) {
message("Attempting to parse retrieved metadata.")
}
res_parsed <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose)
if (all("data.frame" %in% class(res_parsed))) {
out <- dplyr::bind_rows(res_parsed)
if (verbose) message("Done.")
} else {
if (verbose) message("Returning unparsed metadata.")
out <- res
}
out <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose)
} else {
out <- res
}
Expand Down
89 changes: 79 additions & 10 deletions R/ncbi_parse_biosample_xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ ncbi_parse_biosample_xml <- function(
mc_cores <- get_mc_cores(mc_cores, verbose = verbose)
biosample_df <- data.frame()
if (verbose) message("Attempting to parse BioSample XMLs.")
result <- list()
for (i in seq_along(biosample_xml)) {
if (verbose) message(
"BioSample XML ", i, " of ", length(biosample_xml),
Expand All @@ -37,7 +38,7 @@ ncbi_parse_biosample_xml <- function(
attributes(x)$accession
})
if (verbose) message(
"List to data frame.. ", appendLF = FALSE
"List to a list of data frames.. ", appendLF = FALSE
)
pfoo <- function(x) {
entry <- try(
Expand Down Expand Up @@ -85,16 +86,14 @@ ncbi_parse_biosample_xml <- function(
out <- out[-index_failed]
}
if (verbose) message("Successful.")
out <- dplyr::bind_rows(out)
out <- dplyr::relocate(out, "biosample_uid")
out <- dplyr::relocate(out, "biosample", .after = "biosample_uid")
biosample_df <- dplyr::bind_rows(biosample_df, out)
result[[i]] <- webseq:::flatten(out)
}
out <- tibble::as_tibble(biosample_df)
out <- out[, order(unname(sapply(out, function(x) sum(is.na(x)))))]
out <- dplyr::relocate(out, "biosample_uid")
out <- dplyr::relocate(out, "biosample", .after = "biosample_uid")
return(out)
result <- webseq:::flatten(result)
result$main <- result$main[, order(unname(sapply(result$main, function(x) sum(is.na(x)))))]
result$main <- result$main |>
dplyr::relocate(biosample_uid) |>
dplyr::relocate(biosample, .after = biosample_uid)
return(result)
}


Expand Down Expand Up @@ -165,6 +164,25 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
))
}
}
# if description contains a table, extract it
if (grepl("^WEBSEQ", out$description)) {
description_table <- extract_description_table(x, main_attrs$accession, verbose)
out_tbl <- dplyr::bind_cols(
tibble::tibble(
biosample_uid = main_attrs$id,
biosample = main_attrs$accession
),
description_table
)
out <- list(
out,
out_tbl
)
out <- setNames(out, c("main", attributes(description_table)$caption))
} else {
out <- list(out)
out <- setNames(out, "main")
}
return(out)
}

Expand Down Expand Up @@ -399,6 +417,57 @@ extract_description <- function(
return(out)
}

extract_description_table <- function(
x,
biosample,
verbose = getOption("verbose")
) {
foo <- function (x, biosample, verbose) {
out <- data.frame()
if ("Comment" %in% names(x$Description)) {
if ("Table" %in% names(x$Description$Comment)) {
# works if there is a single table
# TODO find an example with multiple tables
table <- x$Description$Comment$Table
if ("Caption" %in% names(table)) {
caption <- unname(unlist(table$Caption))
if (length(caption) == 1) {
headers <- table$Header |> unlist() |> unname()
out <- lapply(table$Body, function(x) {
df <- x |> t() |> as.data.frame()
names(df) <- headers
return(df)
}) |> dplyr::bind_rows()
out <- lapply(out, function(x) unname(unlist(x)))
out_lens <- sapply(out, length)
index <- which(out_lens == 0)
if (length(index) > 0) {
for (i in index) {
out[[i]] <- rep(NA, times = max(out_lens))
}
}
out <- data.frame(out) |> tibble::as_tibble()
attr(out, "caption") <- tolower(caption)
} else {
stop("Multiple tables.")
}
}
}
}
return(out)
}
out <- try(foo(x, biosample, verbose), silent = TRUE)
if (inherits(out, "try-error") | is.null(out)) {
if (verbose) {
message(paste0(
"Could not extract Description table for BioSample ", biosample, "."
))
}
stop()
}
return(out)
}

# test <- ncbi_get_uid("pathogen cl 1 0[filter]", "biosample")
extract_package <- function(
x,
Expand Down
19 changes: 19 additions & 0 deletions man/flatten.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

46 changes: 38 additions & 8 deletions tests/testthat/test-ncbi_get_meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ test_that("ncbi_get_meta() works with history", {
# with history, use history, one batch
uids <- ncbi_get_uid("Microthrix parvicella", db = "biosample")
meta <- suppressWarnings(ncbi_get_meta(uids))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)

# with history, use history, multiple batches
uids <- ncbi_get_uid(
Expand All @@ -13,8 +17,12 @@ test_that("ncbi_get_meta() works with history", {
batch_size = 5
)
meta <- suppressWarnings(ncbi_get_meta(uids))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
expect_equal(nrow(meta), length(uids$uid))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)
testthat::expect_equal(nrow(meta$main), length(uids$uid))
})

test_that("ncbi_get_meta() works without history", {
Expand All @@ -25,36 +33,58 @@ test_that("ncbi_get_meta() works without history", {
use_history = TRUE
)
meta <- suppressWarnings(ncbi_get_meta(uids, use_history = FALSE))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)

# with history, do not use history, multiple batches
meta <- suppressWarnings(ncbi_get_meta(
uids,
use_history = FALSE,
batch_size = 5
))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)

# without history, attempt to use history but fall back, one batch
uids <- ncbi_get_uid(
"Microthrix parvicella",
db = "biosample",
use_history = FALSE
)
meta <- suppressWarnings(ncbi_get_meta(uids))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)
# only ids, one batch
meta <- suppressWarnings(ncbi_get_meta(
uids$uid,
db = "biosample"
))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)

# only ids, multiple batches
meta <- suppressWarnings(ncbi_get_meta(
uids$uid,
db = "biosample",
batch_size = 5
))
expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame"))
testthat::expect_true(inherits(meta, "list"))
testthat::expect_s3_class(
meta$main,
c("ncbi_meta", "tbl_df", "tbl", "data.frame")
)
})

test_that("ncbi_get_meta() works with all supported dbs", {
Expand Down
13 changes: 9 additions & 4 deletions tests/testthat/test-ncbi_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,20 @@ test_that("ncbi_parse() works with a BioSample", {
data(examples)
biosample_uid <- ncbi_get_uid(examples$biosample[1], db = "biosample")
res <- ncbi_get_meta(biosample_uid)
expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
expect_equal(res$biosample, "SAMN02714232")
testthat::expect_true(inherits(res, "list"))
testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame"))
expect_equal(res$main$biosample, "SAMN02714232")
})

test_that("ncbi_parse() works with all BioSamples", {
data(examples)
biosample_uid <- ncbi_get_uid(examples$biosample, db = "biosample")
res <- ncbi_get_meta(biosample_uid)
expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(res), length(examples$biosample))
testthat::expect_true(inherits(res, "list"))
testthat::expect_equal(length(res), 2)
testthat::expect_equal(names(res), c("main", "antibiogram"))
testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame"))
testthat::expect_s3_class(res$antibiogram, c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(res$main), length(examples$biosample))
})

0 comments on commit 147dfc8

Please sign in to comment.