From 3a4566162edff1064f1228228c2e2e9a288c0804 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 23 Dec 2024 22:21:54 +0100 Subject: [PATCH 1/6] Add function to extract ncbi biosample description table --- R/ncbi_parse_biosample_xml.R | 51 ++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index c4e6b4e..687030e 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -399,6 +399,57 @@ extract_description <- function( return(out) } +extract_description_table <- function( + x, + biosample, + verbose = getOption("verbose") + ) { + foo <- function (x, biosample, verbose) { + out <- data.frame() + if ("Comment" %in% names(x$Description)) { + if ("Table" %in% names(x$Description$Comment)) { + # works if there is a single table + # TODO find an example with multiple tables + table <- x$Description$Comment$Table + if ("Caption" %in% names(table)) { + caption <- unname(unlist(table$Caption)) + if (length(caption) == 1) { + headers <- table$Header |> unlist() |> unname() + out <- lapply(table$Body, function(x) { + df <- x |> t() |> as.data.frame() + names(df) <- headers + return(df) + }) |> dplyr::bind_rows() + out <- lapply(out, function(x) unname(unlist(x))) + out_lens <- sapply(out, length) + index <- which(out_lens == 0) + if (length(index) > 0) { + for (i in index) { + out[[i]] <- rep(NA, times = max(out_lens)) + } + } + out <- data.frame(out) |> tibble::as_tibble() + attr(out, "caption") <- tolower(caption) + } else { + stop("Multiple tables.") + } + } + } + } + return(out) + } + out <- try(foo(x, biosample, verbose), silent = TRUE) + if (inherits(out, "try-error") | is.null(out)) { + if (verbose) { + message(paste0( + "Could not extract Description table for BioSample ", biosample, "." + )) + } + stop() + } + return(out) +} + # test <- ncbi_get_uid("pathogen cl 1 0[filter]", "biosample") extract_package <- function( x, From f6cc529e06abe565ed1fc81cb9b70adcb33084a9 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 23 Dec 2024 22:23:30 +0100 Subject: [PATCH 2/6] When parsing a biosample entry return a list of data frames --- R/ncbi_parse_biosample_xml.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index 687030e..c2661f5 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -165,6 +165,18 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { )) } } + # if description contains a table, extract it + if (grepl("^WEBSEQ", out$description)) { + description_table <- extract_description_table(x, main_attrs$accession, verbose) + out <- list( + out, + description_table + ) + out <- setNames(out, c("meta", attributes(description_table)$caption)) + } else { + out <- list(out) + out <- setNames(out, "meta") + } return(out) } From fdea228f8a5c8101a6963d7edde3876be054351d Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Tue, 24 Dec 2024 13:19:21 +0100 Subject: [PATCH 3/6] Add BioSample IDs to Description Tables --- R/ncbi_parse_biosample_xml.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index c2661f5..27cdc5a 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -168,9 +168,16 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { # if description contains a table, extract it if (grepl("^WEBSEQ", out$description)) { description_table <- extract_description_table(x, main_attrs$accession, verbose) + out_tbl <- dplyr::bind_cols( + tibble::tibble( + biosample_uid = main_attrs$id, + biosample = main_attrs$accession + ), + description_table + ) out <- list( out, - description_table + out_tbl ) out <- setNames(out, c("meta", attributes(description_table)$caption)) } else { From 134e544eb5abc0b86518ee4314e8a5028a8e3ee4 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Thu, 26 Dec 2024 20:12:35 +0100 Subject: [PATCH 4/6] Add new function flatten() --- R/flatten.R | 34 ++++++++++++++++++++++++++++++++++ man/flatten.Rd | 19 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 R/flatten.R create mode 100644 man/flatten.Rd diff --git a/R/flatten.R b/R/flatten.R new file mode 100644 index 0000000..8b640ee --- /dev/null +++ b/R/flatten.R @@ -0,0 +1,34 @@ +#' Bind rows in a list of list of data frames +#' +#' This is a utility function that converts a list of lists, each containing +#' a names list of data frames into a single flat list of data frames. +#' @param x list; a list which contains a named list of data frames +#' @return a flat list of data frames where lists of data frames with the same +#' name are merged. +flatten <- function(x) { + is_named_list <- function(x) { + !is.null(names(x)) && all(names(x) != "") + } + #if (!is_named_list(x)) { + # stop("'x' must be a named list.") + #} + if (any(!sapply(x, is_named_list))) { + stop("Each element of 'x' must be a named list.") + } + df_names <- lapply(x, names) |> unlist() |> unique() + out <- setNames(lapply(df_names, function(x) tibble::tibble()), df_names) + for (i in seq_along(out)) { + df_name <- names(out)[i] + for (j in seq_along(x)) { + if (df_name %in% names(x[[j]])) { + out[[i]] <- dplyr::bind_rows( + out[[i]], + x[[j]][[names(out)[i]]] + ) + } else { + next() + } + } + } + return(out) +} \ No newline at end of file diff --git a/man/flatten.Rd b/man/flatten.Rd new file mode 100644 index 0000000..4864fd3 --- /dev/null +++ b/man/flatten.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flatten.R +\name{flatten} +\alias{flatten} +\title{Bind rows in a list of list of data frames} +\usage{ +flatten(x) +} +\arguments{ +\item{x}{list; a list which contains a named list of data frames} +} +\value{ +a flat list of data frames where lists of data frames with the same +name are merged. +} +\description{ +This is a utility function that converts a list of lists, each containing +a names list of data frames into a single flat list of data frames. +} From c43e8735819f2b3f1da1e0663bb89af7866af029 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Thu, 26 Dec 2024 20:13:34 +0100 Subject: [PATCH 5/6] Return list of data frames when parsing biosample metadata --- R/ncbi_get_meta.R | 9 +-------- R/ncbi_parse_biosample_xml.R | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/R/ncbi_get_meta.R b/R/ncbi_get_meta.R index 37e6f25..1bd716f 100644 --- a/R/ncbi_get_meta.R +++ b/R/ncbi_get_meta.R @@ -163,14 +163,7 @@ ncbi_get_meta <- function( if (verbose) { message("Attempting to parse retrieved metadata.") } - res_parsed <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose) - if (all("data.frame" %in% class(res_parsed))) { - out <- dplyr::bind_rows(res_parsed) - if (verbose) message("Done.") - } else { - if (verbose) message("Returning unparsed metadata.") - out <- res - } + out <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose) } else { out <- res } diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index 27cdc5a..a6ed16b 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -19,6 +19,7 @@ ncbi_parse_biosample_xml <- function( mc_cores <- get_mc_cores(mc_cores, verbose = verbose) biosample_df <- data.frame() if (verbose) message("Attempting to parse BioSample XMLs.") + result <- list() for (i in seq_along(biosample_xml)) { if (verbose) message( "BioSample XML ", i, " of ", length(biosample_xml), @@ -37,7 +38,7 @@ ncbi_parse_biosample_xml <- function( attributes(x)$accession }) if (verbose) message( - "List to data frame.. ", appendLF = FALSE + "List to a list of data frames.. ", appendLF = FALSE ) pfoo <- function(x) { entry <- try( @@ -85,16 +86,14 @@ ncbi_parse_biosample_xml <- function( out <- out[-index_failed] } if (verbose) message("Successful.") - out <- dplyr::bind_rows(out) - out <- dplyr::relocate(out, "biosample_uid") - out <- dplyr::relocate(out, "biosample", .after = "biosample_uid") - biosample_df <- dplyr::bind_rows(biosample_df, out) + result[[i]] <- webseq:::flatten(out) } - out <- tibble::as_tibble(biosample_df) - out <- out[, order(unname(sapply(out, function(x) sum(is.na(x)))))] - out <- dplyr::relocate(out, "biosample_uid") - out <- dplyr::relocate(out, "biosample", .after = "biosample_uid") - return(out) + result <- webseq:::flatten(result) + result$main <- result$main[, order(unname(sapply(result$main, function(x) sum(is.na(x)))))] + result$main <- result$main |> + dplyr::relocate(biosample_uid) |> + dplyr::relocate(biosample, .after = biosample_uid) + return(result) } @@ -179,10 +178,10 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { out, out_tbl ) - out <- setNames(out, c("meta", attributes(description_table)$caption)) + out <- setNames(out, c("main", attributes(description_table)$caption)) } else { out <- list(out) - out <- setNames(out, "meta") + out <- setNames(out, "main") } return(out) } From 37c9d8f59a668724022b5d8c1c70a2af89e61b99 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Thu, 26 Dec 2024 20:13:41 +0100 Subject: [PATCH 6/6] Update tests --- tests/testthat/test-ncbi_get_meta.R | 46 ++++++++++++++++++++++++----- tests/testthat/test-ncbi_parse.R | 13 +++++--- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/tests/testthat/test-ncbi_get_meta.R b/tests/testthat/test-ncbi_get_meta.R index d436209..b460278 100644 --- a/tests/testthat/test-ncbi_get_meta.R +++ b/tests/testthat/test-ncbi_get_meta.R @@ -4,7 +4,11 @@ test_that("ncbi_get_meta() works with history", { # with history, use history, one batch uids <- ncbi_get_uid("Microthrix parvicella", db = "biosample") meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # with history, use history, multiple batches uids <- ncbi_get_uid( @@ -13,8 +17,12 @@ test_that("ncbi_get_meta() works with history", { batch_size = 5 ) meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) - expect_equal(nrow(meta), length(uids$uid)) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + testthat::expect_equal(nrow(meta$main), length(uids$uid)) }) test_that("ncbi_get_meta() works without history", { @@ -25,14 +33,24 @@ test_that("ncbi_get_meta() works without history", { use_history = TRUE ) meta <- suppressWarnings(ncbi_get_meta(uids, use_history = FALSE)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + # with history, do not use history, multiple batches meta <- suppressWarnings(ncbi_get_meta( uids, use_history = FALSE, batch_size = 5 )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + # without history, attempt to use history but fall back, one batch uids <- ncbi_get_uid( "Microthrix parvicella", @@ -40,13 +58,21 @@ test_that("ncbi_get_meta() works without history", { use_history = FALSE ) meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # only ids, one batch meta <- suppressWarnings(ncbi_get_meta( uids$uid, db = "biosample" )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # only ids, multiple batches meta <- suppressWarnings(ncbi_get_meta( @@ -54,7 +80,11 @@ test_that("ncbi_get_meta() works without history", { db = "biosample", batch_size = 5 )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) }) test_that("ncbi_get_meta() works with all supported dbs", { diff --git a/tests/testthat/test-ncbi_parse.R b/tests/testthat/test-ncbi_parse.R index f5d5e5c..e836945 100644 --- a/tests/testthat/test-ncbi_parse.R +++ b/tests/testthat/test-ncbi_parse.R @@ -4,15 +4,20 @@ test_that("ncbi_parse() works with a BioSample", { data(examples) biosample_uid <- ncbi_get_uid(examples$biosample[1], db = "biosample") res <- ncbi_get_meta(biosample_uid) - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) - expect_equal(res$biosample, "SAMN02714232") + testthat::expect_true(inherits(res, "list")) + testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame")) + expect_equal(res$main$biosample, "SAMN02714232") }) test_that("ncbi_parse() works with all BioSamples", { data(examples) biosample_uid <- ncbi_get_uid(examples$biosample, db = "biosample") res <- ncbi_get_meta(biosample_uid) - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) - expect_equal(nrow(res), length(examples$biosample)) + testthat::expect_true(inherits(res, "list")) + testthat::expect_equal(length(res), 2) + testthat::expect_equal(names(res), c("main", "antibiogram")) + testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame")) + testthat::expect_s3_class(res$antibiogram, c("tbl_df", "tbl", "data.frame")) + expect_equal(nrow(res$main), length(examples$biosample)) })