diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index d0d401e..eacad3d 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -73,8 +73,11 @@ ncbi_parse_biosample_xml <- function( out <- dplyr::relocate(out, biosample, .after = biosample_uid) biosample_df <- dplyr::bind_rows(biosample_df, out) } - biosample_tbl <- tibble::as_tibble(biosample_df) - return(biosample_tbl) + out <- tibble::as_tibble(biosample_df) + out <- out[, order(unname(sapply(out, function(x) sum(is.na(x)))))] + out <- dplyr::relocate(out, biosample_uid) + out <- dplyr::relocate(out, biosample, .after = biosample_uid) + return(out) } @@ -133,6 +136,11 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { message(paste0("No status for BioSample ", main_attrs$accession, ".")) } } + if (all(c("geo", "geo_link") %in% names(out)) & nrow(out) == 1) { + if (grepl(out$geo, out$geo_link)) { + out <- dplyr::select(out, -geo_link) + } + } if (nrow(out) > 1) { if (verbose) { message(paste0( @@ -190,25 +198,31 @@ extract_attributes <- function(x, biosample, verbose = getOption("verbose")) { out <- data.frame( attr = unname(sapply(res, function(A) { attr_instance <- attributes(A) - if (!is.null(attr_instance$harmonized_name)) { + if ("harmonised_name" %in% names(attr_instance)) { attr_name <- attr_instance$harmonized_name - } else if ( - length(attr_instance) == 1 && - names(attr_instance) == "attribute_name") { + } else if ("display_name" %in% names(attr_instance)) { + attr_name <- attr_instance$display_name + } else if ("attribute_name" %in% names(attr_instance)) { attr_name <- attr_instance$attribute_name + } else { + stop() } + attr_name <- tolower(gsub(" +", "_", attr_name)) paste0("attr_", attr_name) })), value = unname(sapply(res, function(x) { if (length(x) == 0) { - return(NA) - } - if (length(x) == 1) { - return(unlist(x)) - } - if (length(x) > 1) { - return(paste(unlist(x), collapse = "|")) + val <- NA + } else { + val <- unlist(x) + if ("unit" %in% names(attributes(x))) { + val <- paste0(val, attributes(x)$unit) + } + if (length(val) > 1) { + val <- paste(val, collapse = "|") + } } + return(val) })) ) out <- dplyr::distinct(out) @@ -333,6 +347,10 @@ extract_description <- function( if ("Comment" %in% names(x$Description)) { if ("Paragraph" %in% names(x$Description$Comment)) { out <- unlist(x$Description$Comment$Paragraph) + # SAMN32317800 + if (is.null(out)) { + out <- NA + } } if ("Table" %in% names(x$Description$Comment)) { if (is.null(out)) { @@ -481,13 +499,19 @@ extract_links <- function( ) } if ("type" %in% names(attributes(A)) && attributes(A)$type == "url") { - longlinks <- dplyr::bind_rows( - longlinks, - data.frame( - target = attributes(A)$label, - label = unlist(A) + if (attributes(A)$label != "") { + target <- attributes(A)$label + if (grepl("^GEO Sample", target)) { + target <- "geo_link" + } + longlinks <- dplyr::bind_rows( + longlinks, + data.frame( + target = target, + label = unlist(A) + ) ) - ) + } } longlinks$target <- gsub(" +", "_", longlinks$target) return(longlinks) diff --git a/data/examples.rda b/data/examples.rda index e441081..0809093 100644 Binary files a/data/examples.rda and b/data/examples.rda differ diff --git a/scripts/prep_examples.R b/scripts/prep_examples.R index 960f208..b591ab6 100644 --- a/scripts/prep_examples.R +++ b/scripts/prep_examples.R @@ -20,9 +20,12 @@ examples <- list( "SAMEA14036741", "SAMN03863711", "SAMN39003160", - "SAMN01918814", #links - "SAMN02692951", #owner - "SAMN06220566" #ids + "SAMN01918814", # links + "SAMN02692951", # owner + "SAMN06220566", # ids + "SAMN32317800", # empty description + "SAMEA111396836", # unit in attributes + "SAMN14387582" ), gene = "948356", protein = "AAA27507.1", diff --git a/tests/testthat/test-ncbi_parse.R b/tests/testthat/test-ncbi_parse.R index 21666e1..f556f1b 100644 --- a/tests/testthat/test-ncbi_parse.R +++ b/tests/testthat/test-ncbi_parse.R @@ -1,8 +1,16 @@ -test_that("ncbi_parse() works with BioSamples", { +test_that("ncbi_parse() works with a BioSample", { data(examples) biosample_uid <- ncbi_get_uid(examples$biosample[1], db = "biosample") - meta_xml <- suppressWarnings(ncbi_get_meta(biosample_uid, parse = FALSE)) - res <- ncbi_parse(meta = meta_xml[[1]], db = "biosample", format = "xml") + res <- ncbi_get_meta(biosample_uid) expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) - expect_equal(res$biosample[1], "SAMN02714232") + expect_equal(res$biosample, "SAMN02714232") }) + +test_that("ncbi_parse() works with all BioSamples", { + data(examples) + biosample_uid <- ncbi_get_uid(examples$biosample, db = "biosample") + res <- ncbi_get_meta(biosample_uid) + expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) + expect_equal(nrow(res), length(examples$biosample)) +}) +