Skip to content

Commit

Permalink
Merge pull request #61 from stitam/biosample
Browse files Browse the repository at this point in the history
Fix NCBI BioSample metadata parsing sort metadata columns by data availability
  • Loading branch information
stitam authored Feb 8, 2024
2 parents 6615f23 + d4cef7c commit f7d8810
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 26 deletions.
62 changes: 43 additions & 19 deletions R/ncbi_parse_biosample_xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,11 @@ ncbi_parse_biosample_xml <- function(
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
biosample_df <- dplyr::bind_rows(biosample_df, out)
}
biosample_tbl <- tibble::as_tibble(biosample_df)
return(biosample_tbl)
out <- tibble::as_tibble(biosample_df)
out <- out[, order(unname(sapply(out, function(x) sum(is.na(x)))))]
out <- dplyr::relocate(out, biosample_uid)
out <- dplyr::relocate(out, biosample, .after = biosample_uid)
return(out)
}


Expand Down Expand Up @@ -133,6 +136,11 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
message(paste0("No status for BioSample ", main_attrs$accession, "."))
}
}
if (all(c("geo", "geo_link") %in% names(out)) & nrow(out) == 1) {
if (grepl(out$geo, out$geo_link)) {
out <- dplyr::select(out, -geo_link)
}
}
if (nrow(out) > 1) {
if (verbose) {
message(paste0(
Expand Down Expand Up @@ -190,25 +198,31 @@ extract_attributes <- function(x, biosample, verbose = getOption("verbose")) {
out <- data.frame(
attr = unname(sapply(res, function(A) {
attr_instance <- attributes(A)
if (!is.null(attr_instance$harmonized_name)) {
if ("harmonised_name" %in% names(attr_instance)) {
attr_name <- attr_instance$harmonized_name
} else if (
length(attr_instance) == 1 &&
names(attr_instance) == "attribute_name") {
} else if ("display_name" %in% names(attr_instance)) {
attr_name <- attr_instance$display_name
} else if ("attribute_name" %in% names(attr_instance)) {
attr_name <- attr_instance$attribute_name
} else {
stop()
}
attr_name <- tolower(gsub(" +", "_", attr_name))
paste0("attr_", attr_name)
})),
value = unname(sapply(res, function(x) {
if (length(x) == 0) {
return(NA)
}
if (length(x) == 1) {
return(unlist(x))
}
if (length(x) > 1) {
return(paste(unlist(x), collapse = "|"))
val <- NA
} else {
val <- unlist(x)
if ("unit" %in% names(attributes(x))) {
val <- paste0(val, attributes(x)$unit)
}
if (length(val) > 1) {
val <- paste(val, collapse = "|")
}
}
return(val)
}))
)
out <- dplyr::distinct(out)
Expand Down Expand Up @@ -333,6 +347,10 @@ extract_description <- function(
if ("Comment" %in% names(x$Description)) {
if ("Paragraph" %in% names(x$Description$Comment)) {
out <- unlist(x$Description$Comment$Paragraph)
# SAMN32317800
if (is.null(out)) {
out <- NA
}
}
if ("Table" %in% names(x$Description$Comment)) {
if (is.null(out)) {
Expand Down Expand Up @@ -481,13 +499,19 @@ extract_links <- function(
)
}
if ("type" %in% names(attributes(A)) && attributes(A)$type == "url") {
longlinks <- dplyr::bind_rows(
longlinks,
data.frame(
target = attributes(A)$label,
label = unlist(A)
if (attributes(A)$label != "") {
target <- attributes(A)$label
if (grepl("^GEO Sample", target)) {
target <- "geo_link"
}
longlinks <- dplyr::bind_rows(
longlinks,
data.frame(
target = target,
label = unlist(A)
)
)
)
}
}
longlinks$target <- gsub(" +", "_", longlinks$target)
return(longlinks)
Expand Down
Binary file modified data/examples.rda
Binary file not shown.
9 changes: 6 additions & 3 deletions scripts/prep_examples.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ examples <- list(
"SAMEA14036741",
"SAMN03863711",
"SAMN39003160",
"SAMN01918814", #links
"SAMN02692951", #owner
"SAMN06220566" #ids
"SAMN01918814", # links
"SAMN02692951", # owner
"SAMN06220566", # ids
"SAMN32317800", # empty description
"SAMEA111396836", # unit in attributes
"SAMN14387582"
),
gene = "948356",
protein = "AAA27507.1",
Expand Down
16 changes: 12 additions & 4 deletions tests/testthat/test-ncbi_parse.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
test_that("ncbi_parse() works with BioSamples", {
test_that("ncbi_parse() works with a BioSample", {
data(examples)
biosample_uid <- ncbi_get_uid(examples$biosample[1], db = "biosample")
meta_xml <- suppressWarnings(ncbi_get_meta(biosample_uid, parse = FALSE))
res <- ncbi_parse(meta = meta_xml[[1]], db = "biosample", format = "xml")
res <- ncbi_get_meta(biosample_uid)
expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
expect_equal(res$biosample[1], "SAMN02714232")
expect_equal(res$biosample, "SAMN02714232")
})

test_that("ncbi_parse() works with all BioSamples", {
data(examples)
biosample_uid <- ncbi_get_uid(examples$biosample, db = "biosample")
res <- ncbi_get_meta(biosample_uid)
expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(res), length(examples$biosample))
})

0 comments on commit f7d8810

Please sign in to comment.