-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #88 from stitam/repos
Implement ID conversions between ENA studies and NCBI bioprojects
- Loading branch information
Showing
7 changed files
with
110 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#' Convert accession between ENA and NCBI
#'
#' Take a vector of accessions from either ENA or NCBI and convert them to
#' accessions in the other repository. All accessions are looked up with a
#' single POST request to the ENA Portal API search endpoint.
#' @param accessions character; a vector of ENA or NCBI accessions.
#' @param from character; the repository to convert from, either \code{"ena"}
#' or \code{"ncbi"}.
#' @param to character; the repository to convert to, either \code{"ena"} or
#' \code{"ncbi"}. It is validated but not otherwise used by the lookup.
#' @param type character; the name of the database in the from repository.
#' One of \code{"sample"} or \code{"study"} when \code{from = "ena"}, one of
#' \code{"biosample"} or \code{"bioproject"} when \code{from = "ncbi"}.
#' @return a tibble with character columns \code{ena} and \code{ncbi}, one row
#' per query accession that was found, in the order of the query. Accessions
#' that were not found are dropped and reported in a warning.
#' @noRd
convert_repo_accession <- function(accessions, from, to, type) {
  from <- match.arg(from, choices = c("ena", "ncbi"))
  to <- match.arg(to, choices = c("ena", "ncbi"))
  # Lookup table pairing each ENA database name with its NCBI counterpart.
  dict <- data.frame(
    ena = c("sample", "study"),
    ncbi = c("biosample", "bioproject")
  )
  if (from == "ena") {
    type <- match.arg(type, choices = dict$ena)
  } else {
    # The request is always phrased with ENA database names, so translate the
    # NCBI database name to its ENA counterpart.
    type <- match.arg(type, choices = dict$ncbi)
    type <- dict$ena[which(dict$ncbi == type)]
  }
  # Nothing to look up: return an empty result without contacting the API.
  if (length(accessions) == 0) {
    empty <- data.frame(ncbi = character(), ena = character())
    if (from == "ena") {
      empty <- dplyr::relocate(empty, "ena")
    }
    return(tibble::as_tibble(empty))
  }
  url <- "https://www.ebi.ac.uk/ena/portal/api/search"
  # ENA accessions are searched in the "secondary_<type>_accession" field,
  # NCBI accessions in the "<type>_accession" field.
  query <- paste0(type, '_accession="', accessions, '"')
  if (from == "ena") query <- paste0("secondary_", query)
  query_collapsed <- paste(query, collapse = " OR ")
  body <- list(
    result = type,
    query = query_collapsed,
    fields = paste0(type, "_accession,secondary_", type, "_accession"),
    format = "tsv"
  )
  res <- httr::POST(url, body = body, encode = "form")
  # Fail with the HTTP error instead of trying to parse an error body.
  httr::stop_for_status(res)
  # Always request the raw text so the TSV parsing below does not depend on
  # how httr would auto-parse the response content type.
  content <- httr::content(res, as = "text", encoding = "UTF-8")
  if (is.null(content) || !nzchar(trimws(content))) {
    # Empty response body: treat it as "no hits".
    df <- data.frame(ncbi = character(), ena = character())
  } else {
    df <- utils::read.table(text = content, header = TRUE, sep = "\t")
    df <- dplyr::rename(
      df,
      "ncbi" = paste0(type, "_accession"),
      "ena" = paste0("secondary_", type, "_accession")
    )
  }
  if (from == "ena") {
    df <- dplyr::relocate(df, "ena")
  }
  # A header-only response yields non-character columns; force character.
  df$ena <- as.character(df$ena)
  df$ncbi <- as.character(df$ncbi)
  found <- accessions %in% df[[from]]
  if (!all(found)) {
    not_found <- paste(accessions[!found], collapse = ", ")
    msg <- paste0("The following accessions were not found: ", not_found)
    warning(msg)
  }
  # Put the rows in query order: for each matched query accession pick the
  # row of the response that holds it.
  df <- df[match(accessions[found], df[[from]]), , drop = FALSE]
  tibble::as_tibble(df)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,36 +1,36 @@ | ||
# Tests for ena2ncbi(). Every call passes `type` explicitly; the superseded
# calls without `type` have been dropped so each result is computed only once.
test_that("ena2ncbi() works with a single query", {
  A <- ena2ncbi("ERS15941089", type = "sample")
  expect_equal(dim(A), c(1, 2))
  expect_equal(names(A), c("ena", "ncbi"))
  expect_equal(A$ena, "ERS15941089")
  expect_equal(A$ncbi, "SAMEA113946840")
})

test_that("ena2ncbi() works with multiple queries", {
  B <- ena2ncbi(c("ERS15941089", "ERS15939592"), type = "sample")
  expect_equal(dim(B), c(2, 2))
  expect_equal(B$ena, c("ERS15941089", "ERS15939592"))
  expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
})

test_that("ena2ncbi() returns results in the same order as the query", {
  C <- ena2ncbi(c("ERS15939592", "ERS15941089"), type = "sample")
  expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
  expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
})

test_that("ena2ncbi() returns an empty data frame when given an NCBI ID", {
  # The call is made twice: once for the value, once to capture the warning.
  D <- suppressWarnings(ena2ncbi("SAMEA111452506", type = "sample"))
  D_msg <- capture_warnings(ena2ncbi("SAMEA111452506", type = "sample"))
  expect_equal(dim(D), c(0, 2))
  expect_equal(class(D$ena), "character")
  expect_equal(class(D$ncbi), "character")
  expect_equal(
    D_msg,
    "The following accessions were not found: SAMEA111452506"
  )
})

test_that("ena2ncbi() removes invalid queries and returns the rest", {
  E <- suppressWarnings(
    ena2ncbi(c("ERS15939592", "balloon", "ERS15941089"), type = "sample")
  )
  expect_equal(dim(E), c(2, 2))
})
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,38 @@ | ||
# Tests for ncbi2ena(). Every call passes `type` explicitly; the superseded
# calls without `type` have been dropped so each result is computed only once.
test_that("ncbi2ena() works with a single query", {
  A <- ncbi2ena("SAMEA113946840", type = "biosample")
  expect_equal(dim(A), c(1, 2))
  expect_equal(names(A), c("ncbi", "ena"))
  expect_equal(A$ncbi, "SAMEA113946840")
  expect_equal(A$ena, "ERS15941089")
})

test_that("ncbi2ena() works with multiple queries", {
  B <- ncbi2ena(c("SAMEA113946840", "SAMEA113945342"), type = "biosample")
  expect_equal(dim(B), c(2, 2))
  expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
  expect_equal(B$ena, c("ERS15941089", "ERS15939592"))
})

test_that("ncbi2ena() returns results in the same order as the query", {
  C <- ncbi2ena(c("SAMEA113945342", "SAMEA113946840"), type = "biosample")
  expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
  expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
})

test_that("ncbi2ena() returns an empty data frame when given an ENA ID", {
  # The call is made twice: once for the value, once to capture the warning.
  D <- suppressWarnings(ncbi2ena("ERS15941089", type = "biosample"))
  D_msg <- capture_warnings(ncbi2ena("ERS15941089", type = "biosample"))
  expect_equal(dim(D), c(0, 2))
  expect_equal(class(D$ncbi), "character")
  expect_equal(class(D$ena), "character")
  expect_equal(D_msg, "The following accessions were not found: ERS15941089")
})

test_that("ncbi2ena() removes invalid queries and returns the rest", {
  E <- suppressWarnings(ncbi2ena(
    c("SAMEA113945342", "balloon", "SAMEA113946840"),
    type = "biosample"
  ))
  expect_equal(dim(E), c(2, 2))
})