Skip to content

Commit

Permalink
Merge pull request #88 from stitam/repos
Browse files Browse the repository at this point in the history
Implement ID conversions between ENA studies and NCBI bioprojects
  • Loading branch information
stitam authored Nov 29, 2024
2 parents 9af6e75 + 8f221c2 commit 104010d
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 91 deletions.
63 changes: 63 additions & 0 deletions R/convert_repo_accession.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#' Convert accessions between ENA and NCBI
#'
#' Take a vector of accessions from either ENA or NCBI and convert them to
#' accessions in the other repository. All accessions are looked up with a
#' single POST request to the ENA Portal API search endpoint.
#' @param accessions character; a vector of ENA or NCBI accessions.
#' @param from character; the repository to convert from, either `"ena"` or
#' `"ncbi"`.
#' @param to character; the repository to convert to, either `"ena"` or
#' `"ncbi"`. Must differ from `from`.
#' @param type character; the name of the database in the `from` repository.
#' One of `"sample"`, `"study"` when `from = "ena"`, or one of `"biosample"`,
#' `"bioproject"` when `from = "ncbi"`.
#' @return a tibble with character columns `ena` and `ncbi`. The `from` column
#' comes first and rows follow the order of `accessions`. Accessions that were
#' not found are left out and reported in a warning.
#' @noRd
convert_repo_accession <- function(accessions, from, to, type) {
  repos <- c("ena", "ncbi")
  from <- match.arg(from, choices = repos)
  to <- match.arg(to, choices = repos)
  if (from == to) {
    stop("`from` and `to` must be different repositories.", call. = FALSE)
  }
  # Each row pairs an ENA database name with its NCBI counterpart.
  dict <- data.frame(
    ena = c("sample", "study"),
    ncbi = c("biosample", "bioproject")
  )
  type <- match.arg(type, choices = dict[[from]])
  # The request is always phrased with the ENA database name.
  type <- dict$ena[dict[[from]] == type]
  url <- "https://www.ebi.ac.uk/ena/portal/api/search"
  # The "<type>_accession" field is reported as the NCBI accession and the
  # "secondary_<type>_accession" field as the ENA accession.
  ncbi_field <- paste0(type, "_accession")
  ena_field <- paste0("secondary_", type, "_accession")
  query_field <- if (from == "ena") ena_field else ncbi_field
  query <- paste0(query_field, '="', accessions, '"')
  body <- list(
    result = type,
    query = paste(query, collapse = " OR "),
    fields = paste(ncbi_field, ena_field, sep = ","),
    format = "tsv"
  )
  res <- httr::POST(url, body = body, encode = "form")
  # Ask for the raw text explicitly so parsing does not depend on the
  # content type reported by the server.
  content <- httr::content(res, as = "text", encoding = "UTF-8")
  df <- utils::read.table(text = content, header = TRUE, sep = "\t")
  df <- dplyr::rename(
    df,
    "ncbi" = dplyr::all_of(ncbi_field),
    "ena" = dplyr::all_of(ena_field)
  )
  df <- dplyr::relocate(df, dplyr::all_of(c(from, to)))
  # An empty result is read as logical columns; force character either way.
  df$ena <- as.character(df$ena)
  df$ncbi <- as.character(df$ncbi)
  not_found <- accessions[!accessions %in% df[[from]]]
  if (length(not_found) > 0) {
    msg <- paste0(
      "The following accessions were not found: ",
      paste(not_found, collapse = ", ")
    )
    warning(msg, call. = FALSE)
  }
  # Sort rows by the position of their accession in the query. Unlike
  # order(accessions), this does not assume any ordering of the response.
  query_position <- match(df[[from]], accessions)
  df <- df[order(query_position), , drop = FALSE]
  tibble::as_tibble(df)
}
46 changes: 11 additions & 35 deletions R/ena2ncbi.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,23 @@
#'
#' Take a vector of ENA accessions and convert them to NCBI accessions.
#' @param accessions character; a vector of ENA accessions.
#' @param type character; type of accessions. Currently only sample accessions
#' are supported.
#' @param type character; type of accessions. Supported types: `sample`,
#' `study`.
#' @return A tibble with two columns, `ena` and `ncbi`.
#' @examples
#' ena2ncbi("ERS3202441")
#' ena2ncbi(c("ERS3202441", "ERS3202442"))
#' ena2ncbi("ERS3202441", type = "sample")
#' ena2ncbi(c("ERS3202441", "ERS3202442"), type = "sample")
#' ena2ncbi("ERP161024", type = "study")
#' @importFrom magrittr %>%
#' @export
ena2ncbi <- function(accessions, type = "sample") {
type <- match.arg(type, choices = "sample")
url <- "https://www.ebi.ac.uk/ena/portal/api/search"
query <- paste0('secondary_sample_accession="', accessions, '"')
query_collapsed <- paste(query, collapse = " OR ")
body <- list(
result = 'sample',
query = query_collapsed,
fields = 'sample_accession,secondary_sample_accession',
format = 'tsv'
ena2ncbi <- function(accessions, type) {
convert_repo_accession(
accessions = accessions,
from = "ena",
to = "ncbi",
type = type
)
res <- httr::POST(url, body = body, encode = "form")
content <- httr::content(res, encoding = "utf-8")
df <- utils::read.table(text = content, header = TRUE, sep = "\t") %>%
dplyr::rename(
"ncbi" = "sample_accession",
"ena" = "secondary_sample_accession"
) %>%
dplyr::relocate("ena")
df$ena <- as.character(df$ena)
df$ncbi <- as.character(df$ncbi)
index <- which(accessions %in% df$ena)
if (length(index) < length(accessions)) {
missing <- accessions[which(!accessions %in% df$ena)]
missing_collapsed <- paste(missing, collapse = ", ")
msg <- paste0("The following accessions were not found: ", missing_collapsed)
warning(msg)
}
df <- df[order(accessions[index]), ]
tbl <- tibble::as_tibble(df)
return(tbl)
}

# todo: function automatically sorts results, remove this
# todo: test if invalid and valid entries are mixed
# todo: add tests for ncbi2ena
42 changes: 9 additions & 33 deletions R/ncbi2ena.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,18 @@
#'
#' Take a vector of NCBI accessions and convert them to ENA accessions.
#' @param accessions character; a vector of NCBI accessions.
#' @param type character; type of accessions. Currently only sample accessions
#' are supported.
#' @param type character; type of accessions. Supported types: `biosample`,
#' `bioproject`.
#' @return A tibble with two columns, `ncbi` and `ena`.
#' @examples
#' ncbi2ena("SAMEA111452506")
#' ncbi2ena("SAMEA111452506", type = "biosample")
#' @importFrom magrittr %>%
#' @export
ncbi2ena <- function(accessions, type = "sample") {
type <- match.arg(type, choices = "sample")
url <- "https://www.ebi.ac.uk/ena/portal/api/search"
query <- paste0('sample_accession="', accessions, '"')
query_collapsed <- paste(query, collapse = " OR ")
body <- list(
result = 'sample',
query = query_collapsed,
fields = 'sample_accession,secondary_sample_accession',
format = 'tsv'
ncbi2ena <- function(accessions, type) {
convert_repo_accession(
accessions = accessions,
from = "ncbi",
to = "ena",
type = type
)
res <- httr::POST(url, body = body, encode = "form")
content <- httr::content(res, encoding = "utf-8")
df <- utils::read.table(text = content, header = TRUE, sep = "\t") %>%
dplyr::rename(
"ncbi" = "sample_accession",
"ena" = "secondary_sample_accession"
) %>%
dplyr::relocate("ncbi")
df$ncbi <- as.character(df$ncbi)
df$ena <- as.character(df$ena)
index <- which(accessions %in% df$ncbi)
if (length(index) < length(accessions)) {
missing <- accessions[which(!accessions %in% df$ncbi)]
missing_collapsed <- paste(missing, collapse = ", ")
msg <- paste0("The following accessions were not found: ", missing_collapsed)
warning(msg)
}
df <- df[order(accessions[index]), ]
tbl <- tibble::as_tibble(df)
return(tbl)
}
11 changes: 6 additions & 5 deletions man/ena2ncbi.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/ncbi2ena.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions tests/testthat/test-ena2ncbi.R
Original file line number Diff line number Diff line change
@@ -1,36 +1,36 @@
test_that("ena2ncbi() works with a single query", {
A <- ena2ncbi("ERS15941089")
A <- ena2ncbi("ERS15941089", type = "sample")
expect_equal(dim(A), c(1,2))
expect_equal(names(A), c("ena", "ncbi"))
expect_equal(A$ena, "ERS15941089")
expect_equal(A$ncbi, "SAMEA113946840")
})

test_that("ena2ncbi() works with multiple queries", {
B <- ena2ncbi(c("ERS15941089", "ERS15939592"))
B <- ena2ncbi(c("ERS15941089", "ERS15939592"), type = "sample")
expect_equal(dim(B), c(2,2))
expect_equal(B$ena, c("ERS15941089", "ERS15939592"))
expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
})

test_that("ena2ncbi() returns results in the same order as the query", {
C <- ena2ncbi(c("ERS15939592", "ERS15941089"))
C <- ena2ncbi(c("ERS15939592", "ERS15941089"), type = "sample")
expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
})

test_that("ena2ncbi() returns an empty data frame when given an NCBI ID", {
D <- suppressWarnings(ena2ncbi("SAMEA111452506"))
D_msg <- capture_warnings(ena2ncbi("SAMEA111452506"))
D <- suppressWarnings(ena2ncbi("SAMEA111452506", type = "sample"))
D_msg <- capture_warnings(ena2ncbi("SAMEA111452506", type = "sample"))
expect_equal(dim(D), c(0,2))
expect_equal(class(D$ena), "character")
expect_equal(class(D$ncbi), "character")
expect_equal(D_msg, "The following accessions were not found: SAMEA111452506")
})

test_that("ena2ncbi() removes invalid queries and returns the rest", {
E <- suppressWarnings(ena2ncbi(c("ERS15939592", "balloon", "ERS15941089")))
E <- suppressWarnings(
ena2ncbi(c("ERS15939592", "balloon", "ERS15941089"), type = "sample")
)
expect_equal(dim(E), c(2,2))
})


15 changes: 9 additions & 6 deletions tests/testthat/test-ncbi2ena.R
Original file line number Diff line number Diff line change
@@ -1,35 +1,38 @@
test_that("ncbi2ena() works with a single query", {
A <- ncbi2ena("SAMEA113946840")
A <- ncbi2ena("SAMEA113946840", type = "biosample")
expect_equal(dim(A), c(1,2))
expect_equal(names(A), c("ncbi", "ena"))
expect_equal(A$ncbi, "SAMEA113946840")
expect_equal(A$ena, "ERS15941089")
})

test_that("ncbi2ena() works with multiple queries", {
B <- ncbi2ena(c("SAMEA113946840", "SAMEA113945342"))
B <- ncbi2ena(c("SAMEA113946840", "SAMEA113945342"), type = "biosample")
expect_equal(dim(B), c(2,2))
expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
expect_equal(B$ena, c("ERS15941089", "ERS15939592"))

})

test_that("ncbi2ena() returns results in the same order as the query", {
C <- ncbi2ena(c("SAMEA113945342", "SAMEA113946840"))
C <- ncbi2ena(c("SAMEA113945342", "SAMEA113946840"), type = "biosample")
expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
})

test_that("ncbi2ena() returns an empty data frame when given an ENA ID", {
D <- suppressWarnings(ncbi2ena("ERS15941089"))
D_msg <- capture_warnings(ncbi2ena("ERS15941089"))
D <- suppressWarnings(ncbi2ena("ERS15941089", type = "biosample"))
D_msg <- capture_warnings(ncbi2ena("ERS15941089", type = "biosample"))
expect_equal(dim(D), c(0,2))
expect_equal(class(D$ncbi), "character")
expect_equal(class(D$ena), "character")
expect_equal(D_msg, "The following accessions were not found: ERS15941089")
})

test_that("ncbi2ena() removes invalid queries and returns the rest", {
E <- suppressWarnings(ncbi2ena(c("SAMEA113945342", "balloon", "SAMEA113946840")))
E <- suppressWarnings(ncbi2ena(
c("SAMEA113945342", "balloon", "SAMEA113946840"),
type = "biosample"
))
expect_equal(dim(E), c(2,2))
})

0 comments on commit 104010d

Please sign in to comment.