Merge pull request #88 from stitam/repos

Implement ID conversions between ENA studies and NCBI bioprojects
stitam · Nov 29, 2024 · 104010d · 104010d
2 parents 9af6e75 + 8f221c2
commit 104010d
Show file tree

Hide file tree

Showing 7 changed files with 110 additions and 91 deletions.
diff --git a/R/convert_repo_accession.R b/R/convert_repo_accession.R
@@ -0,0 +1,63 @@
+#' Convert accession between ENA and NCBI
+#' 
+#' Take a vector of accessions from either ENA or NCBI and convert them to 
+#' accessions in the other repository.
+#' @param accessions character; a vector of ENA or NCBI accessions.
+#' @param from character; the repository to convert from.
+#' @param to character; the repository to convert to.
+#' @param type character; the name of the database in the from repository.
+#' @return a tibble.
+#' @noRd
+convert_repo_accession <- function(accessions, from, to, type) {
+  from <- match.arg(from, choices = c("ena", "ncbi"))
+  to <- match.arg(to, choices = c("ena", "ncbi"))
+  # NOTE(review): `to` is validated but not used afterwards; `from` alone
+  # selects the query field and the column order. Confirm this is intended.
+  # Each row pairs an ENA database name with its NCBI counterpart.
+  dict <- data.frame(
+    ena = c("sample", "study"),
+    ncbi = c("biosample", "bioproject")
+  )
+  # Validate `type` against the `from` vocabulary, then map it to the ENA name
+  # because the ENA Portal API is queried in both directions.
+  type <- match.arg(type, choices = dict[[from]])
+  type <- dict$ena[which(dict[[from]] == type)]
+  url <- "https://www.ebi.ac.uk/ena/portal/api/search"
+  # ENA ids are in the "secondary_" field; NCBI ids are in the plain field.
+  query <- paste0(type, '_accession="', accessions, '"')
+  if (from == "ena") query <- paste0("secondary_", query)
+  query_collapsed <- paste(query, collapse = " OR ")
+  body <- list(
+    result = type,
+    query = query_collapsed,
+    fields = paste0(type, '_accession,secondary_', type, '_accession'),
+    format = 'tsv'
+  )
+  res <- httr::POST(url, body = body, encode = "form")
+  # Fail with the HTTP status instead of parsing an error body as TSV.
+  httr::stop_for_status(res)
+  content <- httr::content(res, as = "text", encoding = "utf-8")
+  df <- utils::read.table(text = content, header = TRUE, sep = "\t") %>%
+    dplyr::rename(
+      "ncbi" = paste0(type, "_accession"),
+      "ena" = paste0("secondary_", type, "_accession")
+    )
+  # The column of the source repository goes first.
+  if (from == "ena") {
+    df <- df %>% dplyr::relocate("ena")
+  }
+  df$ena <- as.character(df$ena)
+  df$ncbi <- as.character(df$ncbi)
+  # Flag queried accessions that are absent from the response.
+  found <- accessions %in% df[[from]]
+  if (!all(found)) {
+    missing_collapsed <- paste(accessions[!found], collapse = ", ")
+    msg <- paste0("The following accessions were not found: ", missing_collapsed)
+    warning(msg)
+  }
+  # Put rows in query order by looking up each queried accession's row.
+  # `order()` gives the permutation that sorts the query, not row positions.
+  df <- df[match(accessions[found], df[[from]]), ]
+  tbl <- tibble::as_tibble(df)
+  return(tbl)
+}
diff --git a/R/ena2ncbi.R b/R/ena2ncbi.R
@@ -2,47 +2,23 @@
 #'
 #' Take a vector of ENA accessions and convert them to NCBI accessions.
 #' @param accessions character; a vector or ENA accessions.
-#' @param type character; type of accessions. Currently only sample accessions
-#' are supported.
+#' @param type character; type of accessions. Supported types: `sample`, 
+#' `study`.
 #' @return A tibble with two columns, `ena` and `ncbi`.
 #' @examples
-#' ena2ncbi("ERS3202441")
-#' ena2ncbi(c("ERS3202441", "ERS3202442"))
+#' ena2ncbi("ERS3202441", type = "sample")
+#' ena2ncbi(c("ERS3202441", "ERS3202442"), type = "sample")
+#' ena2ncbi("ERP161024", type = "study")
 #' @importFrom magrittr %>%
 #' @export
-ena2ncbi <- function(accessions, type = "sample") {
-  type <- match.arg(type, choices = "sample")
-  url <- "https://www.ebi.ac.uk/ena/portal/api/search"
-  query <- paste0('secondary_sample_accession="', accessions, '"')
-  query_collapsed <- paste(query, collapse = " OR ")
-  body <- list(
-    result = 'sample',
-    query = query_collapsed,
-    fields = 'sample_accession,secondary_sample_accession',
-    format = 'tsv'
+ena2ncbi <- function(accessions, type) {
+  convert_repo_accession(
+    accessions = accessions,
+    from = "ena",
+    to = "ncbi",
+    type = type
   )
-  res <- httr::POST(url, body = body, encode = "form")
-  content <- httr::content(res, encoding = "utf-8")
-  df <- utils::read.table(text = content, header = TRUE, sep = "\t") %>%
-    dplyr::rename(
-      "ncbi" = "sample_accession",
-      "ena" = "secondary_sample_accession"
-    ) %>%
-    dplyr::relocate("ena")
-  df$ena <- as.character(df$ena)
-  df$ncbi <- as.character(df$ncbi)
-  index <- which(accessions %in% df$ena)
-  if (length(index) < length(accessions)) {
-    missing <- accessions[which(!accessions %in% df$ena)]
-    missing_collapsed <- paste(missing, collapse = ", ")
-    msg <- paste0("The following accessions were not found: ", missing_collapsed)
-    warning(msg)
-  }
-  df <- df[order(accessions[index]), ]
-  tbl <- tibble::as_tibble(df)
-  return(tbl)
 }
 
 # todo: function automatically sorts results, remove this
 # todo: test if invalid and valid entries are mixed
-# todo: add tests for ncbi2ena
diff --git a/R/ncbi2ena.R b/R/ncbi2ena.R
@@ -2,42 +2,18 @@
 #'
 #' Take a vector of NCBI accessions and convert them to ENA accessions.
 #' @param accessions character; a vector or ENA accessions.
-#' @param type character; type of accessions. Currently only sample accessions
-#' are supported.
+#' @param type character; type of accessions. Supported types: `biosample`, 
+#' `bioproject`.
 #' @return A tibble with two columns, `ncbi` and `ena`.
 #' @examples
-#' ncbi2ena("SAMEA111452506")
+#' ncbi2ena("SAMEA111452506", type = "biosample")
 #' @importFrom magrittr %>%
 #' @export
-ncbi2ena <- function(accessions, type = "sample") {
-  type <- match.arg(type, choices = "sample")
-  url <- "https://www.ebi.ac.uk/ena/portal/api/search"
-  query <- paste0('sample_accession="', accessions, '"')
-  query_collapsed <- paste(query, collapse = " OR ")
-  body <- list(
-    result = 'sample',
-    query = query_collapsed,
-    fields = 'sample_accession,secondary_sample_accession',
-    format = 'tsv'
+ncbi2ena <- function(accessions, type) {
+  convert_repo_accession(
+    accessions = accessions,
+    from = "ncbi",
+    to = "ena",
+    type = type
   )
-  res <- httr::POST(url, body = body, encode = "form")
-  content <- httr::content(res, encoding = "utf-8")
-  df <- utils::read.table(text = content, header = TRUE, sep = "\t") %>%
-    dplyr::rename(
-      "ncbi" = "sample_accession",
-      "ena" = "secondary_sample_accession"
-    ) %>%
-    dplyr::relocate("ncbi")
-  df$ncbi <- as.character(df$ncbi)
-  df$ena <- as.character(df$ena)
-  index <- which(accessions %in% df$ncbi)
-  if (length(index) < length(accessions)) {
-    missing <- accessions[which(!accessions %in% df$ncbi)]
-    missing_collapsed <- paste(missing, collapse = ", ")
-    msg <- paste0("The following accessions were not found: ", missing_collapsed)
-    warning(msg)
-  }
-  df <- df[order(accessions[index]), ]
-  tbl <- tibble::as_tibble(df)
-  return(tbl)
 }
diff --git a/man/ena2ncbi.Rd b/man/ena2ncbi.Rd
diff --git a/man/ncbi2ena.Rd b/man/ncbi2ena.Rd
diff --git a/tests/testthat/test-ena2ncbi.R b/tests/testthat/test-ena2ncbi.R
@@ -1,36 +1,36 @@
 test_that("ena2ncbi() works with a single query", {
-  A <- ena2ncbi("ERS15941089")
+  A <- ena2ncbi("ERS15941089", type = "sample")
   expect_equal(dim(A), c(1,2))
   expect_equal(names(A), c("ena", "ncbi"))
   expect_equal(A$ena, "ERS15941089")
   expect_equal(A$ncbi, "SAMEA113946840")
 })
 
 test_that("ena2ncbi() works with multiple queries", {
-  B <- ena2ncbi(c("ERS15941089", "ERS15939592"))
+  B <- ena2ncbi(c("ERS15941089", "ERS15939592"), type = "sample")
   expect_equal(dim(B), c(2,2))
   expect_equal(B$ena, c("ERS15941089", "ERS15939592"))
   expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
 })
 
 test_that("ena2ncbi() returns results in the same order as the query", {
-  C <- ena2ncbi(c("ERS15939592", "ERS15941089"))
+  C <- ena2ncbi(c("ERS15939592", "ERS15941089"), type = "sample")
   expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
   expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
 })
 
 test_that("ena2ncbi() returns an empty data frame when given an NCBI ID", {
-  D <- suppressWarnings(ena2ncbi("SAMEA111452506"))
-  D_msg <- capture_warnings(ena2ncbi("SAMEA111452506"))
+  D <- suppressWarnings(ena2ncbi("SAMEA111452506", type = "sample"))
+  D_msg <- capture_warnings(ena2ncbi("SAMEA111452506", type = "sample"))
   expect_equal(dim(D), c(0,2))
   expect_equal(class(D$ena), "character")
   expect_equal(class(D$ncbi), "character")
   expect_equal(D_msg, "The following accessions were not found: SAMEA111452506")
 })
 
 test_that("ena2ncbi() removes invalid queries and returns the rest", {
-  E <- suppressWarnings(ena2ncbi(c("ERS15939592", "balloon", "ERS15941089")))
+  E <- suppressWarnings(
+    ena2ncbi(c("ERS15939592", "balloon", "ERS15941089"), type = "sample")
+  )
   expect_equal(dim(E), c(2,2))
 })
-
-
diff --git a/tests/testthat/test-ncbi2ena.R b/tests/testthat/test-ncbi2ena.R
@@ -1,35 +1,38 @@
 test_that("ncbi2ena() works with a single query", {
-  A <- ncbi2ena("SAMEA113946840")
+  A <- ncbi2ena("SAMEA113946840", type = "biosample")
   expect_equal(dim(A), c(1,2))
   expect_equal(names(A), c("ncbi", "ena"))
   expect_equal(A$ncbi, "SAMEA113946840")
   expect_equal(A$ena, "ERS15941089")
 })
 
 test_that("ncbi2ena() works with multiple queries", {
-  B <- ncbi2ena(c("SAMEA113946840", "SAMEA113945342"))
+  B <- ncbi2ena(c("SAMEA113946840", "SAMEA113945342"), type = "biosample")
   expect_equal(dim(B), c(2,2))
   expect_equal(B$ncbi, c("SAMEA113946840", "SAMEA113945342"))
   expect_equal(B$ena, c("ERS15941089", "ERS15939592"))
 
 })
 
 test_that("ncbi2ena() returns results in the same order as the query", {
-  C <- ncbi2ena(c("SAMEA113945342", "SAMEA113946840"))
+  C <- ncbi2ena(c("SAMEA113945342", "SAMEA113946840"), type = "biosample")
   expect_equal(C$ncbi, c("SAMEA113945342", "SAMEA113946840"))
   expect_equal(C$ena, c("ERS15939592", "ERS15941089"))
 })
 
 test_that("ncbi2ena() returns an empty data frame when given an ENA ID", {
-  D <- suppressWarnings(ncbi2ena("ERS15941089"))
-  D_msg <- capture_warnings(ncbi2ena("ERS15941089"))
+  D <- suppressWarnings(ncbi2ena("ERS15941089", type = "biosample"))
+  D_msg <- capture_warnings(ncbi2ena("ERS15941089", type = "biosample"))
   expect_equal(dim(D), c(0,2))
   expect_equal(class(D$ncbi), "character")
   expect_equal(class(D$ena), "character")
   expect_equal(D_msg, "The following accessions were not found: ERS15941089")
 })
 
 test_that("ncbi2ena() removes invalid queries and returns the rest", {
-  E <- suppressWarnings(ncbi2ena(c("SAMEA113945342", "balloon", "SAMEA113946840")))
+  E <- suppressWarnings(ncbi2ena(
+      c("SAMEA113945342", "balloon", "SAMEA113946840"),
+      type = "biosample"
+  ))
   expect_equal(dim(E), c(2,2))
 })