stitam · stitam · Aug 22, 2024 · Jul 23, 2024 · Jul 23, 2024 · Jul 29, 2024
diff --git a/R/ncbi_download_genome.R b/R/ncbi_download_genome.R
@@ -1,8 +1,8 @@
 #' Download Genomes from NCBI Assembly Database
 #'
 #' This function directly downloads genome data through the NCBI FTP server.
-#' @param query either an object of class \code{ncbi_uid} or an integer vector 
-#' of NCBI Assembly UIDs. See Details for more information.
+#' @param query an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or 
+#' an integer vector of NCBI Assembly UIDs. See Details for more information.
 #' @param type character; the file extension to download. Valid options are
 #' \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"},
 #' \code{"feature_count"}, \code{"feature_table"}, \code{"genomic.fna"},
@@ -13,34 +13,41 @@
 #' @param mirror logical; should the download directory mirror the structure of 
 #' the FTP directory?
 #' @param verbose logical; should verbose messages be printed to console?
-#' @details Some functions in webseq, e.g. \code{ncbi_get_uid()} or
-#' \code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These
-#' objects may be used directly as query input for
-#' \code{ncbi_download_genome()}. It is recommended to use this approach because
-#' then the function will check whether the query really contains UIDs from the
-#' NCBI Assembly database and fail if not. Alternatively, you can also use a
-#' character vector of UIDs as query input but in this case there will be no
-#' consistency checks and the function will just attempt to interpret them as
-#' NCBI Assembly UIDs.
+#' @details `ncbi_get_uid()` returns an object of class `ncbi_uid`; 
+#' `ncbi_link_uid()` returns an object of class `ncbi_uid_link`; `ncbi_link()`
+#' returns an object of class `ncbi_link`. These objects may be used directly 
+#' as query input for `ncbi_download_genome()`. It is recommended to use this
+#' approach. Alternatively, you can also use an integer vector of UIDs as query
+#' input. This approach is not recommended because there are no consistency 
+#' checks; the function will just attempt to interpret the query as NCBI 
+#' Assembly UIDs.
 #' @examples
 #' \dontrun{
-#' # Download genbank file for GCF_003007635.1.
-#' # The function will access files within this directory:
-#' # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/
+#' # Download a single genome
+#' ncbi_get_uid("GCF_003007635.1", db = "assembly") |>
+#'   ncbi_download_genome()
 #' 
-#' uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly")
-#' ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE)
+#' "SAMN08619567" |>
+#'   ncbi_get_uid(db = "biosample") |>
+#'   ncbi_link_uid(to = "assembly") |>
+#'   ncbi_download_genome()
+#'   
+#' "SAMN08619567" |>
+#'   ncbi_link(from = "biosample", to = "assembly") |>
+#'   ncbi_download_genome()
 #' 
-#' # Download multiple files
+#' # Download multiple genomes, mirror FTP directory structure
 #' data(examples) 
-#' uids <- ncbi_get_uid(examples$assembly, db = "assembly")
-#' ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE)
+#' 
+#' examples$assembly |> 
+#'   ncbi_get_uid(db = "assembly") |>
+#'   ncbi_download_genome(mirror = TRUE)
 #' }
 #' @export
 ncbi_download_genome <- function(query,
-                                 type = "genomic.gbff",
+                                 type = "genomic.fna",
                                  dirpath = NULL,
-                                 mirror = TRUE,
+                                 mirror = FALSE,
                                  verbose = getOption("verbose")) {
   type <- match.arg(type, c(
     "assembly_report", "assembly_stats", "cds", "feature_count",
@@ -50,7 +57,19 @@ ncbi_download_genome <- function(query,
     if (query$db == "assembly") {
       assembly_uid <- query$uid
     } else {
-      stop("Query must contain NCBI Assembly UIDs.")
+      stop("'ncbi_uid' object must contain NCBI Assembly UIDs.")
+    }
+  } else if ("ncbi_uid_link" %in% class(query)) {
+    if (names(query)[length(names(query))] == "assembly") {
+      assembly_uid <- unique(query$assembly)
+    } else {
+      stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.")
+    }
+  } else if ("ncbi_link" %in% class(query)) {
+    if (names(query)[length(names(query))] == "assembly") {
+      assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid
+    } else {
+      stop("'ncbi_link' object must contain links to NCBI Assembly IDs.")
     }
   } else {
     assembly_uid <- query

diff --git a/R/ncbi_get_uid.R b/R/ncbi_get_uid.R
@@ -12,6 +12,8 @@
 #' are split into batches and queried separately.
 #' @param use_history logical; should the function use web history for faster
 #' API queries? 
+#' @param na_strings character; a vector of strings which should be interpreted
+#' as `NA`.
 #' @param verbose logical; should verbose messages be printed to the console?
 #' @return An object of class \code{"ncbi_uid"} which is a list with three
 #' elements:
@@ -33,14 +35,30 @@ ncbi_get_uid <- function(
     db,
     batch_size = 100,
     use_history = TRUE,
+    na_strings = "NA",
     verbose = getOption("verbose")
     ) {
   db <- match.arg(db, choices = ncbi_dbs())
-  if (all(is.na(term))) {
-    stop("No valid search terms.")
-  } else if (any(is.na(term))){
-    if (verbose) message("Removing NA-s from search terms.")
+  index <- which(term %in% na_strings)
+  if (length(index) > 0) {
+    if (verbose) {
+      terms_collapsed <- paste(term[index], collapse = ", ")
+      msg <- paste0(
+        "The following terms will be replaced with NAs: ",
+        terms_collapsed,
+        ". "
+      )
+      message(msg)
+    }
+    term[index] <- NA
+  }
+  if (any(is.na(term))){
+    if (verbose) message("Removing NA-s from search terms. ", appendLF = FALSE)
     term <- term[which(!is.na(term))]
+    if (verbose) message(paste0(length(term), " terms remain."))
+  }
+  if (length(term) == 0) {
+    stop("No valid search terms.")
   }
   termlist <- list()
   if (length(term) > batch_size) {

diff --git a/R/ncbi_link.R b/R/ncbi_link.R
@@ -9,6 +9,8 @@
 #' \code{ncbi_dbs()} lists all available options.
 #' @param to character; the database in which the function should look for links.
 #' \code{ncbi_dbs()} lists all available options.
+#' @param multiple character; handling of query IDs with multiple matches in
+#' the `to` database. For more information see `?dplyr::left_join`.
 #' @param batch_size integer; the number of search terms to query at once. If
 #' the number of search terms is larger than \code{batch_size}, the search terms
 #' are split into batches and queried separately.
@@ -25,14 +27,28 @@ ncbi_link <- function(
     query, 
     from,
     to,  
+    multiple = "all",
     batch_size = 100,
     verbose = getOption("verbose")
     ){
-  f <- try(get(paste("ncbi_link", from, to , sep = "_")), silent = TRUE)
-  if (inherits(f, "try-error")) {
+  if (from == "assembly") {
+    if (to == "biosample") {
+      linkfun <- "ncbi_link_assembly_biosample"
+    } else {
     stop("Link not supported.")
+    }
+  } else {
+    linkfun <- "ncbi_link_generic"
   }
-  f(query, batch_size = batch_size, verbose = verbose)
+  f <- get(linkfun)
+  f(
+    query,
+    from = from,
+    to = to,
+    multiple = multiple,
+    batch_size = batch_size,
+    verbose = verbose
+  )
 }
 
 #' Convert NCBI Assembly IDs to NCBI BioSample IDs
@@ -51,9 +67,14 @@ ncbi_link <- function(
 #' @noRd
 ncbi_link_assembly_biosample <- function(
     assembly, 
+    from = "assembly",
+    to = "biosample",
+    multiple = "all",
     batch_size, 
     verbose = getOption("verbose")
   ) {
+  from <- match.arg(from, "assembly")
+  to <- match.arg(to, "biosample")
   from_uid <- ncbi_get_uid(
     assembly,
     db = "assembly",
@@ -88,46 +109,57 @@ ncbi_link_assembly_biosample <- function(
   return(out)
 }
 
-#' Convert NCBI BioSample IDs to NCBI Assembly IDs
+#' Convert NCBI IDs between databases
 #' 
-#' This function converts one or more NCBI BioSample IDs to NCBI Assembly IDs.
-#' @param biosample character; a vector of NCBI BioSample IDs.
-#' @param batch_size integer; the number of search terms to query at once. 
+#' This function converts one or more NCBI IDs between databases. This generic 
+#' function retrieves UIDs from one database, links them to UIDs from another
+#' database and then recovers the IDs from the UIDs. The function should work
+#' with most links.
+#' @param query character; a vector of IDs.
+#' @param from character; the database the queried ID-s come from.
+#' \code{ncbi_dbs()} lists all available options.
+#' @param to character; the database in which the function should look for links.
+#' \code{ncbi_dbs()} lists all available options.
+#' @param batch_size integer; the number of search terms to query at once. If
+#' the number of search terms is larger than \code{batch_size}, the search terms
+#' are split into batches and queried separately.
 #' @param verbose logical; should verbose messages be printed to the console?
-#' @return A data frame of NCBI Biosample IDs and matching Assembly IDs.
-#' @examples
-#' \dontrun{
-#' ncbi_convert_assembly_biosample("GCF_000002435.2")
-#' }
+#' @return An `ncbi_link` tibble with columns named after `from` and `to`.
 #' @importFrom dplyr left_join
 #' @importFrom tibble tibble
 #' @noRd
-ncbi_link_biosample_assembly <- function(
-    biosample,
+ncbi_link_generic <- function(
+    query,
+    from,
+    to,
+    multiple = "all",
     batch_size, 
     verbose = getOption("verbose")
   ) {
   uid <- ncbi_get_uid(
-    biosample, 
-    db = "biosample",
+    query,
+    db = from,
     batch_size = batch_size,
     use_history = FALSE,
     verbose = verbose
   )
   linked_uid <- ncbi_link_uid(
     uid,
-    to = "assembly",
+    to = to,
     batch_size = batch_size,
     verbose = verbose
   )
   linked_id <- tibble::tibble(
-    biosample = ncbi_recover_id(linked_uid$biosample, db = "biosample"),
-    assembly = ncbi_recover_id(linked_uid$assembly, db = "assembly")
+    from = ncbi_recover_id(linked_uid[[1]], db = from),
+    to = ncbi_recover_id(linked_uid[[2]], db = to)
   )
   out <- dplyr::left_join(
-    tibble::tibble(biosample = biosample),
+    tibble::tibble(from = query),
     linked_id,
-    by = "biosample"
-  )
+    multiple = multiple,
+    relationship = "many-to-many"
+  ) |> suppressMessages()
+  names(out) <- c(from, to)
+  class(out) <- c("ncbi_link", class(out))
   return(out)
 }