From 53d3028acf2a33c1c20780541eb268b727887a12 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Tue, 23 Jul 2024 20:52:41 +0200
Subject: [PATCH 1/9] Recover nuccore IDs

---
 R/ncbi_recover_id.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/ncbi_recover_id.R b/R/ncbi_recover_id.R
index ba0aa66..0f161bb 100644
--- a/R/ncbi_recover_id.R
+++ b/R/ncbi_recover_id.R
@@ -60,6 +60,8 @@ ncbi_recover_id <- function(
     id <- unname(sapply(summaries, function(x) x$project_acc))
   } else if (db == "gene") {
     id <- unname(sapply(summaries, function(x) x$uid))
+  } else if (db == "nuccore") {
+    id <- unname(sapply(summaries, function(x) x$accessionversion))
   } else if (db == "protein") {
     id <- unname(sapply(summaries, function(x) x$accessionversion))
   } else {

From 07a2a28760ed7cb91f49fea716bf124c64933947 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Tue, 23 Jul 2024 20:58:08 +0200
Subject: [PATCH 2/9] Implement links between more NCBI databases

---
 R/ncbi_link.R    | 74 ++++++++++++++++++++++++++++++++++--------------
 man/ncbi_link.Rd |  9 +++++-
 2 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/R/ncbi_link.R b/R/ncbi_link.R
index 56c482e..6b1dd92 100644
--- a/R/ncbi_link.R
+++ b/R/ncbi_link.R
@@ -25,14 +25,28 @@ ncbi_link <- function(
     query, 
     from,
     to,  
+    multiple = "all",
     batch_size = 100,
     verbose = getOption("verbose")
     ){
-  f <- try(get(paste("ncbi_link", from, to , sep = "_")), silent = TRUE)
-  if (inherits(f, "try-error")) {
+  if (from == "assembly") {
+    if (to == "biosample") {
+      linkfun <- "ncbi_link_assembly_biosample"
+    } else {
     stop("Link not supported.")
+    }
+  } else {
+    linkfun <- "ncbi_link_generic"
   }
-  f(query, batch_size = batch_size, verbose = verbose)
+  f <- get(linkfun)
+  f(
+    query,
+    from = from,
+    to = to,
+    multiple = multiple,
+    batch_size = batch_size,
+    verbose = verbose
+  )
 }
 
 #' Convert NCBI Assembly IDs to NCBI BioSample IDs
@@ -51,9 +65,14 @@ ncbi_link <- function(
 #' @noRd
 ncbi_link_assembly_biosample <- function(
     assembly, 
+    from = "assembly",
+    to = "biosample",
+    multiple = "all",
     batch_size, 
     verbose = getOption("verbose")
   ) {
+  from <- match.arg(from, "assembly")
+  to <- match.arg(to, "biosample")
   from_uid <- ncbi_get_uid(
     assembly,
     db = "assembly",
@@ -88,46 +107,57 @@ ncbi_link_assembly_biosample <- function(
   return(out)
 }
 
-#' Convert NCBI BioSample IDs to NCBI Assembly IDs
+#' Convert NCBI IDs between databases
 #' 
-#' This function converts one or more NCBI BioSample IDs to NCBI Assembly IDs.
-#' @param biosample character; a vector of NCBI BioSample IDs.
-#' @param batch_size integer; the number of search terms to query at once. 
+#' This function converts one or more NCBI IDs between databases. This generic 
+#' function retrieves UIDs from one database, links them to UIDs from another
+#' database and then recovers the IDs from the UIDs. The function should work
+#' with most links.
+#' @param query character; a vector of IDs
+#' @param from character; the database the queried ID-s come from.
+#' \code{ncbi_dbs()} lists all available options.
+#' @param to character; the database in which the function should look for links.
+#' \code{ncbi_dbs()} lists all available options.
+#' @param batch_size integer; the number of search terms to query at once. If
+#' the number of search terms is larger than \code{batch_size}, the search terms
+#' are split into batches and queried separately.
 #' @param verbose logical; should verbose messages be printed to the console?
-#' @return A data frame of NCBI Biosample IDs and matching Assembly IDs.
-#' @examples
-#' \dontrun{
-#' ncbi_convert_assembly_biosample("GCF_000002435.2")
-#' }
+#' @return A tibble with two columns named after \code{from} and \code{to}.
 #' @importFrom dplyr left_join
 #' @importFrom tibble tibble
 #' @noRd
-ncbi_link_biosample_assembly <- function(
-    biosample,
+ncbi_link_generic <- function(
+    query,
+    from,
+    to,
+    multiple = "all",
     batch_size, 
     verbose = getOption("verbose")
   ) {
   uid <- ncbi_get_uid(
-    biosample, 
-    db = "biosample",
+    query,
+    db = from,
     batch_size = batch_size,
     use_history = FALSE,
     verbose = verbose
   )
   linked_uid <- ncbi_link_uid(
     uid,
-    to = "assembly",
+    to = to,
     batch_size = batch_size,
     verbose = verbose
   )
   linked_id <- tibble::tibble(
-    biosample = ncbi_recover_id(linked_uid$biosample, db = "biosample"),
-    assembly = ncbi_recover_id(linked_uid$assembly, db = "assembly")
+    from = ncbi_recover_id(linked_uid[[1]], db = from),
+    to = ncbi_recover_id(linked_uid[[2]], db = to)
   )
   out <- dplyr::left_join(
-    tibble::tibble(biosample = biosample),
+    tibble::tibble(from = query),
     linked_id,
-    by = "biosample"
-  )
+    multiple = multiple,
+    relationship = "many-to-many"
+  ) |> suppressMessages()
+  names(out) <- c(from, to)
+  class(out) <- c("ncbi_link", class(out))
   return(out)
 }
diff --git a/man/ncbi_link.Rd b/man/ncbi_link.Rd
index 5cc30e4..3390d42 100644
--- a/man/ncbi_link.Rd
+++ b/man/ncbi_link.Rd
@@ -4,7 +4,14 @@
 \alias{ncbi_link}
 \title{Link ID-s from one NCBI database to another}
 \usage{
-ncbi_link(query, from, to, batch_size = 100, verbose = getOption("verbose"))
+ncbi_link(
+  query,
+  from,
+  to,
+  multiple = "all",
+  batch_size = 100,
+  verbose = getOption("verbose")
+)
 }
 \arguments{
 \item{query}{character; a vector of IDs}

From e7130af3de441ed1b4d2b5aca6a5579a7f622a33 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Mon, 29 Jul 2024 17:23:54 +0200
Subject: [PATCH 3/9] Set S3 class for ncbi_link_uid() output

---
 R/ncbi_link_uid.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/ncbi_link_uid.R b/R/ncbi_link_uid.R
index 67129e8..ede1efa 100644
--- a/R/ncbi_link_uid.R
+++ b/R/ncbi_link_uid.R
@@ -113,5 +113,6 @@ ncbi_link_uid <- function(
     out <- dplyr::left_join(tibble::tibble(query = query), out, by = "query")
   }
   names(out) <- c(from, to)
+  class(out) <- c("ncbi_uid_link", class(out))
   return(out)
 }

From e05c51a53a8a241ccacb967e9385a0e770a43c7c Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Mon, 29 Jul 2024 17:24:20 +0200
Subject: [PATCH 4/9] More tests for ncbi_recover_id()

---
 tests/testthat/test-ncbi_recover_id.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/testthat/test-ncbi_recover_id.R b/tests/testthat/test-ncbi_recover_id.R
index 0033534..07e7a5f 100644
--- a/tests/testthat/test-ncbi_recover_id.R
+++ b/tests/testthat/test-ncbi_recover_id.R
@@ -21,3 +21,9 @@ test_that("ncbi_recover_id() works with duplicates", {
   
   expect_equal(ids, c("SAMN02597423", "SAMN02597423"))
 })
+
+test_that("ncbi_recover_id() works with nuccore", {
+  uid <- ncbi_get_uid("OP617744.1", db = "nuccore")
+  id <- ncbi_recover_id(uid)
+  expect_equal(id,"OP617744.1")
+})

From c7acca6c724d53e92c4a41dcdd5500e9ab1c05f0 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Mon, 29 Jul 2024 17:25:38 +0200
Subject: [PATCH 5/9] More input types for ncbi_download_genome(); adjust
 default 'type' and 'mirror'

---
 R/ncbi_download_genome.R    | 63 ++++++++++++++++++++++++-------------
 man/ncbi_download_genome.Rd | 49 ++++++++++++++++-------------
 2 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/R/ncbi_download_genome.R b/R/ncbi_download_genome.R
index b664a0e..c49fe29 100644
--- a/R/ncbi_download_genome.R
+++ b/R/ncbi_download_genome.R
@@ -1,8 +1,8 @@
 #' Download Genomes from NCBI Assembly Database
 #'
 #' This function directly downloads genome data through the NCBI FTP server.
-#' @param query either an object of class \code{ncbi_uid} or an integer vector 
-#' of NCBI Assembly UIDs. See Details for more information.
+#' @param query an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or 
+#' an integer vector of NCBI Assembly UIDs. See Details for more information.
 #' @param type character; the file extension to download. Valid options are
 #' \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"},
 #' \code{"feature_count"}, \code{"feature_table"}, \code{"genomic.fna"},
@@ -13,34 +13,41 @@
 #' @param mirror logical; should the download directory mirror the structure of 
 #' the FTP directory?
 #' @param verbose logical; should verbose messages be printed to console?
-#' @details Some functions in webseq, e.g. \code{ncbi_get_uid()} or
-#' \code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These
-#' objects may be used directly as query input for
-#' \code{ncbi_download_genome()}. It is recommended to use this approach because
-#' then the function will check whether the query really contains UIDs from the
-#' NCBI Assembly database and fail if not. Alternatively, you can also use a
-#' character vector of UIDs as query input but in this case there will be no
-#' consistency checks and the function will just attempt to interpret them as
-#' NCBI Assembly UIDs.
+#' @details `ncbi_get_uid()` returns an object of class `ncbi_uid`; 
+#' `ncbi_link_uid` returns an object of class `ncbi_uid_link`; `ncbi_link`
+#' returns an object of class `ncbi_link`. These objects may be used directly 
+#' as query input for `ncbi_download_genome`. It is recommended to use this
+#' approach. Alternatively, you can also use a character vector of UIDs as query
+#' input.  This approach is not recommended because there are no consistency 
+#' checks, the function will just attempt to interpret the query as NCBI 
+#' Assembly UIDs.
 #' @examples
 #' \dontrun{
-#' # Download genbank file for GCF_003007635.1.
-#' # The function will access files within this directory:
-#' # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/
+#' # Download a single genome
+#' ncbi_get_uid("GCF_003007635.1", db = "assembly") |>
+#'   ncbi_download_genome()
 #' 
-#' uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly")
-#' ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE)
+#' "SAMN08619567" |>
+#'   ncbi_get_uid(db = "biosample") |>
+#'   ncbi_link_uid(to = "assembly") |>
+#'   ncbi_download_genome()
+#'   
+#' "SAMN08619567" |>
+#'   ncbi_link(from = "biosample", to = "assembly") |>
+#'   ncbi_download_genome()
 #' 
-#' # Download multiple files
+#' # Download multiple genomes, mirror FTP directory structure
 #' data(examples) 
-#' uids <- ncbi_get_uid(examples$assembly, db = "assembly")
-#' ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE)
+#' 
+#' examples$assembly |> 
+#'   ncbi_get_uid(db = "assembly") |>
+#'   ncbi_download_genome(mirror = TRUE)
 #' }
 #' @export
 ncbi_download_genome <- function(query,
-                                 type = "genomic.gbff",
+                                 type = "genomic.fna",
                                  dirpath = NULL,
-                                 mirror = TRUE,
+                                 mirror = FALSE,
                                  verbose = getOption("verbose")) {
   type <- match.arg(type, c(
     "assembly_report", "assembly_stats", "cds", "feature_count",
@@ -50,7 +57,19 @@ ncbi_download_genome <- function(query,
     if (query$db == "assembly") {
       assembly_uid <- query$uid
     } else {
-      stop("Query must contain NCBI Assembly UIDs.")
+      stop("'ncbi_uid' object must contain NCBI Assembly UIDs.")
+    }
+  } else if ("ncbi_uid_link" %in% class(query)) {
+    if (names(query)[2] == "assembly") {
+      assembly_uid <- unique(query$assembly)
+    } else {
+      stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.")
+    }
+  } else if ("ncbi_link" %in% class(query)) {
+    if (names(query)[2] == "assembly") {
+      assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid
+    } else {
+      stop("'ncbi_link' object must contain links to NCBI Assembly IDs.")
     }
   } else {
     assembly_uid <- query
diff --git a/man/ncbi_download_genome.Rd b/man/ncbi_download_genome.Rd
index 18a2a52..594bbdd 100644
--- a/man/ncbi_download_genome.Rd
+++ b/man/ncbi_download_genome.Rd
@@ -6,15 +6,15 @@
 \usage{
 ncbi_download_genome(
   query,
-  type = "genomic.gbff",
+  type = "genomic.fna",
   dirpath = NULL,
-  mirror = TRUE,
+  mirror = FALSE,
   verbose = getOption("verbose")
 )
 }
 \arguments{
-\item{query}{either an object of class \code{ncbi_uid} or an integer vector 
-of NCBI Assembly UIDs. See Details for more information.}
+\item{query}{an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or 
+an integer vector of NCBI Assembly UIDs. See Details for more information.}
 
 \item{type}{character; the file extension to download. Valid options are
 \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"},
@@ -34,28 +34,35 @@ the FTP directory?}
 This function directly downloads genome data through the NCBI FTP server.
 }
 \details{
-Some functions in webseq, e.g. \code{ncbi_get_uid()} or
-\code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These
-objects may be used directly as query input for
-\code{ncbi_download_genome()}. It is recommended to use this approach because
-then the function will check whether the query really contains UIDs from the
-NCBI Assembly database and fail if not. Alternatively, you can also use a
-character vector of UIDs as query input but in this case there will be no
-consistency checks and the function will just attempt to interpret them as
-NCBI Assembly UIDs.
+`ncbi_get_uid()` returns an object of class `ncbi_uid`; 
+`ncbi_link_uid` returns an object of class `ncbi_uid_link`; `ncbi_link`
+returns an object of class `ncbi_link`. These objects may be used directly 
+as query input for `ncbi_download_genome`. It is recommended to use this
+approach. Alternatively, you can also use a character vector of UIDs as query
+input.  This approach is not recommended because there are no consistency 
+checks, the function will just attempt to interpret the query as NCBI 
+Assembly UIDs.
 }
 \examples{
 \dontrun{
-# Download genbank file for GCF_003007635.1.
-# The function will access files within this directory:
-# ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/
+# Download a single genome
+ncbi_get_uid("GCF_003007635.1", db = "assembly") |>
+  ncbi_download_genome()
 
-uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly")
-ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE)
+"SAMN08619567" |>
+  ncbi_get_uid(db = "biosample") |>
+  ncbi_link_uid(to = "assembly") |>
+  ncbi_download_genome()
+  
+"SAMN08619567" |>
+  ncbi_link(from = "biosample", to = "assembly") |>
+  ncbi_download_genome()
 
-# Download multiple files
+# Download multiple genomes, mirror FTP directory structure
 data(examples) 
-uids <- ncbi_get_uid(examples$assembly, db = "assembly")
-ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE)
+
+examples$assembly |> 
+  ncbi_get_uid(db = "assembly") |>
+  ncbi_download_genome(mirror = TRUE)
 }
 }

From 107489073cc095e27067e18566aac0762951498f Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Wed, 21 Aug 2024 21:56:59 +0200
Subject: [PATCH 6/9] Enable successive ncbi_link_uid() calls

---
 R/ncbi_link_uid.R                   | 95 +++++++++++++++++++++++------
 man/ncbi_link_uid.Rd                | 47 +++++++++-----
 tests/testthat/test-ncbi_link_uid.R | 26 +++++++-
 3 files changed, 131 insertions(+), 37 deletions(-)

diff --git a/R/ncbi_link_uid.R b/R/ncbi_link_uid.R
index ede1efa..df86f98 100644
--- a/R/ncbi_link_uid.R
+++ b/R/ncbi_link_uid.R
@@ -4,28 +4,47 @@
 #' different databases may be linked. For example, entries in the NCBI Assembly
 #' database may be linked with entries in the NCBI BioSample database. This
 #' function attempts to link uids from one database to another.
-#' @param query either an object of class \code{ncbi_uid} or an integer vector 
-#' of UIDs. See Details for more information.
+#' @param query either an object of class `ncbi_uid` or `ncbi_uid_link`, or an
+#' integer vector of UIDs. See Details for more information.
 #' @param from character; the database the queried UIDs come from.
-#' \code{ncbi_dbs()} lists all available options.
-#' @param to character; the database in which the function should look for links.
 #' \code{ncbi_dbs()} lists all available options. See Details for more
 #' information.
+#' @param to character; the database in which the function should look for links.
+#' \code{ncbi_dbs()} lists all available options. 
 #' @param batch_size integer; the number of search terms to query at once. If
 #' the number of search terms is larger than \code{batch_size}, the search terms
-#' are split into batches and queried separately. Not used when using web
-#' history.
+#' are split into batches and queried separately.
 #' @param verbose logical; should verbose messages be printed to the console?
-#' @return A tibble with two columns. The first column contains UIDs in the 
-#' `from` database, the second column contains linked UIDs in the `to` database.
-#' @details The function `ncbi_get_uid()` returns an object of class `ncbi_uid`. 
-#' This object may be used directly as query for `ncbi_link_uid()`. If query is 
-#' an `ncbi_uid` object, the `from` argument is optional. If `from` is not 
-#' specified, the function will retrieve it from the query object. However, if 
-#' it is specified, it must be identical to the `db` attribute of the query.
+#' @return A tibble with two or more columns. When `ncbi_link_uid()` is called
+#' on an `ncbi_uid` object or a vector of UIDs, the function returns a tibble
+#' with exactly two columns: the first column contains UIDs in the `from`
+#' database, and the second column contains linked UIDs in the `to` database.
+#' However, `ncbi_link_uid()` can be called multiple times in succession. Each
+#' call after the first call will add a new column to the returned tibble. 
+#' See Details for more information.
+#' @details The function can take three query classes: It can take `ncbi_uid`
+#' objects, these are returned by `ncbi_get_uid()`. In this case, the `from`
+#' argument will be retrieved from the query object, by default. It can also
+#' take `ncbi_uid_link` objects, which means `ncbi_link_uid()` can be called
+#' several times in a sequence to perform a number of successive conversions.
+#' When the query is an `ncbi_uid_link` object, the function will always convert
+#' the UIDs in the last column of the query object, and will retrieve the `from`
+#' argument from the name of the last column. This means links should always be
+#' interpreted "left-to-right". Note, when tibbles are joined during subsequent
+#' `ncbi_link_uid` calls they are joined using "many-to-many" relationships; see
+#' `?dplyr::left_join()` for more information. Lastly, the function can also
+#' take a vector of integer UIDs.
 #' @examples
+#' # Simple call with integer UIDs
 #' ncbi_link_uid(5197591, "assembly", "biosample")
 #' ncbi_link_uid(c(1226742659, 1883410844), "protein", "nuccore")
+#' 
+#' # Complex call with ncbi_get_uid() and several ncbi_link_uid() calls
+#' "GCF_000299415.1" |> 
+#'   ncbi_get_uid(db = "assembly") |> 
+#'   ncbi_link_uid(to = "biosample") |>
+#'   ncbi_link_uid(to = "bioproject") |>
+#'   ncbi_link_uid(to = "pubmed")
 #' @export
 ncbi_link_uid <- function(
     query,
@@ -46,6 +65,19 @@ ncbi_link_uid <- function(
         stop(msg)
       }
     }
+  } else if ("ncbi_uid_link" %in% class(query)) {
+    fromdb <- names(query)[length(names(query))]
+    if (is.null(from)) {
+      from <- fromdb
+    } else {
+      if (from != fromdb) {
+        msg <- paste0(
+          "Database for queried UIDs does not match 'from' argument.\n",
+          "Provide identical values (last column name) or use from = NULL (default)."
+        )
+        stop(msg)
+      }
+    }
   } else {
     if (is.null(from)) {
       msg <- paste0(
@@ -93,12 +125,16 @@ ncbi_link_uid <- function(
     return(out)
   }
   if ("ncbi_uid" %in% class(query)) {
-    query <- query$uid
+    query_vector <- query$uid
+  } else if ("ncbi_uid_link" %in% class(query)) {
+    query_vector <- query[[from]] |> unique()
+  } else {
+    query_vector <- query
   }
-  if (!is.numeric(query)) {
+  if (!is.numeric(query_vector)) {
     stop("Query must be an ncbi_uid object or a numeric vector or UIDs.")
   }
-  idlist <- get_idlist(query, batch_size, verbose)
+  idlist <- get_idlist(query_vector, batch_size, verbose)
   res <- lapply(idlist, function(x) {
     foo_from_ids(
       x,
@@ -108,11 +144,30 @@ ncbi_link_uid <- function(
   })
   out <- dplyr::bind_rows(res)
   if ("ncbi_uid" %in% class(query)) {
-    out <- dplyr::left_join(tibble::tibble(from = query$uid), out, by = "query")
+    out <- dplyr::left_join(
+      tibble::tibble(query = query_vector),
+      out,
+      by = "query"
+    )
+    names(out) <- c(from, to)
+  } else if ("ncbi_uid_link" %in% class(query)) {
+    names(out) <- c(from, to)
+    out <- dplyr::left_join(
+      query,
+      out,
+      by = from,
+      relationship = "many-to-many"
+    )
   } else {
-    out <- dplyr::left_join(tibble::tibble(query = query), out, by = "query")
+    out <- dplyr::left_join(
+      tibble::tibble(query = query_vector),
+      out,
+      by = "query"
+    )
+    names(out) <- c(from, to)
+  }
+  if (!"ncbi_uid_link" %in% class(out)) {
+    class(out) <- c("ncbi_uid_link", class(out))
   }
-  names(out) <- c(from, to)
-  class(out) <- c("ncbi_uid_link", class(out))
   return(out)
 }
diff --git a/man/ncbi_link_uid.Rd b/man/ncbi_link_uid.Rd
index 834e25f..e79c073 100644
--- a/man/ncbi_link_uid.Rd
+++ b/man/ncbi_link_uid.Rd
@@ -13,26 +13,30 @@ ncbi_link_uid(
 )
 }
 \arguments{
-\item{query}{either an object of class \code{ncbi_uid} or an integer vector 
-of UIDs. See Details for more information.}
+\item{query}{either an object of class `ncbi_uid` or `ncbi_uid_link`, or an
+integer vector of UIDs. See Details for more information.}
 
 \item{from}{character; the database the queried UIDs come from.
-\code{ncbi_dbs()} lists all available options.}
-
-\item{to}{character; the database in which the function should look for links.
 \code{ncbi_dbs()} lists all available options. See Details for more
 information.}
 
+\item{to}{character; the database in which the function should look for links.
+\code{ncbi_dbs()} lists all available options.}
+
 \item{batch_size}{integer; the number of search terms to query at once. If
 the number of search terms is larger than \code{batch_size}, the search terms
-are split into batches and queried separately. Not used when using web
-history.}
+are split into batches and queried separately.}
 
 \item{verbose}{logical; should verbose messages be printed to the console?}
 }
 \value{
-A tibble with two columns. The first column contains UIDs in the 
-`from` database, the second column contains linked UIDs in the `to` database.
+A tibble with two or more columns. When `ncbi_link_uid()` is called
+on an `ncbi_uid` object or a vector of UIDs, the function returns a tibble
+with exactly two columns: the first column contains UIDs in the `from`
+database, and the second column contains linked UIDs in the `to` database.
+However, `ncbi_link_uid()` can be called multiple times in succession. Each
+call after the first call will add a new column to the returned tibble. 
+See Details for more information.
 }
 \description{
 Each entry in an NCBI database has its unique internal id. Entries in
@@ -41,13 +45,28 @@ database may be linked with entries in the NCBI BioSample database. This
 function attempts to link uids from one database to another.
 }
 \details{
-The function `ncbi_get_uid()` returns an object of class `ncbi_uid`. 
-This object may be used directly as query for `ncbi_link_uid()`. If query is 
-an `ncbi_uid` object, the `from` argument is optional. If `from` is not 
-specified, the function will retrieve it from the query object. However, if 
-it is specified, it must be identical to the `db` attribute of the query.
+The function can take three query classes: It can take `ncbi_uid`
+objects, these are returned by `ncbi_get_uid()`. In this case, the `from`
+argument will be retrieved from the query object, by default. It can also
+take `ncbi_uid_link` objects, which means `ncbi_link_uid()` can be called
+several times in a sequence to perform a number of successive conversions.
+When the query is an `ncbi_uid_link` object, the function will always convert
+the UIDs in the last column of the query object, and will retrieve the `from`
+argument from the name of the last column. This means links should always be
+interpreted "left-to-right". Note, when tibbles are joined during subsequent
+`ncbi_link_uid` calls they are joined using "many-to-many" relationships; see
+`?dplyr::left_join()` for more information. Lastly, the function can also
+take a vector of integer UIDs.
 }
 \examples{
+# Simple call with integer UIDs
 ncbi_link_uid(5197591, "assembly", "biosample")
 ncbi_link_uid(c(1226742659, 1883410844), "protein", "nuccore")
+
+# Complex call with ncbi_get_uid() and several ncbi_link_uid() calls
+"GCF_000299415.1" |> 
+  ncbi_get_uid(db = "assembly") |> 
+  ncbi_link_uid(to = "biosample") |>
+  ncbi_link_uid(to = "bioproject") |>
+  ncbi_link_uid(to = "pubmed")
 }
diff --git a/tests/testthat/test-ncbi_link_uid.R b/tests/testthat/test-ncbi_link_uid.R
index dc232fe..196a055 100644
--- a/tests/testthat/test-ncbi_link_uid.R
+++ b/tests/testthat/test-ncbi_link_uid.R
@@ -64,9 +64,9 @@ test_that("ncbi_link_uid() returns more rows when there are multiple links", {
   query <- "PRJEB54063"
   puid <- ncbi_get_uid(query, db = "bioproject")
   buid <- ncbi_link_uid(puid, to = "biosample")
-  expect_equal(dim(buid), c(2,2))
-  expect_equal(buid$bioproject, c(883889, 883889))
-  expect_equal(buid$biosample, c(31267349, 31250566))
+  expect_equal(dim(buid), c(148,2))
+  expect_equal(buid$bioproject[1:2], c(883889, 883889))
+  expect_equal(buid$biosample[1:2], c(31267349, 31250566))
 })
 
 test_that("ncbi_link_uid() returns results for all valid queries", {
@@ -84,3 +84,23 @@ test_that("ncbi_link_uid() converts UIDs to numeric without coercion to NA", {
   
   expect_equal(sum(is.na(nuccore_uid$nuccore)), 0)
 })
+
+test_that("ncbi_link_uid() works with ncbi_uid_link objects", {
+  pubmed_uid <- "GCF_000299415.1" |> 
+    ncbi_get_uid(db = "assembly") |> 
+    ncbi_link_uid(to = "biosample") |>
+    ncbi_link_uid(to = "bioproject") |>
+    ncbi_link_uid(to = "pubmed")
+  
+  expect_true(inherits(pubmed_uid, "ncbi_uid_link"))
+  expect_true(inherits(pubmed_uid, "data.frame"))
+  expect_equal(dim(pubmed_uid), c(2,4))
+  expect_equal(
+    names(pubmed_uid), 
+    c("assembly", "biosample", "bioproject", "pubmed")
+  )
+  expect_equal(pubmed_uid$assembly, c(623048, 623048))
+  expect_equal(pubmed_uid$biosample, c(1730125, 1730125))
+  expect_equal(pubmed_uid$bioproject, c(224116, 174686))
+  expect_equal(pubmed_uid$pubmed, c(24316578, 23144412))
+})

From 6978b3e035cc25d2318d97ffc5607def6cda4b34 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Wed, 21 Aug 2024 23:02:33 +0200
Subject: [PATCH 7/9] Interpret NA strings as NA when using ncbi_get_uid()

---
 R/ncbi_get_uid.R                   | 26 ++++++++++++++++++++++----
 man/ncbi_get_uid.Rd                |  4 ++++
 tests/testthat/test-ncbi_get_uid.R | 14 ++++++++++----
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/R/ncbi_get_uid.R b/R/ncbi_get_uid.R
index a41246b..9b19314 100644
--- a/R/ncbi_get_uid.R
+++ b/R/ncbi_get_uid.R
@@ -12,6 +12,8 @@
 #' are split into batches and queried separately.
 #' @param use_history logical; should the function use web history for faster
 #' API queries? 
+#' @param na_strings character; a vector of strings which should be interpreted
+#' as `NA`.
 #' @param verbose logical; should verbose messages be printed to the console?
 #' @return An object of class \code{"ncbi_uid"} which is a list with three
 #' elements:
@@ -33,14 +35,30 @@ ncbi_get_uid <- function(
     db,
     batch_size = 100,
     use_history = TRUE,
+    na_strings = "NA",
     verbose = getOption("verbose")
     ) {
   db <- match.arg(db, choices = ncbi_dbs())
-  if (all(is.na(term))) {
-    stop("No valid search terms.")
-  } else if (any(is.na(term))){
-    if (verbose) message("Removing NA-s from search terms.")
+  index <- which(term %in% na_strings)
+  if (length(index) > 0) {
+    if (verbose) {
+      terms_collapsed <- paste(term[index], collapse = ", ")
+      msg <- paste0(
+        "The following terms will be replaced with NAs: ",
+        terms_collapsed,
+        ". "
+      )
+      message(msg)
+    }
+    term[index] <- NA
+  }
+  if (any(is.na(term))){
+    if (verbose) message("Removing NA-s from search terms. ", appendLF = FALSE)
     term <- term[which(!is.na(term))]
+    if (verbose) message(paste0(length(term), " terms remain."))
+  }
+  if (length(term) == 0) {
+    stop("No valid search terms.")
   }
   termlist <- list()
   if (length(term) > batch_size) {
diff --git a/man/ncbi_get_uid.Rd b/man/ncbi_get_uid.Rd
index 0161bf0..63c0e1d 100644
--- a/man/ncbi_get_uid.Rd
+++ b/man/ncbi_get_uid.Rd
@@ -9,6 +9,7 @@ ncbi_get_uid(
   db,
   batch_size = 100,
   use_history = TRUE,
+  na_strings = "NA",
   verbose = getOption("verbose")
 )
 }
@@ -25,6 +26,9 @@ are split into batches and queried separately.}
 \item{use_history}{logical; should the function use web history for faster
 API queries?}
 
+\item{na_strings}{character; a vector of strings which should be interpreted
+as `NA`.}
+
 \item{verbose}{logical; should verbose messages be printed to the console?}
 }
 \value{
diff --git a/tests/testthat/test-ncbi_get_uid.R b/tests/testthat/test-ncbi_get_uid.R
index a7e770d..7eb9a28 100644
--- a/tests/testthat/test-ncbi_get_uid.R
+++ b/tests/testthat/test-ncbi_get_uid.R
@@ -39,9 +39,10 @@ test_that("ncbi_get_uid() handles NA", {
   expect_true(all(c("ncbi_uid", "list") %in% class(res)))
   expect_equal(length(res$uid), 2)
   
-  expect_true(res_messages[1] == "Removing NA-s from search terms.\n")
-  expect_true(res_messages[2] == "Querying UIDs for batch 1. ")
-  expect_true(res_messages[3] == "Query successful.\n")
+  expect_true(res_messages[1] == "Removing NA-s from search terms. ")
+  expect_true(res_messages[2] == "2 terms remain.\n")
+  expect_true(res_messages[3] == "Querying UIDs for batch 1. ")
+  expect_true(res_messages[4] == "Query successful.\n")
 })
 
 test_that("ncbi_get_uid() handles invalid terms", {
@@ -57,8 +58,13 @@ test_that("ncbi_get_uid() handles invalid terms", {
   expect_equal(nrow(res$web_history), 0)
 })
 
-test_that("ncbi_get_uid works with a complex term", {
+test_that("ncbi_get_uid() works with a complex term", {
   res <- ncbi_get_uid("Autographiviridae OR Podoviridae", db = "assembly")
 
   expect_true(length(res$uid) > 3000)
 })
+
+# issue #80
+test_that("ncbi_get_uid() stops with an error when input is 'NA' (string)", {
+  expect_error(ncbi_get_uid("NA", db = "biosample"))
+})

From 22378831eee5f8afc4ac514f5d036cef5891e804 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Wed, 21 Aug 2024 23:02:59 +0200
Subject: [PATCH 8/9] Update documentation for ncbi_link()

---
 R/ncbi_link.R    | 2 ++
 man/ncbi_link.Rd | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/R/ncbi_link.R b/R/ncbi_link.R
index 6b1dd92..ec4f610 100644
--- a/R/ncbi_link.R
+++ b/R/ncbi_link.R
@@ -9,6 +9,8 @@
 #' \code{ncbi_dbs()} lists all available options.
 #' @param to character; the database in which the function should look for links.
 #' \code{ncbi_dbs()} lists all available options.
+#' @param multiple character; handling of rows in x with multiple matches in y.
+#' For more information see `?dplyr::left_join()`.
 #' @param batch_size integer; the number of search terms to query at once. If
 #' the number of search terms is larger than \code{batch_size}, the search terms
 #' are split into batches and queried separately.
diff --git a/man/ncbi_link.Rd b/man/ncbi_link.Rd
index 3390d42..f4e1e0d 100644
--- a/man/ncbi_link.Rd
+++ b/man/ncbi_link.Rd
@@ -22,6 +22,9 @@ ncbi_link(
 \item{to}{character; the database in which the function should look for links.
 \code{ncbi_dbs()} lists all available options.}
 
+\item{multiple}{character; handling of rows in x with multiple matches in y.
+For more information see `?dplyr::left_join()`.}
+
 \item{batch_size}{integer; the number of search terms to query at once. If
 the number of search terms is larger than \code{batch_size}, the search terms
 are split into batches and queried separately.}

From 20ecde86519ca3e7954979dfd7fb97ed49ff9669 Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Thu, 22 Aug 2024 19:44:59 +0200
Subject: [PATCH 9/9] Make ncbi_download_genome() robust to multiple links.

---
 R/ncbi_download_genome.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/ncbi_download_genome.R b/R/ncbi_download_genome.R
index c49fe29..50d85e6 100644
--- a/R/ncbi_download_genome.R
+++ b/R/ncbi_download_genome.R
@@ -60,13 +60,13 @@ ncbi_download_genome <- function(query,
       stop("'ncbi_uid' object must contain NCBI Assembly UIDs.")
     }
   } else if ("ncbi_uid_link" %in% class(query)) {
-    if (names(query)[2] == "assembly") {
+    if (names(query)[length(names(query))] == "assembly") {
       assembly_uid <- unique(query$assembly)
     } else {
       stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.")
     }
   } else if ("ncbi_link" %in% class(query)) {
-    if (names(query)[2] == "assembly") {
+    if (names(query)[length(names(query))] == "assembly") {
       assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid
     } else {
       stop("'ncbi_link' object must contain links to NCBI Assembly IDs.")