From 53d3028acf2a33c1c20780541eb268b727887a12 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Tue, 23 Jul 2024 20:52:41 +0200 Subject: [PATCH 1/9] Recover nuccore IDs --- R/ncbi_recover_id.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/ncbi_recover_id.R b/R/ncbi_recover_id.R index ba0aa66..0f161bb 100644 --- a/R/ncbi_recover_id.R +++ b/R/ncbi_recover_id.R @@ -60,6 +60,8 @@ ncbi_recover_id <- function( id <- unname(sapply(summaries, function(x) x$project_acc)) } else if (db == "gene") { id <- unname(sapply(summaries, function(x) x$uid)) + } else if (db == "nuccore") { + id <- unname(sapply(summaries, function(x) x$accessionversion)) } else if (db == "protein") { id <- unname(sapply(summaries, function(x) x$accessionversion)) } else { From 07a2a28760ed7cb91f49fea716bf124c64933947 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Tue, 23 Jul 2024 20:58:08 +0200 Subject: [PATCH 2/9] Implement links between more NCBI databases --- R/ncbi_link.R | 74 ++++++++++++++++++++++++++++++++++-------------- man/ncbi_link.Rd | 9 +++++- 2 files changed, 60 insertions(+), 23 deletions(-) diff --git a/R/ncbi_link.R b/R/ncbi_link.R index 56c482e..6b1dd92 100644 --- a/R/ncbi_link.R +++ b/R/ncbi_link.R @@ -25,14 +25,28 @@ ncbi_link <- function( query, from, to, + multiple = "all", batch_size = 100, verbose = getOption("verbose") ){ - f <- try(get(paste("ncbi_link", from, to , sep = "_")), silent = TRUE) - if (inherits(f, "try-error")) { + if (from == "assembly") { + if (to == "biosample") { + linkfun <- "ncbi_link_assembly_biosample" + } else { stop("Link not supported.") + } + } else { + linkfun <- "ncbi_link_generic" } - f(query, batch_size = batch_size, verbose = verbose) + f <- get(linkfun) + f( + query, + from = from, + to = to, + multiple = multiple, + batch_size = batch_size, + verbose = verbose + ) } #' Convert NCBI Assembly IDs to NCBI BioSample IDs @@ -51,9 +65,14 @@ ncbi_link <- function( #' @noRd ncbi_link_assembly_biosample <- function( assembly, + from = 
"assembly", + to = "biosample", + multiple = "all", batch_size, verbose = getOption("verbose") ) { + from <- match.arg(from, "assembly") + to <- match.arg(to, "biosample") from_uid <- ncbi_get_uid( assembly, db = "assembly", @@ -88,46 +107,57 @@ ncbi_link_assembly_biosample <- function( return(out) } -#' Convert NCBI BioSample IDs to NCBI Assembly IDs +#' Convert NCBI IDs between databases #' -#' This function converts one or more NCBI BioSample IDs to NCBI Assembly IDs. -#' @param biosample character; a vector of NCBI BioSample IDs. -#' @param batch_size integer; the number of search terms to query at once. +#' This function converts one or more NCBI IDs between databases. This generic +#' function retrieves UIDs from one database, links them to UIDs from another +#' database and then recovers the IDs from the UIDs. The function should work +#' with most links +#' @param query character; a vector of IDs +#' @param from character; the database the queried ID-s come from. +#' \code{ncbi_dbs()} lists all available options. +#' @param to character; the database in which the function should look for links. +#' \code{ncbi_dbs()} lists all available options. +#' @param batch_size integer; the number of search terms to query at once. If +#' the number of search terms is larger than \code{batch_size}, the search terms +#' are split into batches and queried separately. #' @param verbose logical; should verbose messages be printed to the console? -#' @return A data frame of NCBI Biosample IDs and matching Assembly IDs. 
-#' @examples -#' \dontrun{ -#' ncbi_convert_assembly_biosample("GCF_000002435.2") -#' } +#' @return A tibble #' @importFrom dplyr left_join #' @importFrom tibble tibble #' @noRd -ncbi_link_biosample_assembly <- function( - biosample, +ncbi_link_generic <- function( + query, + from, + to, + multiple = "all", batch_size, verbose = getOption("verbose") ) { uid <- ncbi_get_uid( - biosample, - db = "biosample", + query, + db = from, batch_size = batch_size, use_history = FALSE, verbose = verbose ) linked_uid <- ncbi_link_uid( uid, - to = "assembly", + to = to, batch_size = batch_size, verbose = verbose ) linked_id <- tibble::tibble( - biosample = ncbi_recover_id(linked_uid$biosample, db = "biosample"), - assembly = ncbi_recover_id(linked_uid$assembly, db = "assembly") + from = ncbi_recover_id(linked_uid[[1]], db = from), + to = ncbi_recover_id(linked_uid[[2]], db = to) ) out <- dplyr::left_join( - tibble::tibble(biosample = biosample), + tibble::tibble(from = query), linked_id, - by = "biosample" - ) + multiple = multiple, + relationship = "many-to-many" + ) |> suppressMessages() + names(out) <- c(from, to) + class(out) <- c("ncbi_link", class(out)) return(out) } diff --git a/man/ncbi_link.Rd b/man/ncbi_link.Rd index 5cc30e4..3390d42 100644 --- a/man/ncbi_link.Rd +++ b/man/ncbi_link.Rd @@ -4,7 +4,14 @@ \alias{ncbi_link} \title{Link ID-s from one NCBI database to another} \usage{ -ncbi_link(query, from, to, batch_size = 100, verbose = getOption("verbose")) +ncbi_link( + query, + from, + to, + multiple = "all", + batch_size = 100, + verbose = getOption("verbose") +) } \arguments{ \item{query}{character; a vector of IDs} From e7130af3de441ed1b4d2b5aca6a5579a7f622a33 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 29 Jul 2024 17:23:54 +0200 Subject: [PATCH 3/9] Set S3 class for ncbi_link_uid() output --- R/ncbi_link_uid.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/ncbi_link_uid.R b/R/ncbi_link_uid.R index 67129e8..ede1efa 100644 --- a/R/ncbi_link_uid.R 
+++ b/R/ncbi_link_uid.R @@ -113,5 +113,6 @@ ncbi_link_uid <- function( out <- dplyr::left_join(tibble::tibble(query = query), out, by = "query") } names(out) <- c(from, to) + class(out) <- c("ncbi_uid_link", class(out)) return(out) } From e05c51a53a8a241ccacb967e9385a0e770a43c7c Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 29 Jul 2024 17:24:20 +0200 Subject: [PATCH 4/9] More tests for ncbi_recover_id() --- tests/testthat/test-ncbi_recover_id.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/testthat/test-ncbi_recover_id.R b/tests/testthat/test-ncbi_recover_id.R index 0033534..07e7a5f 100644 --- a/tests/testthat/test-ncbi_recover_id.R +++ b/tests/testthat/test-ncbi_recover_id.R @@ -21,3 +21,9 @@ test_that("ncbi_recover_id() works with duplicates", { expect_equal(ids, c("SAMN02597423", "SAMN02597423")) }) + +test_that("ncbi_recover_id() works with nuccore", { + uid <- ncbi_get_uid("OP617744.1", db = "nuccore") + id <- ncbi_recover_id(uid) + expect_equal(id,"OP617744.1") +}) From c7acca6c724d53e92c4a41dcdd5500e9ab1c05f0 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 29 Jul 2024 17:25:38 +0200 Subject: [PATCH 5/9] More input types for ncbi_download_genome() adjust default 'type' and 'mirror' --- R/ncbi_download_genome.R | 63 ++++++++++++++++++++++++------------- man/ncbi_download_genome.Rd | 49 ++++++++++++++++------------- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/R/ncbi_download_genome.R b/R/ncbi_download_genome.R index b664a0e..c49fe29 100644 --- a/R/ncbi_download_genome.R +++ b/R/ncbi_download_genome.R @@ -1,8 +1,8 @@ #' Download Genomes from NCBI Assembly Database #' #' This function directly downloads genome data through the NCBI FTP server. -#' @param query either an object of class \code{ncbi_uid} or an integer vector -#' of NCBI Assembly UIDs. See Details for more information. +#' @param query an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or +#' an integer vector of NCBI Assembly UIDs. 
See Details for more information. #' @param type character; the file extension to download. Valid options are #' \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"}, #' \code{"feature_count"}, \code{"feature_table"}, \code{"genomic.fna"}, @@ -13,34 +13,41 @@ #' @param mirror logical; should the download directory mirror the structure of #' the FTP directory? #' @param verbose logical; should verbose messages be printed to console? -#' @details Some functions in webseq, e.g. \code{ncbi_get_uid()} or -#' \code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These -#' objects may be used directly as query input for -#' \code{ncbi_download_genome()}. It is recommended to use this approach because -#' then the function will check whether the query really contains UIDs from the -#' NCBI Assembly database and fail if not. Alternatively, you can also use a -#' character vector of UIDs as query input but in this case there will be no -#' consistency checks and the function will just attempt to interpret them as -#' NCBI Assembly UIDs. +#' @details `ncbi_get_uid()` returns an object of class `ncbi_uid`; +#' `ncbi_link_uid` returns an object of class `ncbi_uid_link`; `ncbi_link` +#' returns an object of class `ncbi_link`. These objects may be used directly +#' as query input for `ncbi_download_genome`. It is recommended to use this +#' approach. Alternatively, you can also use a character vector of UIDs as query +#' input. This approach is not recommended because there are no consistency +#' checks, the function will just attempt to interpret the query as NCBI +#' Assembly UIDs. #' @examples #' \dontrun{ -#' # Download genbank file for GCF_003007635.1. 
-#' # The function will access files within this directory: -#' # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/ +#' # Download a single genome +#' ncbi_get_uid("GCF_003007635.1", db = "assembly") |> +#' ncbi_download_genome() #' -#' uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly") -#' ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE) +#' "SAMN08619567" |> +#' ncbi_get_uid(db = "biosample") |> +#' ncbi_link_uid(to = "assembly") |> +#' ncbi_download_genome() +#' +#' "SAMN08619567" |> +#' ncbi_link(from = "biosample", to = "assembly") |> +#' ncbi_download_genome() #' -#' # Download multiple files +#' # Download multiple genomes, mirror FTP directory structure #' data(examples) -#' uids <- ncbi_get_uid(examples$assembly, db = "assembly") -#' ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE) +#' +#' examples$assembly |> +#' ncbi_get_uid(db = "assembly") |> +#' ncbi_download_genome(mirror = TRUE) #' } #' @export ncbi_download_genome <- function(query, - type = "genomic.gbff", + type = "genomic.fna", dirpath = NULL, - mirror = TRUE, + mirror = FALSE, verbose = getOption("verbose")) { type <- match.arg(type, c( "assembly_report", "assembly_stats", "cds", "feature_count", @@ -50,7 +57,19 @@ ncbi_download_genome <- function(query, if (query$db == "assembly") { assembly_uid <- query$uid } else { - stop("Query must contain NCBI Assembly UIDs.") + stop("'ncbi_uid' object must contain NCBI Assembly UIDs.") + } + } else if ("ncbi_uid_link" %in% class(query)) { + if (names(query)[2] == "assembly") { + assembly_uid <- unique(query$assembly) + } else { + stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.") + } + } else if ("ncbi_link" %in% class(query)) { + if (names(query)[2] == "assembly") { + assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid + } else { + stop("'ncbi_link' object must contain links to NCBI Assembly IDs.") } } else { assembly_uid <- query diff --git a/man/ncbi_download_genome.Rd 
b/man/ncbi_download_genome.Rd index 18a2a52..594bbdd 100644 --- a/man/ncbi_download_genome.Rd +++ b/man/ncbi_download_genome.Rd @@ -6,15 +6,15 @@ \usage{ ncbi_download_genome( query, - type = "genomic.gbff", + type = "genomic.fna", dirpath = NULL, - mirror = TRUE, + mirror = FALSE, verbose = getOption("verbose") ) } \arguments{ -\item{query}{either an object of class \code{ncbi_uid} or an integer vector -of NCBI Assembly UIDs. See Details for more information.} +\item{query}{an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or +an integer vector of NCBI Assembly UIDs. See Details for more information.} \item{type}{character; the file extension to download. Valid options are \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"}, @@ -34,28 +34,35 @@ the FTP directory?} This function directly downloads genome data through the NCBI FTP server. } \details{ -Some functions in webseq, e.g. \code{ncbi_get_uid()} or -\code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These -objects may be used directly as query input for -\code{ncbi_download_genome()}. It is recommended to use this approach because -then the function will check whether the query really contains UIDs from the -NCBI Assembly database and fail if not. Alternatively, you can also use a -character vector of UIDs as query input but in this case there will be no -consistency checks and the function will just attempt to interpret them as -NCBI Assembly UIDs. +`ncbi_get_uid()` returns an object of class `ncbi_uid`; +`ncbi_link_uid` returns an object of class `ncbi_uid_link`; `ncbi_link` +returns an object of class `ncbi_link`. These objects may be used directly +as query input for `ncbi_download_genome`. It is recommended to use this +approach. Alternatively, you can also use a character vector of UIDs as query +input. This approach is not recommended because there are no consistency +checks, the function will just attempt to interpret the query as NCBI +Assembly UIDs. 
} \examples{ \dontrun{ -# Download genbank file for GCF_003007635.1. -# The function will access files within this directory: -# ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/ +# Download a single genome +ncbi_get_uid("GCF_003007635.1", db = "assembly") |> + ncbi_download_genome() -uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly") -ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE) +"SAMN08619567" |> + ncbi_get_uid(db = "biosample") |> + ncbi_link_uid(to = "assembly") |> + ncbi_download_genome() + +"SAMN08619567" |> + ncbi_link(from = "biosample", to = "assembly") |> + ncbi_download_genome() -# Download multiple files +# Download multiple genomes, mirror FTP directory structure data(examples) -uids <- ncbi_get_uid(examples$assembly, db = "assembly") -ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE) + +examples$assembly |> + ncbi_get_uid(db = "assembly") |> + ncbi_download_genome(mirror = TRUE) } } From 107489073cc095e27067e18566aac0762951498f Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Wed, 21 Aug 2024 21:56:59 +0200 Subject: [PATCH 6/9] Enable successive ncbi_link_uid() calls --- R/ncbi_link_uid.R | 95 +++++++++++++++++++++++------ man/ncbi_link_uid.Rd | 47 +++++++++----- tests/testthat/test-ncbi_link_uid.R | 26 +++++++- 3 files changed, 131 insertions(+), 37 deletions(-) diff --git a/R/ncbi_link_uid.R b/R/ncbi_link_uid.R index ede1efa..df86f98 100644 --- a/R/ncbi_link_uid.R +++ b/R/ncbi_link_uid.R @@ -4,28 +4,47 @@ #' different databases may be linked. For example, entries in the NCBI Assembly #' database may be linked with entries in the NCBI BioSample database. This #' function attempts to link uids from one database to another. -#' @param query either an object of class \code{ncbi_uid} or an integer vector -#' of UIDs. See Details for more information. +#' @param query either an object of class `ncbi_uid` or `ncbi_uid_link`, or an +#' integer vector of UIDs. See Details for more information. 
#' @param from character; the database the queried UIDs come from. -#' \code{ncbi_dbs()} lists all available options. -#' @param to character; the database in which the function should look for links. #' \code{ncbi_dbs()} lists all available options. See Details for more #' information. +#' @param to character; the database in which the function should look for links. +#' \code{ncbi_dbs()} lists all available options. #' @param batch_size integer; the number of search terms to query at once. If #' the number of search terms is larger than \code{batch_size}, the search terms -#' are split into batches and queried separately. Not used when using web -#' history. +#' are split into batches and queried separately. #' @param verbose logical; should verbose messages be printed to the console? -#' @return A tibble with two columns. The first column contains UIDs in the -#' `from` database, the second column contains linked UIDs in the `to` database. -#' @details The function `ncbi_get_uid()` returns an object of class `ncbi_uid`. -#' This object may be used directly as query for `ncbi_link_uid()`. If query is -#' an `ncbi_uid` object, the `from` argument is optional. If `from` is not -#' specified, the function will retrieve it from the query object. However, if -#' it is specified, it must be identical to the `db` attribute of the query. +#' @return A tibble with two or more columns. When `ncbi_link_uid()` is called +#' on a `ncbi_uid` object or a vector of UIDs, the function returns a tibble +#' with exactly two columns: the first column contains UIDs in the `from` +#' database, and the second column contains linked UIDs in the `to` database. +#' However, `ncbi_link_uid()` can be called multiple times in succession. Each +#' call after the first call will add a new column to the returned tibble. +#' See Details for more information. +#' @details The function can take three query classes: It can take `ncbi_uid` +#' objects, these are returned by `ncbi_get_uid()`. 
In this case, the `from` +#' argument will be retrieved from the query object, by default. It can also +#' take `ncbi_uid_link` objects, which means `ncbi_link_uid()` can be called +#' several times in a sequence to perform a number of successive conversions. +#' When the query is an `ncbi_uid_link` object, the function will always convert +#' the UIDs in the last column of the query object, and will retrieve the `from` +#' argument from the name of the last column. This means links should always be +#' interpreted "left-to-right". Note, when tibbles are joined during subsequent +#' `ncbi_link_uid` calls they are joined using "many-to-many" relationships; see +#' `?dplyr::left_join()` for more information. Lastly, the function can also +#' take a vector of integer UIDs. #' @examples +#' # Simple call with integer UIDs #' ncbi_link_uid(5197591, "assembly", "biosample") #' ncbi_link_uid(c(1226742659, 1883410844), "protein", "nuccore") +#' +#' # Complex call with ncbi_get_uid() and several ncbi_link_uid() calls +#' "GCF_000299415.1" |> +#' ncbi_get_uid(db = "assembly") |> +#' ncbi_link_uid(to = "biosample") |> +#' ncbi_link_uid(to = "bioproject") |> +#' ncbi_link_uid(to = "pubmed") #' @export ncbi_link_uid <- function( query, @@ -46,6 +65,19 @@ ncbi_link_uid <- function( stop(msg) } } + } else if ("ncbi_uid_link" %in% class(query)) { + fromdb <- names(query)[length(names(query))] + if (is.null(from)) { + from <- fromdb + } else { + if (from != fromdb) { + msg <- paste0( + "Database for queried UIDs does not match 'from' argument.\n", + "Provide identical values (last column name) or use from = NULL (default)." 
+ ) + stop(msg) + } + } } else { if (is.null(from)) { msg <- paste0( @@ -93,12 +125,16 @@ ncbi_link_uid <- function( return(out) } if ("ncbi_uid" %in% class(query)) { - query <- query$uid + query_vector <- query$uid + } else if ("ncbi_uid_link" %in% class(query)) { + query_vector <- query[[from]] |> unique() + } else { + query_vector <- query } - if (!is.numeric(query)) { + if (!is.numeric(query_vector)) { stop("Query must be an ncbi_uid object or a numeric vector or UIDs.") } - idlist <- get_idlist(query, batch_size, verbose) + idlist <- get_idlist(query_vector, batch_size, verbose) res <- lapply(idlist, function(x) { foo_from_ids( x, @@ -108,11 +144,30 @@ ncbi_link_uid <- function( }) out <- dplyr::bind_rows(res) if ("ncbi_uid" %in% class(query)) { - out <- dplyr::left_join(tibble::tibble(from = query$uid), out, by = "query") + out <- dplyr::left_join( + tibble::tibble(query = query_vector), + out, + by = "query" + ) + names(out) <- c(from, to) + } else if ("ncbi_uid_link" %in% class(query)) { + names(out) <- c(from, to) + out <- dplyr::left_join( + query, + out, + by = from, + relationship = "many-to-many" + ) } else { - out <- dplyr::left_join(tibble::tibble(query = query), out, by = "query") + out <- dplyr::left_join( + tibble::tibble(query = query_vector), + out, + by = "query" + ) + names(out) <- c(from, to) + } + if (!"ncbi_uid_link" %in% class(out)) { + class(out) <- c("ncbi_uid_link", class(out)) } - names(out) <- c(from, to) - class(out) <- c("ncbi_uid_link", class(out)) return(out) } diff --git a/man/ncbi_link_uid.Rd b/man/ncbi_link_uid.Rd index 834e25f..e79c073 100644 --- a/man/ncbi_link_uid.Rd +++ b/man/ncbi_link_uid.Rd @@ -13,26 +13,30 @@ ncbi_link_uid( ) } \arguments{ -\item{query}{either an object of class \code{ncbi_uid} or an integer vector -of UIDs. See Details for more information.} +\item{query}{either an object of class `ncbi_uid` or `ncbi_uid_link`, or an +integer vector of UIDs. 
See Details for more information.} \item{from}{character; the database the queried UIDs come from. -\code{ncbi_dbs()} lists all available options.} - -\item{to}{character; the database in which the function should look for links. \code{ncbi_dbs()} lists all available options. See Details for more information.} +\item{to}{character; the database in which the function should look for links. +\code{ncbi_dbs()} lists all available options.} + \item{batch_size}{integer; the number of search terms to query at once. If the number of search terms is larger than \code{batch_size}, the search terms -are split into batches and queried separately. Not used when using web -history.} +are split into batches and queried separately.} \item{verbose}{logical; should verbose messages be printed to the console?} } \value{ -A tibble with two columns. The first column contains UIDs in the -`from` database, the second column contains linked UIDs in the `to` database. +A tibble with two or more columns. When `ncbi_link_uid()` is called +on a `ncbi_uid` object or a vector of UIDs, the function returns a tibble +with exactly two columns: the first column contains UIDs in the `from` +database, and the second column contains linked UIDs in the `to` database. +However, `ncbi_link_uid()` can be called multiple times in succession. Each +call after the first call will add a new column to the returned tibble. +See Details for more information. } \description{ Each entry in an NCBI database has its unique internal id. Entries in @@ -41,13 +45,28 @@ database may be linked with entries in the NCBI BioSample database. This function attempts to link uids from one database to another. } \details{ -The function `ncbi_get_uid()` returns an object of class `ncbi_uid`. -This object may be used directly as query for `ncbi_link_uid()`. If query is -an `ncbi_uid` object, the `from` argument is optional. If `from` is not -specified, the function will retrieve it from the query object. 
However, if -it is specified, it must be identical to the `db` attribute of the query. +The function can take three query classes: It can take `ncbi_uid` +objects, these are returned by `ncbi_get_uid()`. In this case, the `from` +argument will be retrieved from the query object, by default. It can also +take `ncbi_uid_link` objects, which means `ncbi_link_uid()` can be called +several times in a sequence to perform a number of successive conversions. +When the query is an `ncbi_uid_link` object, the function will always convert +the UIDs in the last column of the query object, and will retrieve the `from` +argument from the name of the last column. This means links should always be +interpreted "left-to-right". Note, when tibbles are joined during subsequent +`ncbi_link_uid` calls they are joined using "many-to-many" relationships; see +`?dplyr::left_join()` for more information. Lastly, the function can also +take a vector of integer UIDs. } \examples{ +# Simple call with integer UIDs ncbi_link_uid(5197591, "assembly", "biosample") ncbi_link_uid(c(1226742659, 1883410844), "protein", "nuccore") + +# Complex call with ncbi_get_uid() and several ncbi_link_uid() calls +"GCF_000299415.1" |> + ncbi_get_uid(db = "assembly") |> + ncbi_link_uid(to = "biosample") |> + ncbi_link_uid(to = "bioproject") |> + ncbi_link_uid(to = "pubmed") } diff --git a/tests/testthat/test-ncbi_link_uid.R b/tests/testthat/test-ncbi_link_uid.R index dc232fe..196a055 100644 --- a/tests/testthat/test-ncbi_link_uid.R +++ b/tests/testthat/test-ncbi_link_uid.R @@ -64,9 +64,9 @@ test_that("ncbi_link_uid() returns more rows when there are multiple links", { query <- "PRJEB54063" puid <- ncbi_get_uid(query, db = "bioproject") buid <- ncbi_link_uid(puid, to = "biosample") - expect_equal(dim(buid), c(2,2)) - expect_equal(buid$bioproject, c(883889, 883889)) - expect_equal(buid$biosample, c(31267349, 31250566)) + expect_equal(dim(buid), c(148,2)) + expect_equal(buid$bioproject[1:2], c(883889, 883889)) + 
expect_equal(buid$biosample[1:2], c(31267349, 31250566)) }) test_that("ncbi_link_uid() returns results for all valid queries", { @@ -84,3 +84,23 @@ test_that("ncbi_link_uid() converts UIDs to numeric without coercion to NA", { expect_equal(sum(is.na(nuccore_uid$nuccore)), 0) }) + +test_that("ncbi_link_uid() works with ncbi_uid_link objects", { + pubmed_uid <- "GCF_000299415.1" |> + ncbi_get_uid(db = "assembly") |> + ncbi_link_uid(to = "biosample") |> + ncbi_link_uid(to = "bioproject") |> + ncbi_link_uid(to = "pubmed") + + expect_true(inherits(pubmed_uid, "ncbi_uid_link")) + expect_true(inherits(pubmed_uid, "data.frame")) + expect_equal(dim(pubmed_uid), c(2,4)) + expect_equal( + names(pubmed_uid), + c("assembly", "biosample", "bioproject", "pubmed") + ) + expect_equal(pubmed_uid$assembly, c(623048, 623048)) + expect_equal(pubmed_uid$biosample, c(1730125, 1730125)) + expect_equal(pubmed_uid$bioproject, c(224116, 174686)) + expect_equal(pubmed_uid$pubmed, c(24316578, 23144412)) +}) From 6978b3e035cc25d2318d97ffc5607def6cda4b34 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Wed, 21 Aug 2024 23:02:33 +0200 Subject: [PATCH 7/9] Interpret NA strings as NA when using ncbi_get_uid() --- R/ncbi_get_uid.R | 26 ++++++++++++++++++++++---- man/ncbi_get_uid.Rd | 4 ++++ tests/testthat/test-ncbi_get_uid.R | 14 ++++++++++---- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/R/ncbi_get_uid.R b/R/ncbi_get_uid.R index a41246b..9b19314 100644 --- a/R/ncbi_get_uid.R +++ b/R/ncbi_get_uid.R @@ -12,6 +12,8 @@ #' are split into batches and queried separately. #' @param use_history logical; should the function use web history for faster #' API queries? +#' @param na_strings character; a vector of strings which should be interpreted +#' as `NA`. #' @param verbose logical; should verbose messages be printed to the console? 
#' @return An object of class \code{"ncbi_uid"} which is a list with three #' elements: @@ -33,14 +35,30 @@ ncbi_get_uid <- function( db, batch_size = 100, use_history = TRUE, + na_strings = "NA", verbose = getOption("verbose") ) { db <- match.arg(db, choices = ncbi_dbs()) - if (all(is.na(term))) { - stop("No valid search terms.") - } else if (any(is.na(term))){ - if (verbose) message("Removing NA-s from search terms.") + index <- which(term %in% na_strings) + if (length(index) > 0) { + if (verbose) { + terms_collapsed <- paste(term[index], collapse = ", ") + msg <- paste0( + "The following terms will be replaced with NAs: ", + terms_collapsed, + ". " + ) + message(msg) + } + term[index] <- NA + } + if (any(is.na(term))){ + if (verbose) message("Removing NA-s from search terms. ", appendLF = FALSE) term <- term[which(!is.na(term))] + if (verbose) message(paste0(length(term), " terms remain.")) + } + if (length(term) == 0) { + stop("No valid search terms.") } termlist <- list() if (length(term) > batch_size) { diff --git a/man/ncbi_get_uid.Rd b/man/ncbi_get_uid.Rd index 0161bf0..63c0e1d 100644 --- a/man/ncbi_get_uid.Rd +++ b/man/ncbi_get_uid.Rd @@ -9,6 +9,7 @@ ncbi_get_uid( db, batch_size = 100, use_history = TRUE, + na_strings = "NA", verbose = getOption("verbose") ) } @@ -25,6 +26,9 @@ are split into batches and queried separately.} \item{use_history}{logical; should the function use web history for faster API queries?} +\item{na_strings}{character; a vector of strings which should be interpreted +as `NA`.} + \item{verbose}{logical; should verbose messages be printed to the console?} } \value{ diff --git a/tests/testthat/test-ncbi_get_uid.R b/tests/testthat/test-ncbi_get_uid.R index a7e770d..7eb9a28 100644 --- a/tests/testthat/test-ncbi_get_uid.R +++ b/tests/testthat/test-ncbi_get_uid.R @@ -39,9 +39,10 @@ test_that("ncbi_get_uid() handles NA", { expect_true(all(c("ncbi_uid", "list") %in% class(res))) expect_equal(length(res$uid), 2) - expect_true(res_messages[1] 
== "Removing NA-s from search terms.\n") - expect_true(res_messages[2] == "Querying UIDs for batch 1. ") - expect_true(res_messages[3] == "Query successful.\n") + expect_true(res_messages[1] == "Removing NA-s from search terms. ") + expect_true(res_messages[2] == "2 terms remain.\n") + expect_true(res_messages[3] == "Querying UIDs for batch 1. ") + expect_true(res_messages[4] == "Query successful.\n") }) test_that("ncbi_get_uid() handles invalid terms", { @@ -57,8 +58,13 @@ test_that("ncbi_get_uid() handles invalid terms", { expect_equal(nrow(res$web_history), 0) }) -test_that("ncbi_get_uid works with a complex term", { +test_that("ncbi_get_uid() works with a complex term", { res <- ncbi_get_uid("Autographiviridae OR Podoviridae", db = "assembly") expect_true(length(res$uid) > 3000) }) + +# issue #80 +test_that("ncbi_get_uid() stops with an error when input is 'NA' (string)", { + expect_error(ncbi_get_uid("NA", db = "biosample")) +}) From 22378831eee5f8afc4ac514f5d036cef5891e804 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Wed, 21 Aug 2024 23:02:59 +0200 Subject: [PATCH 8/9] Update documentation for ncbi_link() --- R/ncbi_link.R | 2 ++ man/ncbi_link.Rd | 3 +++ 2 files changed, 5 insertions(+) diff --git a/R/ncbi_link.R b/R/ncbi_link.R index 6b1dd92..ec4f610 100644 --- a/R/ncbi_link.R +++ b/R/ncbi_link.R @@ -9,6 +9,8 @@ #' \code{ncbi_dbs()} lists all available options. #' @param to character; the database in which the function should look for links. #' \code{ncbi_dbs()} lists all available options. +#' @param multiple character; handling of rows in x with multiple matches in y. +#' For more information see `?dplyr::left_join()`. #' @param batch_size integer; the number of search terms to query at once. If #' the number of search terms is larger than \code{batch_size}, the search terms #' are split into batches and queried separately. 
diff --git a/man/ncbi_link.Rd b/man/ncbi_link.Rd index 3390d42..f4e1e0d 100644 --- a/man/ncbi_link.Rd +++ b/man/ncbi_link.Rd @@ -22,6 +22,9 @@ ncbi_link( \item{to}{character; the database in which the function should look for links. \code{ncbi_dbs()} lists all available options.} +\item{multiple}{character; handling of rows in x with multiple matches in y. +For more information see `?dplyr::left_join()`.} + \item{batch_size}{integer; the number of search terms to query at once. If the number of search terms is larger than \code{batch_size}, the search terms are split into batches and queried separately.} From 20ecde86519ca3e7954979dfd7fb97ed49ff9669 Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Thu, 22 Aug 2024 19:44:59 +0200 Subject: [PATCH 9/9] Make ncbi_download_genome() robust to multiple links. --- R/ncbi_download_genome.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ncbi_download_genome.R b/R/ncbi_download_genome.R index c49fe29..50d85e6 100644 --- a/R/ncbi_download_genome.R +++ b/R/ncbi_download_genome.R @@ -60,13 +60,13 @@ ncbi_download_genome <- function(query, stop("'ncbi_uid' object must contain NCBI Assembly UIDs.") } } else if ("ncbi_uid_link" %in% class(query)) { - if (names(query)[2] == "assembly") { + if (names(query)[length(names(query))] == "assembly") { assembly_uid <- unique(query$assembly) } else { stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.") } } else if ("ncbi_link" %in% class(query)) { - if (names(query)[2] == "assembly") { + if (names(query)[length(names(query))] == "assembly") { assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid } else { stop("'ncbi_link' object must contain links to NCBI Assembly IDs.")