Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update existing NCBI functions #86

Merged
merged 9 commits into from
Aug 22, 2024
63 changes: 41 additions & 22 deletions R/ncbi_download_genome.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Download Genomes from NCBI Assembly Database
#'
#' This function directly downloads genome data through the NCBI FTP server.
#' @param query either an object of class \code{ncbi_uid} or an integer vector
#' of NCBI Assembly UIDs. See Details for more information.
#' @param query an object of class `ncbi_uid`, `ncbi_uid_link`, `ncbi_link`, or
#' an integer vector of NCBI Assembly UIDs. See Details for more information.
#' @param type character; the file extension to download. Valid options are
#' \code{"assembly_report"}, \code{"assembly_stats"}, \code{"cds"},
#' \code{"feature_count"}, \code{"feature_table"}, \code{"genomic.fna"},
Expand All @@ -13,34 +13,41 @@
#' @param mirror logical; should the download directory mirror the structure of
#' the FTP directory?
#' @param verbose logical; should verbose messages be printed to console?
#' @details Some functions in webseq, e.g. \code{ncbi_get_uid()} or
#' \code{ncbi_link_uid()} return objects of class \code{"ncbi_uid"}. These
#' objects may be used directly as query input for
#' \code{ncbi_download_genome()}. It is recommended to use this approach because
#' then the function will check whether the query really contains UIDs from the
#' NCBI Assembly database and fail if not. Alternatively, you can also use a
#' character vector of UIDs as query input but in this case there will be no
#' consistency checks and the function will just attempt to interpret them as
#' NCBI Assembly UIDs.
#' @details `ncbi_get_uid()` returns an object of class `ncbi_uid`;
#' `ncbi_link_uid()` returns an object of class `ncbi_uid_link`; `ncbi_link()`
#' returns an object of class `ncbi_link`. These objects may be used directly
#' as query input for `ncbi_download_genome()`. It is recommended to use this
#' approach. Alternatively, you can also use a character vector of UIDs as query
#' input. This approach is not recommended because there are no consistency
#' checks; the function will just attempt to interpret the query as NCBI
#' Assembly UIDs.
#' @examples
#' \dontrun{
#' # Download genbank file for GCF_003007635.1.
#' # The function will access files within this directory:
#' # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/007/635/
#' # Download a single genome
#' ncbi_get_uid("GCF_003007635.1", db = "assembly") |>
#' ncbi_download_genome()
#'
#' uid <- ncbi_get_uid("GCF_003007635.1", db = "assembly")
#' ncbi_download_genome(uid, type = "genomic.gbff", verbose = TRUE)
#' "SAMN08619567" |>
#' ncbi_get_uid(db = "biosample") |>
#' ncbi_link_uid(to = "assembly") |>
#' ncbi_download_genome()
#'
#' "SAMN08619567" |>
#' ncbi_link(from = "biosample", to = "assembly") |>
#' ncbi_download_genome()
#'
#' # Download multiple files
#' # Download multiple genomes, mirror FTP directory structure
#' data(examples)
#' uids <- ncbi_get_uid(examples$assembly, db = "assembly")
#' ncbi_download_genome(uids, type = "genomic.gff", verbose = TRUE)
#'
#' examples$assembly |>
#' ncbi_get_uid(db = "assembly") |>
#' ncbi_download_genome()
#' }
#' @export
ncbi_download_genome <- function(query,
type = "genomic.gbff",
type = "genomic.fna",
dirpath = NULL,
mirror = TRUE,
mirror = FALSE,
verbose = getOption("verbose")) {
type <- match.arg(type, c(
"assembly_report", "assembly_stats", "cds", "feature_count",
Expand All @@ -50,7 +57,19 @@ ncbi_download_genome <- function(query,
if (query$db == "assembly") {
assembly_uid <- query$uid
} else {
stop("Query must contain NCBI Assembly UIDs.")
stop("'ncbi_uid' object must contain NCBI Assembly UIDs.")
}
} else if ("ncbi_uid_link" %in% class(query)) {
if (names(query)[length(names(query))] == "assembly") {
assembly_uid <- unique(query$assembly)
} else {
stop("'ncbi_uid_link' object must contain links to NCBI Assembly UIDs.")
}
} else if ("ncbi_link" %in% class(query)) {
if (names(query)[length(names(query))] == "assembly") {
assembly_uid <- ncbi_get_uid(query$assembly, db = "assembly")$uid
} else {
stop("'ncbi_link' object must contain links to NCBI Assembly IDs.")
}
} else {
assembly_uid <- query
Expand Down
26 changes: 22 additions & 4 deletions R/ncbi_get_uid.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#' are split into batches and queried separately.
#' @param use_history logical; should the function use web history for faster
#' API queries?
#' @param na_strings character; a vector of strings which should be interpreted
#' as `NA`.
#' @param verbose logical; should verbose messages be printed to the console?
#' @return An object of class \code{"ncbi_uid"} which is a list with three
#' elements:
Expand All @@ -33,14 +35,30 @@ ncbi_get_uid <- function(
db,
batch_size = 100,
use_history = TRUE,
na_strings = "NA",
verbose = getOption("verbose")
) {
db <- match.arg(db, choices = ncbi_dbs())
if (all(is.na(term))) {
stop("No valid search terms.")
} else if (any(is.na(term))){
if (verbose) message("Removing NA-s from search terms.")
index <- which(term %in% na_strings)
if (length(index) > 0) {
if (verbose) {
terms_collapsed <- paste(term[index], collapse = ", ")
msg <- paste0(
"The following terms will be replaced with NAs: ",
terms_collapsed,
". "
)
message(msg)
}
term[index] <- NA
}
if (any(is.na(term))){
if (verbose) message("Removing NA-s from search terms. ", appendLF = FALSE)
term <- term[which(!is.na(term))]
if (verbose) message(paste0(length(term), " terms remain."))
}
if (length(term) == 0) {
stop("No valid search terms.")
}
termlist <- list()
if (length(term) > batch_size) {
Expand Down
76 changes: 54 additions & 22 deletions R/ncbi_link.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#' \code{ncbi_dbs()} lists all available options.
#' @param to character; the database in which the function should look for links.
#' \code{ncbi_dbs()} lists all available options.
#' @param multiple character; handling of rows in x with multiple matches in y
#' (the `multiple` argument of `dplyr::left_join()`). For more information see
#' `?dplyr::left_join`.
#' @param batch_size integer; the number of search terms to query at once. If
#' the number of search terms is larger than \code{batch_size}, the search terms
#' are split into batches and queried separately.
Expand All @@ -25,14 +27,28 @@ ncbi_link <- function(
query,
from,
to,
multiple = "all",
batch_size = 100,
verbose = getOption("verbose")
){
f <- try(get(paste("ncbi_link", from, to , sep = "_")), silent = TRUE)
if (inherits(f, "try-error")) {
if (from == "assembly") {
if (to == "biosample") {
linkfun <- "ncbi_link_assembly_biosample"
} else {
stop("Link not supported.")
}
} else {
linkfun <- "ncbi_link_generic"
}
f(query, batch_size = batch_size, verbose = verbose)
f <- get(linkfun)
f(
query,
from = from,
to = to,
multiple = multiple,
batch_size = batch_size,
verbose = verbose
)
}

#' Convert NCBI Assembly IDs to NCBI BioSample IDs
Expand All @@ -51,9 +67,14 @@ ncbi_link <- function(
#' @noRd
ncbi_link_assembly_biosample <- function(
assembly,
from = "assembly",
to = "biosample",
multiple = "all",
batch_size,
verbose = getOption("verbose")
) {
from <- match.arg(from, "assembly")
to <- match.arg(to, "biosample")
from_uid <- ncbi_get_uid(
assembly,
db = "assembly",
Expand Down Expand Up @@ -88,46 +109,57 @@ ncbi_link_assembly_biosample <- function(
return(out)
}

#' Convert NCBI BioSample IDs to NCBI Assembly IDs
#' Convert NCBI IDs between databases
#'
#' This function converts one or more NCBI BioSample IDs to NCBI Assembly IDs.
#' @param biosample character; a vector of NCBI BioSample IDs.
#' @param batch_size integer; the number of search terms to query at once.
#' This function converts one or more NCBI IDs between databases. This generic
#' function retrieves UIDs from one database, links them to UIDs from another
#' database and then recovers the IDs from the UIDs. The function should work
#' with most links.
#' @param query character; a vector of IDs
#' @param from character; the database the queried ID-s come from.
#' \code{ncbi_dbs()} lists all available options.
#' @param to character; the database in which the function should look for links.
#' \code{ncbi_dbs()} lists all available options.
#' @param batch_size integer; the number of search terms to query at once. If
#' the number of search terms is larger than \code{batch_size}, the search terms
#' are split into batches and queried separately.
#' @param verbose logical; should verbose messages be printed to the console?
#' @return A data frame of NCBI Biosample IDs and matching Assembly IDs.
#' @examples
#' \dontrun{
#' ncbi_convert_assembly_biosample("GCF_000002435.2")
#' }
#' @return A tibble of class `ncbi_link` with two columns, named after `from`
#' and `to`.
#' @importFrom dplyr left_join
#' @importFrom tibble tibble
#' @noRd
ncbi_link_biosample_assembly <- function(
biosample,
ncbi_link_generic <- function(
query,
from,
to,
multiple = "all",
batch_size,
verbose = getOption("verbose")
) {
uid <- ncbi_get_uid(
biosample,
db = "biosample",
query,
db = from,
batch_size = batch_size,
use_history = FALSE,
verbose = verbose
)
linked_uid <- ncbi_link_uid(
uid,
to = "assembly",
to = to,
batch_size = batch_size,
verbose = verbose
)
linked_id <- tibble::tibble(
biosample = ncbi_recover_id(linked_uid$biosample, db = "biosample"),
assembly = ncbi_recover_id(linked_uid$assembly, db = "assembly")
from = ncbi_recover_id(linked_uid[[1]], db = from),
to = ncbi_recover_id(linked_uid[[2]], db = to)
)
out <- dplyr::left_join(
tibble::tibble(biosample = biosample),
tibble::tibble(from = query),
linked_id,
by = "biosample"
)
multiple = multiple,
relationship = "many-to-many"
) |> suppressMessages()
names(out) <- c(from, to)
class(out) <- c("ncbi_link", class(out))
return(out)
}
Loading
Loading