From 119fcc1f8b64e149f9ecc30f8d0b6ca97e4e31a4 Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 16 Apr 2020 09:48:34 +0100 Subject: [PATCH 01/11] Add functions to customize communication with PubChem REST-API --- R/pubchem.R | 296 +++++++++++++++++++++++++++++++++- tests/testthat/test-pubchem.R | 61 +++++++ 2 files changed, 356 insertions(+), 1 deletion(-) diff --git a/R/pubchem.R b/R/pubchem.R index 256bbb92..7e619613 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -232,7 +232,6 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) { return(out) } - #' Search synonyms in pubchem #' #' Search synonyms using PUG-REST, @@ -320,3 +319,298 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, out <- unlist(out) return(out) } + +#' Construct PubChem PUG-REST URL path +#' +#' Constructs a PUG-REST url and returns a listkey +#' @importFrom httr POST http_status +#' @param query character; comma-separated list of positive integers (e.g. +#' cid, sid, aid) or identifier strings (source, inchikey, formula). In some +#' cases only a single identifier string (name, smiles, xref; inchi, sdf by POST +#' only). +#' @param domain character; the domain of the search term. Can be one of +#' \code{"compound"}, \code{"substance"}, \code{"assay"} or +#' \code{}. See details for more information. +#' @param namespace character; the namespace within the domain. Valid values +#' depend on the domain. See details for more information. + +#' @param operation character; tells the service which part of the record to +#' retrieve. Currently, if no operation is specified at all, the default is to +#' retrieve the entire record. See details for more information. +#' @details Valid \code{namespace} values for \code{domain = "compound"} are: +#' \itemize{ +#' \item \code{"cid"}, \code{"name"}, \code{"smiles"}, \code{"inchi"}, +#' \code{"sdf"}, \code{"inchikey"}, \code{"formula"}, \code{}, +#' \code{}, \code{"listkey"}, \code{}. +#' \item \code{} is assembled as "\{\code{substructure}, +#' \code{superstructure}, \code{similarity}, \code{identity}\}/\{\code{smiles}, +#' \code{inchi}, \code{sdf}, \code{cid}\}", e.g. +#' \code{namespace = "substructure/smiles"}. +#' \item \code{} is assembled as "\code{xref}/\{\code{RegistryID}, +#' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, +#' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, +#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{namespace = "xref/RN"}. +#' \item \code{} is either \code{"fastformula"} or is assembled as +#' "\{\code{fastidentity}, \code{fastsimilarity_2d}, \code{fastsimilarity_3d}, +#' \code{fastsubstructure}, \code{fastsuperstructure}\}/\{\code{smiles}, +#' \code{smarts}, \code{inchi}, \code{sdf} or \code{cid}\}", e.g. +#' \code{namespace = "fastidentity/smiles"}. +#' } +#' @details Valid \code{namespace} values for \code{domain = "substance"} are: +#' \itemize{ +#' \item \code{"sid"}, \code{"sourceid/"}, +#' \code{"sourceall/"}, \code{name}, \code{} or +#' \code{listkey}. +#' \item \code{} is any valid PubChem depositor name. +#' } +#' @details Valid \code{namespace} values for \code{domain = "assay"} are: +#' \itemize{ +#' \item \code{"aid"}, \code{"listkey"}, \code{"type/"}, +#' \code{"sourceall/"}, \code{"target/"} or +#' \code{"activity/"}. +#' \item \code{} can be \code{all}, \code{confirmatory}, +#' \code{doseresponse}, \code{onhold}, \code{panel}, \code{rnai}, +#' \code{screening}, \code{summary}, \code{cellbased}, \code{biochemical}, +#' \code{invivo}, \code{invitro} or \code{activeconcentrationspecified}. +#' \item \code{} can be \code{gi}, \code{proteinname}, +#' \code{geneid}, \code{genesymbol} or \code{accession}. +#' } +#' @details \code{} for domain can be \code{"sources/[substance, +#' assay]"}, \code{"sourcetable"}, \code{"conformers"}, \code{"annotations/ +#' [sourcename/, heading/]"}. +#' @details Valid \code{operation} values for \code{domain = "compound"} are: +#' \itemize{ +#' \item \code{"record"}, \code{}, \code{"synonyms"}, +#' \code{"sids"}, \code{"cids"}, \code{"aids"}, \code{"assaysummary"}, +#' \code{"classification"}, \code{}, \code{"description"} or +#' \code{"conformers"}. +#' \item \code{}, \code{"description"}. +#' \item \code{} is assembled as \code{"xrefs/ +#' [comma-separated list of xrefs tags]"}. +#' } +#' @details Valid \code{operation} values for \code{domain = "assay"} are: +#' \itemize{ +#' \item \code{"record"}, \code{"concise"}, \code{"aids"}, \code{"sids"}, +#' \code{"cids"}, \code{"description"}, \code{"targets/"}, +#' \code{}, \code{"summary"} or \code{"classification"}. +#' \item \code{} can be \code{"ProteinGI"}, \code{"ProteinName"}, +#' \code{"GeneID"} or \code{"GeneSymbol"}. +#' \item \code{} is assembled as \code{"doseresponse/sid"}. +#' } +#' @note Some source names contain the "/" character, which which is +#' incompatible with the URL syntax. Replace "/" with ".". +#' @note Other special characters may need to be escaped, such as "&" should be +#' replaced by "\%26". +#' @references For more information, visit +#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} +#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} +#' @export +pc_pugrest <- function(query, domain, from, to, output) { + query <- paste(query, collapse = ",") + domain <- try(match.arg(domain, choices = c("compound", "substance", "assay", + "other")), silent = TRUE) + if (inherits(domain, "try-error")) stop("domain = ", domain, " is invalid.", + call. = FALSE) + from <- pc_validate(from, domain = domain, type ="from") + to <- pc_validate(to, domain = domain, type = "to") + output <- pc_validate(output, domain = domain, type = "output") + + url <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, query, + to, output, sep = "/") + cont <- try(jsonlite::fromJSON(url), silent = TRUE) + if (inherits(cont, "try-error")) { + stop(httr::http_status(httr::POST(url))$message) + } + return(cont) +} + +#' Validate a PubChem PUG-REST API query and construct the query URL and body +#' +#' @param query character; +#' @param domain character; +#' @param from character; +#' @param to character; +#' @param output character; +#' @return a character vector of length 1 or an error message. +#' @note When validating an entry for \code{domain = "substance"}, +#' \code{type = "from"}, the function cannot tell if a source id or a depositor +#' name is valid. If the entry starts with \code{"sourceid/"} or +#' \code{"sourceall/"} it will acept the entry. +#' @note When validating an entry for \code{domain = "assay"}, +#' \code{type = "from"}, the function cannot tell if a depositor name or an +#' activity is valid. If the entry starts with \code{"sourceall/"} or +#' \code{"activity/"} it will acept the entry. +#' @note It is possible to query multiple properties or multiple xrefs in the +#' same query. When validating an entry for \code{domain = "compound"}, +#' \code{type = "to"} if the entry starts with \code{property/} or \code{xrefs/} +#' the function will also check the comma separated values after the forward +#' slash and evaluate whether the entry can be whitelisted. +#' @references \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} +#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} +#' @noRd +#' @examples +pc_validate <- function(query, domain, from, to = "all", output) { + domain <- tolower(gsub(" *", "", domain)) + from <- tolower(gsub(" *", "", from)) + to <- tolower(gsub(" *", "", to)) + output <- tolower(gsub(" *", "", output)) + xref <- paste( + "xref", + c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", + "taxonomyic", "mimid", "geneid", "probeid", "patentid"), + sep = "/" + ) + xrefs <- c("registryid", "rn", "pubmedid", "mmdbid", "dburl", "sburl", + "proteingi", "nucleotidegi", "taxonomyid", "mimid", "geneid", + "probeid","patentid", "sourcename", "sourcecategory") + #from_choices for domain = "compound" + if(domain == "compound") { + structure_search <- expand.grid( + c("substructure", "superstructure", "similarity", "identity"), + c("smiles", "inchi", "sdf", "cid") + ) + structure_search <- with(structure_search, paste(Var1, Var2, sep = "/")) + fast_search <- expand.grid( + c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d", + "fastsubstructure", "fastsuperstructure"), + c("smiles", "smarts", "inchi", "sdf", "cid") + ) + fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")), + "fastformula") + from_choices <-c("cid", "name", "smiles", "inchi", "sdf", "inchikey", + "formula", structure_search, xref, "listkey", fast_search) + } + #from_choices for domain = "substance" + if (domain == "substance") { + #from = "sourceid/IBM", from = "sourceid/EU REGULATION (EC) No 1272/2008" + pre <- strsplit(from, "/")[[1]][1] + post <- paste(strsplit(from, "/")[[1]][-1], collapse =".") + if (pre == "sourceid") { + from <- paste(pre, post, sep = "/") + from_choices <- from + } + else { + from_choices <- c("sid", "name", "sourceall", xref, "listkey") + } + } + # from_choices for domain = "assay" + if (domain == "assay") { + from_choices <- c("aid", "listkey", "type", "sourceall", "target", + "activity") + } + # query_choices + if ("sourceall" %in% from_choices) { + # how to validate against all valid source names? + query <- gsub("/", ".", query) #ensure URL syntax + query_choice <- query + } + if ("type" %in% from_choices){ + query_choices <- c("all", "confirmatory", "doseresponse", "onhold", + "panel", "rnai", "screening", "summary", "cellbased", + "biochemical", "invivo", "invitro", + "activeconcentrationspecified") + } + if ("target" %in% from_choices){ + query_choices <- c("gi", "proteinname", "geneid", "genesymbol", "accession") + } + if (mean(c("sourceall", "type", "target") %in% + from_choices) == 0) { + query_choices <- query + } + #to_choices for domain = "compound" + if (domain == "compound") { + properties <- c("molecularformula", "molecularweight", "canonicalsmiles", + "isomericsmiles", "inchi", "inchikey", "iupacname", + "xlogp", "exactmass", "monoisotopicmass", "tpsa", + "complexity", "charge", "hbonddonorcount", + "hbondacceptorcount", "rotatablebondcount", "heavyatomcount", + "isotopeatomcount", "atomstereocount", + "definedatomstereocount", "undefinedatomstereocount", + "bondstereocount", "definedbondstereocount", + "undefinedbondstereocount", "covalentunitcount", "volume3d", + "xstericquadrupole3d", "ystericquadrupole3d", + "zstericquadrupole3d", "featurecount3d", + "featureacceptorcount3d", "featuredonorcount3d", + "featureanioncount3d", "featurecationcount3d", + "featureringcount3d", "featurehydrophobecount3d", + "conformermodelrmsd3d", "effectiverotorcount3d", + "conformercount3d", "fingerprint2d") + pre <- strsplit(to, "/")[[1]][1] + post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") + if(pre == "property") { + props <- strsplit(post, ",")[[1]] + if (mean(props %in% properties) == 1) to_choices <- to + } + if (pre == "xrefs") { + refs <- strsplit(post, ",")[[1]] + if (mean(refs %in% xrefs) == 1) to_choices <- to + } + if (sum(pre %in% c("property", "xrefs")) == 0) { + to_choices <- c("record", "synonyms", "sids", "cids", "aids", + "assaysummary", "classification", "description", + "conformers") + } + } + #to_choices for domain = "substance" + if (domain == "substance") { + pre <- strsplit(to, "/")[[1]][1] + post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") + if (pre == "xrefs") { + refs <- strsplit(post, ",")[[1]] + if (mean(refs %in% xrefs) == 1) to_choices <- to + } + else { + to_choices <- c("record", "synonyms", "sids", "cids", "aids", + "assaysummary", "classification", "description") + } + } + if (domain == "assay") { + pre <- strsplit(to, "/")[[1]][1] + post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") + if (pre == "targets") { + tgts <- c("proteingi", "proteinname", "geneid", "genesymbol") + targets <- strsplit(post, ",")[[1]] + if (mean(targets %in% tgts) == 1) to_choices <- to + } + else { + to_choices <- c("record", "concise", "aids", "sids", "cids", + "description", "doseresponse/sid", "summary", + "classification") + } + } + if (strsplit(output, "?")[[1]][1] == "jsonp") output_choices <- output + else output_choices <- c("xml", "asnt", "asnb", "json", "sdf", "csv", + "png", "txt") + for (i in query) { + if (i %in% query_choices == FALSE) { + stop(paste0("query = ", i, " is invalid.")) + } + } + if (from %in% from_choices == FALSE) { + stop(paste0("from = ", from, " is invalid.")) + } + if (to %in% to_choices == FALSE & to != "all") { + stop(paste0("to = ", to, " is invalid.")) + } + if (output %in% output_choices == FALSE) { + stop(paste0("output = ", output, " is invalid.")) + } + if (to == "all") { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, + output, sep = "/") + } + else { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, to, + output, sep = "/") + } + qurl <- URLencode(qurl) + body <- paste0(strsplit(from, "/")[[1]][1], "=", paste(query, collapse = ",")) + return(list(qurl = qurl, body = body)) +} diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index b28f6ca9..7f9e5ba6 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -47,3 +47,64 @@ test_that("cid integration tests", { expect_true(is.na(pc_prop(NA, properties = "CanonicalSmiles", verbose = FALSE))) }) + +test_that("pc_prurl", { + #name/names to cid + + a <- pc_prurl(2244, domain = "compound", from = "cid", + to = "property/MolecularFormula", output = "JSON") + expect_equal(strsplit(a$qurl, "pug/")[[1]][2], + "compound/cid/property/molecularformula/json") + expect_equal(a$body, "cid=2244") + b <- pc_prurl(c(1234,2244), domain = "compound", from = "cid", + to = "property/MolecularFormula", output = "JSON") + expect_equal(strsplit(b$qurl, "pug/")[[1]][2], + "compound/cid/property/molecularformula/json") + expect_equal(b$body, "cid=1234,2244") + c <- pc_prurl("5F1CA2B314D35F28C7F94168627B29E3", domain = "substance", + from = "sourceid/ibm", output = "ASNT") + expect_equal(strsplit(c$qurl, "pug/")[[1]][2], + "substance/sourceid/ibm/asnt") + expect_equal(c$body, "sourceid=5F1CA2B314D35F28C7F94168627B29E3") + d <- pc_prurl(747285, domain = "substance", + from = "sourceid/dtp/nci", output = "sdf") + expect_equal(strsplit(d$qurl, "pug/")[[1]][2], + "substance/sourceid/dtp.nci/sdf") + expect_equal(d$body, "sourceid=747285") + e <- pc_prurl(747285, domain = "substance", + from = "sourceid/dtp/nci", output = "png") + expect_equal(strsplit(e$qurl, "pug/")[[1]][2], + "substance/sourceid/dtp.nci/png") + expect_equal(e$body, "sourceid=747285") + f <- pc_prurl(2244, domain = "compound", + from = "cid", output = "sdf") + expect_equal(strsplit(f$qurl, "pug/")[[1]][2], + "compound/cid/sdf") + expect_equal(f$body, "cid = 2244") + g <- pc_prurl("aspirin", domain = "compound", + from = "name", output = "json") + expect_equal(strsplit(g$qurl, "pug/")[[1]][2], + "compound/name/json") + expect_equal(g$body, "name = aspirin") + h <- pc_prurl(c(1,2,3,4,5), domain = "compound", from = "cid", + to = "property/molecularformula,molecularweight", + output = "xml") + expect_equal(strsplit(h$qurl, "pug/")[[1]][2], + "compound/cid/property/molecularformula,molecularweight/xml") + expect_equal(h$body, "cid=1,2,3,4,5") + i <- pc_prurl("acetic acid", domain = "compound", from = "name", + to = "cids", + output = "json") + expect_equal(strsplit(i$qurl, "pug/")[[1]][2], + "compound/name/cids/json") + expect_equal(h$body, "name=acetic acid") + + + +}) + +test_that("pc_pugrest", { + #pc_pugrest(176, "compound", "cid", "synonyms", "JSON") + #pc_pugrest("EU REGULATION (EC) No 1272/2008", substance, ) + #pc_pugrest("DC Chemicals", "substance", "sourceall", "sids", "JSON") +}) From 419c6927cee15c64c23ebe497332dfe2402f7192 Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 16 Apr 2020 16:23:04 +0100 Subject: [PATCH 02/11] New functions for the PubChem PUG-REST API --- NAMESPACE | 1 + R/pubchem.R | 144 ++++++++++++++++++++------------ man/nist_ri.Rd | 10 +-- man/pc_pugrest.Rd | 153 ++++++++++++++++++++++++++++++++++ tests/testthat/test-pubchem.R | 26 +++--- 5 files changed, 262 insertions(+), 72 deletions(-) create mode 100644 man/pc_pugrest.Rd diff --git a/NAMESPACE b/NAMESPACE index d4eb3573..0510f292 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ export(opsin_query) export(pan_query) export(parse_mol) export(pc_prop) +export(pc_pugrest) export(pc_synonyms) export(ping_cs) export(ping_pan) diff --git a/R/pubchem.R b/R/pubchem.R index 7e619613..0a4bef95 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -320,24 +320,40 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, return(out) } -#' Construct PubChem PUG-REST URL path +#' Search PubChem through highly customizable requests #' -#' Constructs a PUG-REST url and returns a listkey +#' @description This function gives you more control over your communications +#' with the PubChem PUG-REST webservice. This documentation page was created +#' from the original PubChem PUG-REST specification document which can be found +#' at \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. #' @importFrom httr POST http_status -#' @param query character; comma-separated list of positive integers (e.g. -#' cid, sid, aid) or identifier strings (source, inchikey, formula). In some -#' cases only a single identifier string (name, smiles, xref; inchi, sdf by POST -#' only). +#' @param query character; search term. A comma-separated list of positive +#' integers (e.g. cid, sid, aid) or identifier strings (source, inchikey, +#' formula). In some cases only a single identifier string (name, smiles, xref; +#' inchi, sdf by POST only). #' @param domain character; the domain of the search term. Can be one of -#' \code{"compound"}, \code{"substance"}, \code{"assay"} or -#' \code{}. See details for more information. -#' @param namespace character; the namespace within the domain. Valid values -#' depend on the domain. See details for more information. - -#' @param operation character; tells the service which part of the record to -#' retrieve. Currently, if no operation is specified at all, the default is to -#' retrieve the entire record. See details for more information. -#' @details Valid \code{namespace} values for \code{domain = "compound"} are: +#' \code{"compound"}, \code{"substance"}, \code{"assay"}, +#' \code{"sources/[substance, assay]"}, \code{"sourcetable"}, +#' \code{"conformers"}, +#' \code{"annotations/[sourcename/|heading/]"}. See +#' details for more information. +#' @param from character; the type of the search term, refered to as +#' \code{namespace} within the reference. Valid values depend on the domain. +#' See details for more information. +#' @param to character; refered to as \code{operations} within the reference. +#' Tells the service which part of the record to retrieve. Currently, if no +#' operation is specified at all, the default is to retrieve the entire record. +#' See details for more information. +#' @param output character; the format of the output. Can be one of +#' \code{"XML"}, \code{"ASNT"}, \code{"ASNB"}, \code{"JSON"}, +#' \code{"JSONP[?callback=]"}, \code{"SDF"}, \code{"CSV"}, +#' \code{"PNG"}, \code{"TXT"}. Note that not all formats are applicable to the +#' results of all operations; one cannot, for example, retrieve a whole compound +#' record as CSV or a property table as SDF. TXT output is only available in a +#' restricted set of cases where all the information is the same – for example, +#' synonyms for a single CID where there is one synonym per line. +#' @return a html POST response. +#' @details Valid \code{from} values for \code{domain = "compound"} are: #' \itemize{ #' \item \code{"cid"}, \code{"name"}, \code{"smiles"}, \code{"inchi"}, #' \code{"sdf"}, \code{"inchikey"}, \code{"formula"}, \code{}, @@ -345,25 +361,26 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \item \code{} is assembled as "\{\code{substructure}, #' \code{superstructure}, \code{similarity}, \code{identity}\}/\{\code{smiles}, #' \code{inchi}, \code{sdf}, \code{cid}\}", e.g. -#' \code{namespace = "substructure/smiles"}. +#' \code{from = "substructure/smiles"}. #' \item \code{} is assembled as "\code{xref}/\{\code{RegistryID}, #' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, #' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, -#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{namespace = "xref/RN"}. +#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"}. is +#' not case sensitive. #' \item \code{} is either \code{"fastformula"} or is assembled as #' "\{\code{fastidentity}, \code{fastsimilarity_2d}, \code{fastsimilarity_3d}, #' \code{fastsubstructure}, \code{fastsuperstructure}\}/\{\code{smiles}, #' \code{smarts}, \code{inchi}, \code{sdf} or \code{cid}\}", e.g. -#' \code{namespace = "fastidentity/smiles"}. +#' \code{from = "fastidentity/smiles"}. #' } -#' @details Valid \code{namespace} values for \code{domain = "substance"} are: +#' @details Valid \code{from} values for \code{domain = "substance"} are: #' \itemize{ #' \item \code{"sid"}, \code{"sourceid/"}, #' \code{"sourceall/"}, \code{name}, \code{} or #' \code{listkey}. #' \item \code{} is any valid PubChem depositor name. #' } -#' @details Valid \code{namespace} values for \code{domain = "assay"} are: +#' @details Valid \code{from} values for \code{domain = "assay"} are: #' \itemize{ #' \item \code{"aid"}, \code{"listkey"}, \code{"type/"}, #' \code{"sourceall/"}, \code{"target/"} or @@ -378,16 +395,34 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' @details \code{} for domain can be \code{"sources/[substance, #' assay]"}, \code{"sourcetable"}, \code{"conformers"}, \code{"annotations/ #' [sourcename/, heading/]"}. -#' @details Valid \code{operation} values for \code{domain = "compound"} are: +#' @details Valid \code{to} values for \code{domain = "compound"} are: #' \itemize{ #' \item \code{"record"}, \code{}, \code{"synonyms"}, #' \code{"sids"}, \code{"cids"}, \code{"aids"}, \code{"assaysummary"}, #' \code{"classification"}, \code{}, \code{"description"} or #' \code{"conformers"}. -#' \item \code{} is assembled as \code{"property/ +#' [comma-separated list of ]"}. +#' \item Valid \code{} are \code{"MolecularFormula"}, +#' \code{"MolecularWeight"}, \code{"CanonicalSMILES"}, \code{"IsomericSMILES"}, +#' \code{"InChI"}, \code{"InChIKey"}, \code{"IUPACname"}, \code{"XlogP"}, +#' \code{"ExactMass"}, \code{"MonoisotopicMass"}, \code{"TPSA"}, +#' \code{"Complexity"}, \code{"Charge"}, \code{"HBondDonorCount"}, +#' \code{"HBondAcceptorCount"}, \code{"RotatableBondCount"}, +#' \code{"HeavyAtomCount"}, \code{"IsotopeAtomCount"}, \code{"AtomStereoCount"}, +#' \code{"DefinedAtomStereoCount"}, \code{"UndefinedAtomStereoCount"}, +#' \code{"BondStereoCount"}, \code{"DefinedBondStereoCount"}, +#' \code{"UndefinedBondStereoCount"}, \code{"CovalentUnitCount"}, +#' \code{"Volume3D"}, \code{"XStericQuadrupole3D"}, +#' \code{"YStericQuadrupole3D"}, \code{"ZStericQuadrupole3D"}, +#' \code{"FeatureCount3D"}, \code{"FeatureAcceptorCount3D"}, +#' \code{"FeatureDonorCount3D"}, \code{"FeatureAnionCount3D"}, +#' \code{"FeatureCationCount3D"}, \code{"FeatureRingCount3D"}, +#' \code{"FeatureHydrophobeCount3D"}, \code{"ConformerModelRMSD3D"}, +#' \code{"EffectiveRotorCount3D"}, \code{"ConformerCount3D"}, +#' \code{"Fingerprint2D"}. are not case sensitive. #' } -#' @details Valid \code{operation} values for \code{domain = "substance"} are: +#' @details Valid \code{to} values for \code{domain = "substance"} are: #' \itemize{ #' \item \code{"record"}, \code{"synonyms"}, \code{"sids"}, \code{"cids"}, #' \code{"aids"}, \code{"assaysummary"}, \code{"classification"}, @@ -395,7 +430,7 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \item \code{} is assembled as \code{"xrefs/ #' [comma-separated list of xrefs tags]"}. #' } -#' @details Valid \code{operation} values for \code{domain = "assay"} are: +#' @details Valid \code{to} values for \code{domain = "assay"} are: #' \itemize{ #' \item \code{"record"}, \code{"concise"}, \code{"aids"}, \code{"sids"}, #' \code{"cids"}, \code{"description"}, \code{"targets/"}, @@ -404,49 +439,50 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \code{"GeneID"} or \code{"GeneSymbol"}. #' \item \code{} is assembled as \code{"doseresponse/sid"}. #' } -#' @note Some source names contain the "/" character, which which is -#' incompatible with the URL syntax. Replace "/" with ".". #' @note Other special characters may need to be escaped, such as "&" should be #' replaced by "\%26". #' @references For more information, visit #' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} #' @export -pc_pugrest <- function(query, domain, from, to, output) { - query <- paste(query, collapse = ",") - domain <- try(match.arg(domain, choices = c("compound", "substance", "assay", - "other")), silent = TRUE) - if (inherits(domain, "try-error")) stop("domain = ", domain, " is invalid.", - call. = FALSE) - from <- pc_validate(from, domain = domain, type ="from") - to <- pc_validate(to, domain = domain, type = "to") - output <- pc_validate(output, domain = domain, type = "output") - - url <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, query, - to, output, sep = "/") - cont <- try(jsonlite::fromJSON(url), silent = TRUE) - if (inherits(cont, "try-error")) { - stop(httr::http_status(httr::POST(url))$message) +pc_pugrest <- function(query, domain, from, to = "all", output) { + request <- pc_validate(query, domain, from, to, output) + Sys.sleep(rgamma(1, shape = 15, scale = 1/10)) + response <- try(POST(request$qurl, body = request$body), silent = TRUE) + if (response$status_code != 200) { + stop(httr::http_status(response)$message) } - return(cont) + return(response) } #' Validate a PubChem PUG-REST API query and construct the query URL and body #' +#' @description This function validates a PubChem PUG-REST API query according +#' to teh PubChem PUG-REST specification document, which can be found at +#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. If a query argument is +#' invalid the function prints an informative error message. If all query +#' arguments are valid, the function returns the query url and body that can +#' then be used in http requests. See \code{\link{pc_pugrest}} for lists of +#' valid values for each argument. #' @param query character; -#' @param domain character; -#' @param from character; -#' @param to character; +#' @param domain character; \code{"compound"}, \code{"substance"}, etc. +#' @param from character; mentioned as \code{"namespace"} in the PubChem +#' PUG-REST specification document. +#' @param to character; mentioned as \code{"operations"} in the PubChem +#' PUG-REST specification document. #' @param output character; -#' @return a character vector of length 1 or an error message. -#' @note When validating an entry for \code{domain = "substance"}, -#' \code{type = "from"}, the function cannot tell if a source id or a depositor +#' @return a list of two elements, \code{url} and \code{body}. +#' @note The function cannot tell if a source id or a depositor #' name is valid. If the entry starts with \code{"sourceid/"} or #' \code{"sourceall/"} it will acept the entry. #' @note When validating an entry for \code{domain = "assay"}, #' \code{type = "from"}, the function cannot tell if a depositor name or an #' activity is valid. If the entry starts with \code{"sourceall/"} or #' \code{"activity/"} it will acept the entry. +#' @note Some source names contain the "/" character, which which is +#' incompatible with the URL syntax, e.g. EU REGULATION (EC) No 1272/2008. +#' The functions attempts to convert "/" to "." as recommended by the +#' specification document. #' @note It is possible to query multiple properties or multiple xrefs in the #' same query. When validating an entry for \code{domain = "compound"}, #' \code{type = "to"} if the entry starts with \code{property/} or \code{xrefs/} @@ -454,8 +490,8 @@ pc_pugrest <- function(query, domain, from, to, output) { #' slash and evaluate whether the entry can be whitelisted. #' @references \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} +#' @seealso \code{\link{pc_pugrest}} #' @noRd -#' @examples pc_validate <- function(query, domain, from, to = "all", output) { domain <- tolower(gsub(" *", "", domain)) from <- tolower(gsub(" *", "", from)) @@ -464,12 +500,12 @@ pc_validate <- function(query, domain, from, to = "all", output) { xref <- paste( "xref", c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", - "taxonomyic", "mimid", "geneid", "probeid", "patentid"), + "taxonomyid", "mimid", "geneid", "probeid", "patentid"), sep = "/" ) - xrefs <- c("registryid", "rn", "pubmedid", "mmdbid", "dburl", "sburl", - "proteingi", "nucleotidegi", "taxonomyid", "mimid", "geneid", - "probeid","patentid", "sourcename", "sourcecategory") + xrefs <- c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", + "nucleotidegi", "taxonomyid", "mimid", "geneid", "probeid", + "patentid", "dburl", "sburl", "sourcename", "sourcecategory") #from_choices for domain = "compound" if(domain == "compound") { structure_search <- expand.grid( diff --git a/man/nist_ri.Rd b/man/nist_ri.Rd index 0afbfead..ff0b904f 100644 --- a/man/nist_ri.Rd +++ b/man/nist_ri.Rd @@ -18,17 +18,17 @@ nist_ri( \item{from}{character; type of search term. can be one of \code{"name"}, \code{"inchi"}, \code{"inchikey"}, or \code{"cas"}. Using an identifier is -preferred to \code{"name"} since \code{NA} is returned in the even of +preferred to \code{"name"} since \code{NA} is returned in the event of multiple matches to a query.} \item{type}{Retention index type. One of \code{"kovats"}, \code{"linear"}, \code{"alkane"}, or \code{"lee"}. See details for more.} -\item{polarity}{Column polarity. One of "polar" or "non-polar" to get RIs -calculated for polar or non-polar columns.} +\item{polarity}{Column polarity. One of \code{"polar"} or \code{"non-polar"} +to get RIs calculated for polar or non-polar columns.} -\item{temp_prog}{Temperature program. One of "isothermal", "ramp", or -"custom".} +\item{temp_prog}{Temperature program. One of \code{"isothermal"}, +\code{"ramp"}, or \code{"custom"}.} \item{cas}{deprecated. Use \code{query} instead.} } diff --git a/man/pc_pugrest.Rd b/man/pc_pugrest.Rd new file mode 100644 index 00000000..e30861cf --- /dev/null +++ b/man/pc_pugrest.Rd @@ -0,0 +1,153 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pubchem.R +\name{pc_pugrest} +\alias{pc_pugrest} +\title{Search PubChem through highly customizable requests} +\usage{ +pc_pugrest(query, domain, from, to = "all", output) +} +\arguments{ +\item{query}{character; search term. A comma-separated list of positive +integers (e.g. cid, sid, aid) or identifier strings (source, inchikey, +formula). In some cases only a single identifier string (name, smiles, xref; +inchi, sdf by POST only).} + +\item{domain}{character; the domain of the search term. Can be one of +\code{"compound"}, \code{"substance"}, \code{"assay"}, +\code{"sources/[substance, assay]"}, \code{"sourcetable"}, +\code{"conformers"}, +\code{"annotations/[sourcename/|heading/]"}. See +details for more information.} + +\item{from}{character; the type of the search term, refered to as +\code{namespace} within the reference. Valid values depend on the domain. +See details for more information.} + +\item{to}{character; refered to as \code{operations} within the reference. +Tells the service which part of the record to retrieve. Currently, if no +operation is specified at all, the default is to retrieve the entire record. +See details for more information.} + +\item{output}{character; the format of the output. Can be one of +\code{"XML"}, \code{"ASNT"}, \code{"ASNB"}, \code{"JSON"}, +\code{"JSONP[?callback=]"}, \code{"SDF"}, \code{"CSV"}, +\code{"PNG"}, \code{"TXT"}. Note that not all formats are applicable to the +results of all operations; one cannot, for example, retrieve a whole compound +record as CSV or a property table as SDF. TXT output is only available in a +restricted set of cases where all the information is the same – for example, +synonyms for a single CID where there is one synonym per line.} +} +\value{ +a html POST response. +} +\description{ +This function gives you more control over your communications +with the PubChem PUG-REST webservice. This documentation page was created +from the original PubChem PUG-REST specification document which can be found +at \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. +} +\details{ +Valid \code{from} values for \code{domain = "compound"} are: +\itemize{ +\item \code{"cid"}, \code{"name"}, \code{"smiles"}, \code{"inchi"}, +\code{"sdf"}, \code{"inchikey"}, \code{"formula"}, \code{}, +\code{}, \code{"listkey"}, \code{}. +\item \code{} is assembled as "\{\code{substructure}, +\code{superstructure}, \code{similarity}, \code{identity}\}/\{\code{smiles}, +\code{inchi}, \code{sdf}, \code{cid}\}", e.g. +\code{from = "substructure/smiles"}. +\item \code{} is assembled as "\code{xref}/\{\code{RegistryID}, +\code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, +\code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, +\code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"}. is +not case sensitive. +\item \code{} is either \code{"fastformula"} or is assembled as +"\{\code{fastidentity}, \code{fastsimilarity_2d}, \code{fastsimilarity_3d}, +\code{fastsubstructure}, \code{fastsuperstructure}\}/\{\code{smiles}, +\code{smarts}, \code{inchi}, \code{sdf} or \code{cid}\}", e.g. +\code{from = "fastidentity/smiles"}. +} + +Valid \code{from} values for \code{domain = "substance"} are: +\itemize{ +\item \code{"sid"}, \code{"sourceid/"}, +\code{"sourceall/"}, \code{name}, \code{} or +\code{listkey}. +\item \code{} is any valid PubChem depositor name. +} + +Valid \code{from} values for \code{domain = "assay"} are: +\itemize{ +\item \code{"aid"}, \code{"listkey"}, \code{"type/"}, +\code{"sourceall/"}, \code{"target/"} or +\code{"activity/"}. +\item \code{} can be \code{all}, \code{confirmatory}, +\code{doseresponse}, \code{onhold}, \code{panel}, \code{rnai}, +\code{screening}, \code{summary}, \code{cellbased}, \code{biochemical}, +\code{invivo}, \code{invitro} or \code{activeconcentrationspecified}. +\item \code{} can be \code{gi}, \code{proteinname}, +\code{geneid}, \code{genesymbol} or \code{accession}. +} + +\code{} for domain can be \code{"sources/[substance, +assay]"}, \code{"sourcetable"}, \code{"conformers"}, \code{"annotations/ +[sourcename/, heading/]"}. + +Valid \code{to} values for \code{domain = "compound"} are: +\itemize{ +\item \code{"record"}, \code{}, \code{"synonyms"}, +\code{"sids"}, \code{"cids"}, \code{"aids"}, \code{"assaysummary"}, +\code{"classification"}, \code{}, \code{"description"} or +\code{"conformers"}. +\item \code{} is assembled as \code{"property/ +[comma-separated list of ]"}. +\item Valid \code{} are \code{"MolecularFormula"}, +\code{"MolecularWeight"}, \code{"CanonicalSMILES"}, \code{"IsomericSMILES"}, +\code{"InChI"}, \code{"InChIKey"}, \code{"IUPACname"}, \code{"XlogP"}, +\code{"ExactMass"}, \code{"MonoisotopicMass"}, \code{"TPSA"}, +\code{"Complexity"}, \code{"Charge"}, \code{"HBondDonorCount"}, +\code{"HBondAcceptorCount"}, \code{"RotatableBondCount"}, +\code{"HeavyAtomCount"}, \code{"IsotopeAtomCount"}, \code{"AtomStereoCount"}, +\code{"DefinedAtomStereoCount"}, \code{"UndefinedAtomStereoCount"}, +\code{"BondStereoCount"}, \code{"DefinedBondStereoCount"}, +\code{"UndefinedBondStereoCount"}, \code{"CovalentUnitCount"}, +\code{"Volume3D"}, \code{"XStericQuadrupole3D"}, +\code{"YStericQuadrupole3D"}, \code{"ZStericQuadrupole3D"}, +\code{"FeatureCount3D"}, \code{"FeatureAcceptorCount3D"}, +\code{"FeatureDonorCount3D"}, \code{"FeatureAnionCount3D"}, +\code{"FeatureCationCount3D"}, \code{"FeatureRingCount3D"}, +\code{"FeatureHydrophobeCount3D"}, \code{"ConformerModelRMSD3D"}, +\code{"EffectiveRotorCount3D"}, \code{"ConformerCount3D"}, +\code{"Fingerprint2D"}. are not case sensitive. +} + +Valid \code{to} values for \code{domain = "substance"} are: +\itemize{ +\item \code{"record"}, \code{"synonyms"}, \code{"sids"}, \code{"cids"}, +\code{"aids"}, \code{"assaysummary"}, \code{"classification"}, +\code{}, \code{"description"}. +\item \code{} is assembled as \code{"xrefs/ +[comma-separated list of xrefs tags]"}. +} + +Valid \code{to} values for \code{domain = "assay"} are: +\itemize{ +\item \code{"record"}, \code{"concise"}, \code{"aids"}, \code{"sids"}, +\code{"cids"}, \code{"description"}, \code{"targets/"}, +\code{}, \code{"summary"} or \code{"classification"}. +\item \code{} can be \code{"ProteinGI"}, \code{"ProteinName"}, +\code{"GeneID"} or \code{"GeneSymbol"}. +\item \code{} is assembled as \code{"doseresponse/sid"}. +} +} +\note{ +Other special characters may need to be escaped, such as "&" should be +replaced by "\%26". +} +\references{ +For more information, visit +\url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} +} +\author{ +Tamás Stirling, \email{stirling.tamas@gmail.com} +} diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index 7f9e5ba6..dffa100a 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -48,56 +48,56 @@ test_that("cid integration tests", { verbose = FALSE))) }) -test_that("pc_prurl", { +test_that("pc_validate", { #name/names to cid - a <- pc_prurl(2244, domain = "compound", from = "cid", + a <- pc_validate(2244, domain = "compound", from = "cid", to = "property/MolecularFormula", output = "JSON") expect_equal(strsplit(a$qurl, "pug/")[[1]][2], "compound/cid/property/molecularformula/json") expect_equal(a$body, "cid=2244") - b <- pc_prurl(c(1234,2244), domain = "compound", from = "cid", + b <- pc_validate(c(1234,2244), domain = "compound", from = "cid", to = "property/MolecularFormula", output = "JSON") expect_equal(strsplit(b$qurl, "pug/")[[1]][2], "compound/cid/property/molecularformula/json") expect_equal(b$body, "cid=1234,2244") - c <- pc_prurl("5F1CA2B314D35F28C7F94168627B29E3", domain = "substance", + c <- pc_validate("5F1CA2B314D35F28C7F94168627B29E3", domain = "substance", from = "sourceid/ibm", output = "ASNT") expect_equal(strsplit(c$qurl, "pug/")[[1]][2], "substance/sourceid/ibm/asnt") expect_equal(c$body, "sourceid=5F1CA2B314D35F28C7F94168627B29E3") - d <- pc_prurl(747285, domain = "substance", + d <- pc_validate(747285, domain = "substance", from = "sourceid/dtp/nci", output = "sdf") expect_equal(strsplit(d$qurl, "pug/")[[1]][2], "substance/sourceid/dtp.nci/sdf") expect_equal(d$body, "sourceid=747285") - e <- pc_prurl(747285, domain = "substance", + e <- pc_validate(747285, domain = "substance", from = "sourceid/dtp/nci", output = "png") expect_equal(strsplit(e$qurl, "pug/")[[1]][2], "substance/sourceid/dtp.nci/png") expect_equal(e$body, "sourceid=747285") - f <- pc_prurl(2244, domain = "compound", + f <- pc_validate(2244, domain = "compound", from = "cid", output = "sdf") expect_equal(strsplit(f$qurl, "pug/")[[1]][2], "compound/cid/sdf") - expect_equal(f$body, "cid = 2244") - g <- pc_prurl("aspirin", domain = "compound", + expect_equal(f$body, "cid=2244") + g <- pc_validate("aspirin", domain = "compound", from = "name", output = "json") expect_equal(strsplit(g$qurl, "pug/")[[1]][2], "compound/name/json") - expect_equal(g$body, "name = aspirin") - h <- pc_prurl(c(1,2,3,4,5), domain = "compound", from = "cid", + expect_equal(g$body, "name=aspirin") + h <- pc_validate(c(1,2,3,4,5), domain = "compound", from = "cid", to = "property/molecularformula,molecularweight", output = "xml") expect_equal(strsplit(h$qurl, "pug/")[[1]][2], "compound/cid/property/molecularformula,molecularweight/xml") expect_equal(h$body, "cid=1,2,3,4,5") - i <- pc_prurl("acetic acid", domain = "compound", from = "name", + i <- pc_validate("acetic acid", domain = "compound", from = "name", to = "cids", output = "json") expect_equal(strsplit(i$qurl, "pug/")[[1]][2], "compound/name/cids/json") - expect_equal(h$body, "name=acetic acid") + expect_equal(i$body, "name=acetic acid") From c13b7e1cf22e1d34999edfb048d53eb9c29eb8bb Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 16 Apr 2020 16:36:53 +0100 Subject: [PATCH 03/11] Updates documentation for pc_pugrest --- R/pubchem.R | 6 +++--- man/pc_pugrest.Rd | 8 +++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/R/pubchem.R b/R/pubchem.R index 0a4bef95..88e0de91 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -378,7 +378,9 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \item \code{"sid"}, \code{"sourceid/"}, #' \code{"sourceall/"}, \code{name}, \code{} or #' \code{listkey}. -#' \item \code{} is any valid PubChem depositor name. +#' \item \code{} is any valid PubChem depositor name. Valid names +#' can be found at \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. Special +#' characters may need to be escaped, such as "&" should be replaced by "\%26". #' } #' @details Valid \code{from} values for \code{domain = "assay"} are: #' \itemize{ @@ -439,8 +441,6 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \code{"GeneID"} or \code{"GeneSymbol"}. #' \item \code{} is assembled as \code{"doseresponse/sid"}. #' } -#' @note Other special characters may need to be escaped, such as "&" should be -#' replaced by "\%26". #' @references For more information, visit #' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} diff --git a/man/pc_pugrest.Rd b/man/pc_pugrest.Rd index e30861cf..d2c07ddc 100644 --- a/man/pc_pugrest.Rd +++ b/man/pc_pugrest.Rd @@ -73,7 +73,9 @@ Valid \code{from} values for \code{domain = "substance"} are: \item \code{"sid"}, \code{"sourceid/"}, \code{"sourceall/"}, \code{name}, \code{} or \code{listkey}. -\item \code{} is any valid PubChem depositor name. +\item \code{} is any valid PubChem depositor name. Valid names +can be found at \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. Special +characters may need to be escaped, such as "&" should be replaced by "\%26". } Valid \code{from} values for \code{domain = "assay"} are: @@ -140,10 +142,6 @@ Valid \code{to} values for \code{domain = "assay"} are: \item \code{} is assembled as \code{"doseresponse/sid"}. } } -\note{ -Other special characters may need to be escaped, such as "&" should be -replaced by "\%26". -} \references{ For more information, visit \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} From 87352369cccd01d96216d07d775bdeaebeceec02 Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 16 Apr 2020 17:30:19 +0100 Subject: [PATCH 04/11] Multiple ways to post http requests --- R/pubchem.R | 50 ++++++++++++++++++++++++++++++++++++++--------- man/pc_pugrest.Rd | 2 +- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/R/pubchem.R b/R/pubchem.R index 88e0de91..503ee399 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -445,11 +445,21 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, #' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} #' @export -pc_pugrest <- function(query, domain, from, to = "all", output) { - request <- pc_validate(query, domain, from, to, output) +pc_pugrest <- function(query, + domain, + from, + to = "all", + output, + single_url = FALSE) { + request <- pc_validate(query, domain, from, to, output, single_url) Sys.sleep(rgamma(1, shape = 15, scale = 1/10)) - response <- try(POST(request$qurl, body = request$body), silent = TRUE) - if (response$status_code != 200) { + if (single_url == TRUE){ + response <- try(POST(request$qurl), silent = TRUE) + } + else { + response <- try(POST(request$qurl, body = request$body), silent = TRUE) + } + if (response$status_code %in% c(200,202) == FALSE) { stop(httr::http_status(response)$message) } return(response) @@ -492,7 +502,12 @@ pc_pugrest <- function(query, domain, from, to = "all", output) { #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} #' @seealso \code{\link{pc_pugrest}} #' @noRd -pc_validate <- function(query, domain, from, to = "all", output) { +pc_validate <- function(query, + domain, + from, + to = "all", + output, + single_url = FALSE) { domain <- tolower(gsub(" *", "", domain)) from <- tolower(gsub(" *", "", from)) to <- tolower(gsub(" *", "", to)) @@ -639,14 +654,31 @@ pc_validate <- function(query, domain, from, to = "all", output) { stop(paste0("output = ", output, " is invalid.")) } if (to == "all") { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, + if (single_url == TRUE) { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, + query, output, sep = "/") + } + else { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, output, sep = "/") + } } else { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, to, - output, sep = "/") + if (single_url == TRUE){ + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, + query, to, output, sep = "/") + } + else{ + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, + to, output, sep = "/") + } } qurl <- URLencode(qurl) body <- paste0(strsplit(from, "/")[[1]][1], "=", paste(query, collapse = ",")) - return(list(qurl = qurl, body = body)) + if (single_url == TRUE){ + return(list(qurl = qurl)) + } + else{ + return(list(qurl = qurl, body = body)) + } } diff --git a/man/pc_pugrest.Rd b/man/pc_pugrest.Rd index d2c07ddc..0f7068a1 100644 --- a/man/pc_pugrest.Rd +++ b/man/pc_pugrest.Rd @@ -4,7 +4,7 @@ \alias{pc_pugrest} \title{Search PubChem through highly customizable requests} \usage{ -pc_pugrest(query, domain, from, to = "all", output) +pc_pugrest(query, domain, from, to = "all", output, single_url = FALSE) } \arguments{ \item{query}{character; search term. A comma-separated list of positive From 407f2726b9b2a3cad74a015156b0e94a1bfc4f18 Mon Sep 17 00:00:00 2001 From: stitam Date: Sat, 18 Apr 2020 20:33:28 +0100 Subject: [PATCH 05/11] Remove pc_pugrest, pc_validate, add get_sid --- NAMESPACE | 2 +- R/pubchem.R | 472 ++++++++-------------------------- man/get_sid.Rd | 79 ++++++ man/pc_pugrest.Rd | 151 ----------- tests/testthat/test-pubchem.R | 75 +----- 5 files changed, 202 insertions(+), 577 deletions(-) create mode 100644 man/get_sid.Rd delete mode 100644 man/pc_pugrest.Rd diff --git a/NAMESPACE b/NAMESPACE index 0510f292..36cf3936 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,7 +65,6 @@ export(opsin_query) export(pan_query) export(parse_mol) export(pc_prop) -export(pc_pugrest) export(pc_synonyms) export(ping_cs) export(ping_pan) @@ -90,6 +89,7 @@ importFrom(dplyr,left_join) importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) +importFrom(httr,content) importFrom(httr,http_status) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) diff --git a/R/pubchem.R b/R/pubchem.R index 503ee399..2e099e51 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -114,7 +114,114 @@ get_cid <- function(query, from = "name", first = FALSE, return(out) } - +#' Retrieve PubChem Substance ID (SID) +#' +#' @description Retrieve one or more substance IDs (SIDs) from PubChem based on +#' substance name, registry ID (e.g. CAS RN), or source ID. Alternatively, +#' retrieve SIDs of all substances provided by a PubChem depositor. +#' @importFrom httr POST content +#' @importFrom jsonlite fromJSON +#' @importFrom tibble as_tibble +#' @param query character; search term, one ore more substances or depositors. +#' @param from character; type of input. Valid values are \code{"name"}, +#' \code{}, \code{"sourceid/"} or \code{"sourceall"}. +#' @param match character; functionality not yet implemented. +#' @param verbose logical; should a verbose output be printed on the console? +#' @return Returns a tibble of substance id-s. +#' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID}, +#' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, +#' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, +#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"} will query by +#' CAS RN. +#' @details \code{} is any valid PubChem Data Source ID. +#' @details If \code{from = "sourceall"} the query is one or more valid Pubchem +#' depositor names. Depositor names are not case sensitive, but are sensitive to +#' spaces, and special characters may need to be escaped, such as "&" should be +#' replaced by "\%26". +#' @details Depositor names and Data Source IDs can be found at +#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. +#' @references Wang, Y., J. Xiao, T. O. Suzek, et al. (2009). PubChem: A Public +#' Information System for Analyzing Bioactivities of Small Molecules. Nucleic +#' Acids Research 37: 623–633. +#' @references Kim, S., Thiessen, P. A., Bolton, E. E., & Bryant, S. H. (2015). +#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical +#' information in PubChem. Nucleic acids research, gkv396. +#' @references Kim, Sunghwan, Paul A. Thiessen, Evan E. Bolton, et al. (2016). +#' PubChem Substance and Compound Databases. Nucleic Acids Research 44(D1): +#' D1202–D1213. +#' @references Sunghwan Kim, Paul A Thiessen, Tiejun Cheng, Bo Yu, Evan E Bolton +#' (2018) An update on PUG-REST: RESTful interface for programmatic access to +#' PubChem. Nucleic Acids Research 46(W1): W563–W570. +#' @note Please respect the Terms and Conditions of the National Library of +#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data +#' usage policies of National Center for Biotechnology Information, +#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/}, +#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}. +#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} +#' @examples +#' # might fail if API is not available +#' \donttest{ +#' get_sid("2-(Acetyloxy)benzoic acid", from = "name") +#' get_sid("57-27-2", from = "xref/rn") +#' get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") +#' get_sid("Optopharma Ltd", from = "sourceall") +#' } +get_sid <- function(query, from, match = NULL, verbose = TRUE, ...) { + from <- tolower(from) + if (grepl("^sourceid/", from) == FALSE) { + xref <- paste("xref", c("registryid", "rn", "pubmedid", "mmdbid", + "proteingi", "nucleotidegi", "taxonomyid", "mimid", + "geneid", "probeid", "patentid"), sep = "/") + from <- match.arg(from, choices = c("name", xref, "sourceall")) + } + if (ping_pubchem() != TRUE) { + stop("The web service is currently unavailable.") + } + empty_df <- function(x) { + return(data.frame(query = x, sid = NA_character_, stringsAsFactors = FALSE)) + } + foo <- function(query, from, match, verbose, ...) { + if (is.na(query)) { + return(empty_df(query)) + } + if (verbose == TRUE) { + message(paste0("Querying ", query, ". "), appendLF = FALSE) + } + Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/", + from, query, "sids", "json", sep = "/") + res <- httr::POST(qurl, user_agent("webchem"), timeout(10)) + if (res$status_code != 200) { + if (res$status_code == 202) { + cont <- httr::content(res, type = "text", encoding = "UTF-8") + listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance", + "listkey", listkey, "sids", "json", sep = "/") + Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) + res <- httr::POST(qurl, user_agent("webchem"), timeout(10)) + if (res$status_code != 200) { + if (verbose == TRUE) message(httr::message_for_status(res)) + return(empty_df(query)) + } + } + else{ + if (verbose == TRUE) message(httr::message_for_status(res)) + return(empty_df(query)) + } + } + if (verbose == TRUE) message(httr::message_for_status(res)) + cont <- httr::content(res, type = "text", encoding = "UTF-8") + cont <- jsonlite::fromJSON(cont)$IdentifierList$SID + cont <- data.frame(query = query, + sid = cont, + stringsAsFactors = FALSE) + return(cont) + } + out <- lapply(query, function(x) foo(x, from, match, verbose)) + out <- do.call(rbind, out) + out <- tibble::as_tibble(out) + return(out) +} #' Retrieve compound properties from a pubchem CID #' @@ -319,366 +426,3 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, out <- unlist(out) return(out) } - -#' Search PubChem through highly customizable requests -#' -#' @description This function gives you more control over your communications -#' with the PubChem PUG-REST webservice. This documentation page was created -#' from the original PubChem PUG-REST specification document which can be found -#' at \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. -#' @importFrom httr POST http_status -#' @param query character; search term. A comma-separated list of positive -#' integers (e.g. cid, sid, aid) or identifier strings (source, inchikey, -#' formula). In some cases only a single identifier string (name, smiles, xref; -#' inchi, sdf by POST only). -#' @param domain character; the domain of the search term. Can be one of -#' \code{"compound"}, \code{"substance"}, \code{"assay"}, -#' \code{"sources/[substance, assay]"}, \code{"sourcetable"}, -#' \code{"conformers"}, -#' \code{"annotations/[sourcename/|heading/]"}. See -#' details for more information. -#' @param from character; the type of the search term, refered to as -#' \code{namespace} within the reference. Valid values depend on the domain. -#' See details for more information. -#' @param to character; refered to as \code{operations} within the reference. -#' Tells the service which part of the record to retrieve. Currently, if no -#' operation is specified at all, the default is to retrieve the entire record. -#' See details for more information. -#' @param output character; the format of the output. Can be one of -#' \code{"XML"}, \code{"ASNT"}, \code{"ASNB"}, \code{"JSON"}, -#' \code{"JSONP[?callback=]"}, \code{"SDF"}, \code{"CSV"}, -#' \code{"PNG"}, \code{"TXT"}. Note that not all formats are applicable to the -#' results of all operations; one cannot, for example, retrieve a whole compound -#' record as CSV or a property table as SDF. TXT output is only available in a -#' restricted set of cases where all the information is the same – for example, -#' synonyms for a single CID where there is one synonym per line. -#' @return a html POST response. -#' @details Valid \code{from} values for \code{domain = "compound"} are: -#' \itemize{ -#' \item \code{"cid"}, \code{"name"}, \code{"smiles"}, \code{"inchi"}, -#' \code{"sdf"}, \code{"inchikey"}, \code{"formula"}, \code{}, -#' \code{}, \code{"listkey"}, \code{}. -#' \item \code{} is assembled as "\{\code{substructure}, -#' \code{superstructure}, \code{similarity}, \code{identity}\}/\{\code{smiles}, -#' \code{inchi}, \code{sdf}, \code{cid}\}", e.g. -#' \code{from = "substructure/smiles"}. -#' \item \code{} is assembled as "\code{xref}/\{\code{RegistryID}, -#' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, -#' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, -#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"}. is -#' not case sensitive. -#' \item \code{} is either \code{"fastformula"} or is assembled as -#' "\{\code{fastidentity}, \code{fastsimilarity_2d}, \code{fastsimilarity_3d}, -#' \code{fastsubstructure}, \code{fastsuperstructure}\}/\{\code{smiles}, -#' \code{smarts}, \code{inchi}, \code{sdf} or \code{cid}\}", e.g. -#' \code{from = "fastidentity/smiles"}. -#' } -#' @details Valid \code{from} values for \code{domain = "substance"} are: -#' \itemize{ -#' \item \code{"sid"}, \code{"sourceid/"}, -#' \code{"sourceall/"}, \code{name}, \code{} or -#' \code{listkey}. -#' \item \code{} is any valid PubChem depositor name. Valid names -#' can be found at \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. Special -#' characters may need to be escaped, such as "&" should be replaced by "\%26". -#' } -#' @details Valid \code{from} values for \code{domain = "assay"} are: -#' \itemize{ -#' \item \code{"aid"}, \code{"listkey"}, \code{"type/"}, -#' \code{"sourceall/"}, \code{"target/"} or -#' \code{"activity/"}. -#' \item \code{} can be \code{all}, \code{confirmatory}, -#' \code{doseresponse}, \code{onhold}, \code{panel}, \code{rnai}, -#' \code{screening}, \code{summary}, \code{cellbased}, \code{biochemical}, -#' \code{invivo}, \code{invitro} or \code{activeconcentrationspecified}. -#' \item \code{} can be \code{gi}, \code{proteinname}, -#' \code{geneid}, \code{genesymbol} or \code{accession}. -#' } -#' @details \code{} for domain can be \code{"sources/[substance, -#' assay]"}, \code{"sourcetable"}, \code{"conformers"}, \code{"annotations/ -#' [sourcename/, heading/]"}. -#' @details Valid \code{to} values for \code{domain = "compound"} are: -#' \itemize{ -#' \item \code{"record"}, \code{}, \code{"synonyms"}, -#' \code{"sids"}, \code{"cids"}, \code{"aids"}, \code{"assaysummary"}, -#' \code{"classification"}, \code{}, \code{"description"} or -#' \code{"conformers"}. -#' \item \code{} is assembled as \code{"property/ -#' [comma-separated list of ]"}. -#' \item Valid \code{} are \code{"MolecularFormula"}, -#' \code{"MolecularWeight"}, \code{"CanonicalSMILES"}, \code{"IsomericSMILES"}, -#' \code{"InChI"}, \code{"InChIKey"}, \code{"IUPACname"}, \code{"XlogP"}, -#' \code{"ExactMass"}, \code{"MonoisotopicMass"}, \code{"TPSA"}, -#' \code{"Complexity"}, \code{"Charge"}, \code{"HBondDonorCount"}, -#' \code{"HBondAcceptorCount"}, \code{"RotatableBondCount"}, -#' \code{"HeavyAtomCount"}, \code{"IsotopeAtomCount"}, \code{"AtomStereoCount"}, -#' \code{"DefinedAtomStereoCount"}, \code{"UndefinedAtomStereoCount"}, -#' \code{"BondStereoCount"}, \code{"DefinedBondStereoCount"}, -#' \code{"UndefinedBondStereoCount"}, \code{"CovalentUnitCount"}, -#' \code{"Volume3D"}, \code{"XStericQuadrupole3D"}, -#' \code{"YStericQuadrupole3D"}, \code{"ZStericQuadrupole3D"}, -#' \code{"FeatureCount3D"}, \code{"FeatureAcceptorCount3D"}, -#' \code{"FeatureDonorCount3D"}, \code{"FeatureAnionCount3D"}, -#' \code{"FeatureCationCount3D"}, \code{"FeatureRingCount3D"}, -#' \code{"FeatureHydrophobeCount3D"}, \code{"ConformerModelRMSD3D"}, -#' \code{"EffectiveRotorCount3D"}, \code{"ConformerCount3D"}, -#' \code{"Fingerprint2D"}. are not case sensitive. -#' } -#' @details Valid \code{to} values for \code{domain = "substance"} are: -#' \itemize{ -#' \item \code{"record"}, \code{"synonyms"}, \code{"sids"}, \code{"cids"}, -#' \code{"aids"}, \code{"assaysummary"}, \code{"classification"}, -#' \code{}, \code{"description"}. -#' \item \code{} is assembled as \code{"xrefs/ -#' [comma-separated list of xrefs tags]"}. -#' } -#' @details Valid \code{to} values for \code{domain = "assay"} are: -#' \itemize{ -#' \item \code{"record"}, \code{"concise"}, \code{"aids"}, \code{"sids"}, -#' \code{"cids"}, \code{"description"}, \code{"targets/"}, -#' \code{}, \code{"summary"} or \code{"classification"}. -#' \item \code{} can be \code{"ProteinGI"}, \code{"ProteinName"}, -#' \code{"GeneID"} or \code{"GeneSymbol"}. -#' \item \code{} is assembled as \code{"doseresponse/sid"}. -#' } -#' @references For more information, visit -#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} -#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} -#' @export -pc_pugrest <- function(query, - domain, - from, - to = "all", - output, - single_url = FALSE) { - request <- pc_validate(query, domain, from, to, output, single_url) - Sys.sleep(rgamma(1, shape = 15, scale = 1/10)) - if (single_url == TRUE){ - response <- try(POST(request$qurl), silent = TRUE) - } - else { - response <- try(POST(request$qurl, body = request$body), silent = TRUE) - } - if (response$status_code %in% c(200,202) == FALSE) { - stop(httr::http_status(response)$message) - } - return(response) -} - -#' Validate a PubChem PUG-REST API query and construct the query URL and body -#' -#' @description This function validates a PubChem PUG-REST API query according -#' to teh PubChem PUG-REST specification document, which can be found at -#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. If a query argument is -#' invalid the function prints an informative error message. If all query -#' arguments are valid, the function returns the query url and body that can -#' then be used in http requests. See \code{\link{pc_pugrest}} for lists of -#' valid values for each argument. -#' @param query character; -#' @param domain character; \code{"compound"}, \code{"substance"}, etc. -#' @param from character; mentioned as \code{"namespace"} in the PubChem -#' PUG-REST specification document. -#' @param to character; mentioned as \code{"operations"} in the PubChem -#' PUG-REST specification document. -#' @param output character; -#' @return a list of two elements, \code{url} and \code{body}. -#' @note The function cannot tell if a source id or a depositor -#' name is valid. If the entry starts with \code{"sourceid/"} or -#' \code{"sourceall/"} it will acept the entry. -#' @note When validating an entry for \code{domain = "assay"}, -#' \code{type = "from"}, the function cannot tell if a depositor name or an -#' activity is valid. If the entry starts with \code{"sourceall/"} or -#' \code{"activity/"} it will acept the entry. -#' @note Some source names contain the "/" character, which which is -#' incompatible with the URL syntax, e.g. EU REGULATION (EC) No 1272/2008. -#' The functions attempts to convert "/" to "." as recommended by the -#' specification document. -#' @note It is possible to query multiple properties or multiple xrefs in the -#' same query. When validating an entry for \code{domain = "compound"}, -#' \code{type = "to"} if the entry starts with \code{property/} or \code{xrefs/} -#' the function will also check the comma separated values after the forward -#' slash and evaluate whether the entry can be whitelisted. -#' @references \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} -#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} -#' @seealso \code{\link{pc_pugrest}} -#' @noRd -pc_validate <- function(query, - domain, - from, - to = "all", - output, - single_url = FALSE) { - domain <- tolower(gsub(" *", "", domain)) - from <- tolower(gsub(" *", "", from)) - to <- tolower(gsub(" *", "", to)) - output <- tolower(gsub(" *", "", output)) - xref <- paste( - "xref", - c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", - "taxonomyid", "mimid", "geneid", "probeid", "patentid"), - sep = "/" - ) - xrefs <- c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", - "nucleotidegi", "taxonomyid", "mimid", "geneid", "probeid", - "patentid", "dburl", "sburl", "sourcename", "sourcecategory") - #from_choices for domain = "compound" - if(domain == "compound") { - structure_search <- expand.grid( - c("substructure", "superstructure", "similarity", "identity"), - c("smiles", "inchi", "sdf", "cid") - ) - structure_search <- with(structure_search, paste(Var1, Var2, sep = "/")) - fast_search <- expand.grid( - c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d", - "fastsubstructure", "fastsuperstructure"), - c("smiles", "smarts", "inchi", "sdf", "cid") - ) - fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")), - "fastformula") - from_choices <-c("cid", "name", "smiles", "inchi", "sdf", "inchikey", - "formula", structure_search, xref, "listkey", fast_search) - } - #from_choices for domain = "substance" - if (domain == "substance") { - #from = "sourceid/IBM", from = "sourceid/EU REGULATION (EC) No 1272/2008" - pre <- strsplit(from, "/")[[1]][1] - post <- paste(strsplit(from, "/")[[1]][-1], collapse =".") - if (pre == "sourceid") { - from <- paste(pre, post, sep = "/") - from_choices <- from - } - else { - from_choices <- c("sid", "name", "sourceall", xref, "listkey") - } - } - # from_choices for domain = "assay" - if (domain == "assay") { - from_choices <- c("aid", "listkey", "type", "sourceall", "target", - "activity") - } - # query_choices - if ("sourceall" %in% from_choices) { - # how to validate against all valid source names? - query <- gsub("/", ".", query) #ensure URL syntax - query_choice <- query - } - if ("type" %in% from_choices){ - query_choices <- c("all", "confirmatory", "doseresponse", "onhold", - "panel", "rnai", "screening", "summary", "cellbased", - "biochemical", "invivo", "invitro", - "activeconcentrationspecified") - } - if ("target" %in% from_choices){ - query_choices <- c("gi", "proteinname", "geneid", "genesymbol", "accession") - } - if (mean(c("sourceall", "type", "target") %in% - from_choices) == 0) { - query_choices <- query - } - #to_choices for domain = "compound" - if (domain == "compound") { - properties <- c("molecularformula", "molecularweight", "canonicalsmiles", - "isomericsmiles", "inchi", "inchikey", "iupacname", - "xlogp", "exactmass", "monoisotopicmass", "tpsa", - "complexity", "charge", "hbonddonorcount", - "hbondacceptorcount", "rotatablebondcount", "heavyatomcount", - "isotopeatomcount", "atomstereocount", - "definedatomstereocount", "undefinedatomstereocount", - "bondstereocount", "definedbondstereocount", - "undefinedbondstereocount", "covalentunitcount", "volume3d", - "xstericquadrupole3d", "ystericquadrupole3d", - "zstericquadrupole3d", "featurecount3d", - "featureacceptorcount3d", "featuredonorcount3d", - "featureanioncount3d", "featurecationcount3d", - "featureringcount3d", "featurehydrophobecount3d", - "conformermodelrmsd3d", "effectiverotorcount3d", - "conformercount3d", "fingerprint2d") - pre <- strsplit(to, "/")[[1]][1] - post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") - if(pre == "property") { - props <- strsplit(post, ",")[[1]] - if (mean(props %in% properties) == 1) to_choices <- to - } - if (pre == "xrefs") { - refs <- strsplit(post, ",")[[1]] - if (mean(refs %in% xrefs) == 1) to_choices <- to - } - if (sum(pre %in% c("property", "xrefs")) == 0) { - to_choices <- c("record", "synonyms", "sids", "cids", "aids", - "assaysummary", "classification", "description", - "conformers") - } - } - #to_choices for domain = "substance" - if (domain == "substance") { - pre <- strsplit(to, "/")[[1]][1] - post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") - if (pre == "xrefs") { - refs <- strsplit(post, ",")[[1]] - if (mean(refs %in% xrefs) == 1) to_choices <- to - } - else { - to_choices <- c("record", "synonyms", "sids", "cids", "aids", - "assaysummary", "classification", "description") - } - } - if (domain == "assay") { - pre <- strsplit(to, "/")[[1]][1] - post <- paste(strsplit(to, "/")[[1]][-1], collapse =",") - if (pre == "targets") { - tgts <- c("proteingi", "proteinname", "geneid", "genesymbol") - targets <- strsplit(post, ",")[[1]] - if (mean(targets %in% tgts) == 1) to_choices <- to - } - else { - to_choices <- c("record", "concise", "aids", "sids", "cids", - "description", "doseresponse/sid", "summary", - "classification") - } - } - if (strsplit(output, "?")[[1]][1] == "jsonp") output_choices <- output - else output_choices <- c("xml", "asnt", "asnb", "json", "sdf", "csv", - "png", "txt") - for (i in query) { - if (i %in% query_choices == FALSE) { - stop(paste0("query = ", i, " is invalid.")) - } - } - if (from %in% from_choices == FALSE) { - stop(paste0("from = ", from, " is invalid.")) - } - if (to %in% to_choices == FALSE & to != "all") { - stop(paste0("to = ", to, " is invalid.")) - } - if (output %in% output_choices == FALSE) { - stop(paste0("output = ", output, " is invalid.")) - } - if (to == "all") { - if (single_url == TRUE) { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, - query, output, sep = "/") - } - else { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, - output, sep = "/") - } - } - else { - if (single_url == TRUE){ - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, - query, to, output, sep = "/") - } - else{ - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, - to, output, sep = "/") - } - } - qurl <- URLencode(qurl) - body <- paste0(strsplit(from, "/")[[1]][1], "=", paste(query, collapse = ",")) - if (single_url == TRUE){ - return(list(qurl = qurl)) - } - else{ - return(list(qurl = qurl, body = body)) - } -} diff --git a/man/get_sid.Rd b/man/get_sid.Rd new file mode 100644 index 00000000..675ea376 --- /dev/null +++ b/man/get_sid.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pubchem.R +\name{get_sid} +\alias{get_sid} +\title{Retrieve PubChem Substance ID (SID)} +\usage{ +get_sid(query, from, match = NULL, verbose = TRUE, ...) +} +\arguments{ +\item{query}{character; search term, one ore more substances or depositors.} + +\item{from}{character; type of input. Valid values are \code{"name"}, +\code{}, \code{"sourceid/"} or \code{"sourceall"}.} + +\item{match}{character; functionality not yet implemented.} + +\item{verbose}{logical; should a verbose output be printed on the console?} +} +\value{ +Returns a tibble of substance id-s. +} +\description{ +Retrieve one or more substance IDs (SIDs) from PubChem based on +substance name, registry ID (e.g. CAS RN), or source ID. Alternatively, +retrieve SIDs of all substances provided by a PubChem depositor. +} +\details{ +\code{} is assembled as "\code{xref}/\{\code{RegistryID}, +\code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, +\code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, +\code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"} will query by +CAS RN. + +\code{} is any valid PubChem Data Source ID. + +If \code{from = "sourceall"} the query is one or more valid Pubchem +depositor names. Depositor names are not case sensitive, but are sensitive to +spaces, and special characters may need to be escaped, such as "&" should be +replaced by "\%26". + +Depositor names and Data Source IDs can be found at +\url{https://pubchem.ncbi.nlm.nih.gov/sources/}. +} +\note{ +Please respect the Terms and Conditions of the National Library of +Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data +usage policies of National Center for Biotechnology Information, +\url{https://www.ncbi.nlm.nih.gov/home/about/policies/}, +\url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}. +} +\examples{ +# might fail if API is not available +\donttest{ +get_sid("2-(Acetyloxy)benzoic acid", from = "name") +get_sid("57-27-2", from = "xref/rn") +get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") +get_sid("Optopharma Ltd", from = "sourceall") +} +} +\references{ +Wang, Y., J. Xiao, T. O. Suzek, et al. (2009). PubChem: A Public +Information System for Analyzing Bioactivities of Small Molecules. Nucleic +Acids Research 37: 623–633. + +Kim, S., Thiessen, P. A., Bolton, E. E., & Bryant, S. H. (2015). +PUG-SOAP and PUG-REST: web services for programmatic access to chemical +information in PubChem. Nucleic acids research, gkv396. + +Kim, Sunghwan, Paul A. Thiessen, Evan E. Bolton, et al. (2016). +PubChem Substance and Compound Databases. Nucleic Acids Research 44(D1): +D1202–D1213. + +Sunghwan Kim, Paul A Thiessen, Tiejun Cheng, Bo Yu, Evan E Bolton +(2018) An update on PUG-REST: RESTful interface for programmatic access to +PubChem. Nucleic Acids Research 46(W1): W563–W570. +} +\author{ +Tamás Stirling, \email{stirling.tamas@gmail.com} +} diff --git a/man/pc_pugrest.Rd b/man/pc_pugrest.Rd deleted file mode 100644 index 0f7068a1..00000000 --- a/man/pc_pugrest.Rd +++ /dev/null @@ -1,151 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pubchem.R -\name{pc_pugrest} -\alias{pc_pugrest} -\title{Search PubChem through highly customizable requests} -\usage{ -pc_pugrest(query, domain, from, to = "all", output, single_url = FALSE) -} -\arguments{ -\item{query}{character; search term. A comma-separated list of positive -integers (e.g. cid, sid, aid) or identifier strings (source, inchikey, -formula). In some cases only a single identifier string (name, smiles, xref; -inchi, sdf by POST only).} - -\item{domain}{character; the domain of the search term. Can be one of -\code{"compound"}, \code{"substance"}, \code{"assay"}, -\code{"sources/[substance, assay]"}, \code{"sourcetable"}, -\code{"conformers"}, -\code{"annotations/[sourcename/|heading/]"}. See -details for more information.} - -\item{from}{character; the type of the search term, refered to as -\code{namespace} within the reference. Valid values depend on the domain. -See details for more information.} - -\item{to}{character; refered to as \code{operations} within the reference. -Tells the service which part of the record to retrieve. Currently, if no -operation is specified at all, the default is to retrieve the entire record. -See details for more information.} - -\item{output}{character; the format of the output. Can be one of -\code{"XML"}, \code{"ASNT"}, \code{"ASNB"}, \code{"JSON"}, -\code{"JSONP[?callback=]"}, \code{"SDF"}, \code{"CSV"}, -\code{"PNG"}, \code{"TXT"}. Note that not all formats are applicable to the -results of all operations; one cannot, for example, retrieve a whole compound -record as CSV or a property table as SDF. TXT output is only available in a -restricted set of cases where all the information is the same – for example, -synonyms for a single CID where there is one synonym per line.} -} -\value{ -a html POST response. -} -\description{ -This function gives you more control over your communications -with the PubChem PUG-REST webservice. This documentation page was created -from the original PubChem PUG-REST specification document which can be found -at \url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest}. -} -\details{ -Valid \code{from} values for \code{domain = "compound"} are: -\itemize{ -\item \code{"cid"}, \code{"name"}, \code{"smiles"}, \code{"inchi"}, -\code{"sdf"}, \code{"inchikey"}, \code{"formula"}, \code{}, -\code{}, \code{"listkey"}, \code{}. -\item \code{} is assembled as "\{\code{substructure}, -\code{superstructure}, \code{similarity}, \code{identity}\}/\{\code{smiles}, -\code{inchi}, \code{sdf}, \code{cid}\}", e.g. -\code{from = "substructure/smiles"}. -\item \code{} is assembled as "\code{xref}/\{\code{RegistryID}, -\code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, -\code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, -\code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"}. is -not case sensitive. -\item \code{} is either \code{"fastformula"} or is assembled as -"\{\code{fastidentity}, \code{fastsimilarity_2d}, \code{fastsimilarity_3d}, -\code{fastsubstructure}, \code{fastsuperstructure}\}/\{\code{smiles}, -\code{smarts}, \code{inchi}, \code{sdf} or \code{cid}\}", e.g. -\code{from = "fastidentity/smiles"}. -} - -Valid \code{from} values for \code{domain = "substance"} are: -\itemize{ -\item \code{"sid"}, \code{"sourceid/"}, -\code{"sourceall/"}, \code{name}, \code{} or -\code{listkey}. -\item \code{} is any valid PubChem depositor name. Valid names -can be found at \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. Special -characters may need to be escaped, such as "&" should be replaced by "\%26". -} - -Valid \code{from} values for \code{domain = "assay"} are: -\itemize{ -\item \code{"aid"}, \code{"listkey"}, \code{"type/"}, -\code{"sourceall/"}, \code{"target/"} or -\code{"activity/"}. -\item \code{} can be \code{all}, \code{confirmatory}, -\code{doseresponse}, \code{onhold}, \code{panel}, \code{rnai}, -\code{screening}, \code{summary}, \code{cellbased}, \code{biochemical}, -\code{invivo}, \code{invitro} or \code{activeconcentrationspecified}. -\item \code{} can be \code{gi}, \code{proteinname}, -\code{geneid}, \code{genesymbol} or \code{accession}. -} - -\code{} for domain can be \code{"sources/[substance, -assay]"}, \code{"sourcetable"}, \code{"conformers"}, \code{"annotations/ -[sourcename/, heading/]"}. - -Valid \code{to} values for \code{domain = "compound"} are: -\itemize{ -\item \code{"record"}, \code{}, \code{"synonyms"}, -\code{"sids"}, \code{"cids"}, \code{"aids"}, \code{"assaysummary"}, -\code{"classification"}, \code{}, \code{"description"} or -\code{"conformers"}. -\item \code{} is assembled as \code{"property/ -[comma-separated list of ]"}. -\item Valid \code{} are \code{"MolecularFormula"}, -\code{"MolecularWeight"}, \code{"CanonicalSMILES"}, \code{"IsomericSMILES"}, -\code{"InChI"}, \code{"InChIKey"}, \code{"IUPACname"}, \code{"XlogP"}, -\code{"ExactMass"}, \code{"MonoisotopicMass"}, \code{"TPSA"}, -\code{"Complexity"}, \code{"Charge"}, \code{"HBondDonorCount"}, -\code{"HBondAcceptorCount"}, \code{"RotatableBondCount"}, -\code{"HeavyAtomCount"}, \code{"IsotopeAtomCount"}, \code{"AtomStereoCount"}, -\code{"DefinedAtomStereoCount"}, \code{"UndefinedAtomStereoCount"}, -\code{"BondStereoCount"}, \code{"DefinedBondStereoCount"}, -\code{"UndefinedBondStereoCount"}, \code{"CovalentUnitCount"}, -\code{"Volume3D"}, \code{"XStericQuadrupole3D"}, -\code{"YStericQuadrupole3D"}, \code{"ZStericQuadrupole3D"}, -\code{"FeatureCount3D"}, \code{"FeatureAcceptorCount3D"}, -\code{"FeatureDonorCount3D"}, \code{"FeatureAnionCount3D"}, -\code{"FeatureCationCount3D"}, \code{"FeatureRingCount3D"}, -\code{"FeatureHydrophobeCount3D"}, \code{"ConformerModelRMSD3D"}, -\code{"EffectiveRotorCount3D"}, \code{"ConformerCount3D"}, -\code{"Fingerprint2D"}. are not case sensitive. -} - -Valid \code{to} values for \code{domain = "substance"} are: -\itemize{ -\item \code{"record"}, \code{"synonyms"}, \code{"sids"}, \code{"cids"}, -\code{"aids"}, \code{"assaysummary"}, \code{"classification"}, -\code{}, \code{"description"}. -\item \code{} is assembled as \code{"xrefs/ -[comma-separated list of xrefs tags]"}. -} - -Valid \code{to} values for \code{domain = "assay"} are: -\itemize{ -\item \code{"record"}, \code{"concise"}, \code{"aids"}, \code{"sids"}, -\code{"cids"}, \code{"description"}, \code{"targets/"}, -\code{}, \code{"summary"} or \code{"classification"}. -\item \code{} can be \code{"ProteinGI"}, \code{"ProteinName"}, -\code{"GeneID"} or \code{"GeneSymbol"}. -\item \code{} is assembled as \code{"doseresponse/sid"}. -} -} -\references{ -For more information, visit -\url{https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest} -} -\author{ -Tamás Stirling, \email{stirling.tamas@gmail.com} -} diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index dffa100a..cd94ca8e 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -14,6 +14,20 @@ test_that("get_cid()", { 12345) }) +test_that("get_sid()", { + skip_on_cran() + + a <- get_sid("2-(Acetyloxy)benzoic acid", from = "name") + a2 <- get_sid(c("balloon", NA, "aspirin"), from = "name") + b <- get_sid("57-27-2", from = "xref/rn") + c <- get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") + + expect_true("tbl" %in% class(a)) + expect_true(87798 %in% a$sid) + expect_equal(is.na(a2[1:2, 2], c(TRUE, TRUE))) + expect_true(4681 %in% b$sid) + expect_equal(c$sid, c(385515341, 385515340)) +}) test_that("pc_prop", { skip_on_cran() @@ -47,64 +61,3 @@ test_that("cid integration tests", { expect_true(is.na(pc_prop(NA, properties = "CanonicalSmiles", verbose = FALSE))) }) - -test_that("pc_validate", { - #name/names to cid - - a <- pc_validate(2244, domain = "compound", from = "cid", - to = "property/MolecularFormula", output = "JSON") - expect_equal(strsplit(a$qurl, "pug/")[[1]][2], - "compound/cid/property/molecularformula/json") - expect_equal(a$body, "cid=2244") - b <- pc_validate(c(1234,2244), domain = "compound", from = "cid", - to = "property/MolecularFormula", output = "JSON") - expect_equal(strsplit(b$qurl, "pug/")[[1]][2], - "compound/cid/property/molecularformula/json") - expect_equal(b$body, "cid=1234,2244") - c <- pc_validate("5F1CA2B314D35F28C7F94168627B29E3", domain = "substance", - from = "sourceid/ibm", output = "ASNT") - expect_equal(strsplit(c$qurl, "pug/")[[1]][2], - "substance/sourceid/ibm/asnt") - expect_equal(c$body, "sourceid=5F1CA2B314D35F28C7F94168627B29E3") - d <- pc_validate(747285, domain = "substance", - from = "sourceid/dtp/nci", output = "sdf") - expect_equal(strsplit(d$qurl, "pug/")[[1]][2], - "substance/sourceid/dtp.nci/sdf") - expect_equal(d$body, "sourceid=747285") - e <- pc_validate(747285, domain = "substance", - from = "sourceid/dtp/nci", output = "png") - expect_equal(strsplit(e$qurl, "pug/")[[1]][2], - "substance/sourceid/dtp.nci/png") - expect_equal(e$body, "sourceid=747285") - f <- pc_validate(2244, domain = "compound", - from = "cid", output = "sdf") - expect_equal(strsplit(f$qurl, "pug/")[[1]][2], - "compound/cid/sdf") - expect_equal(f$body, "cid=2244") - g <- pc_validate("aspirin", domain = "compound", - from = "name", output = "json") - expect_equal(strsplit(g$qurl, "pug/")[[1]][2], - "compound/name/json") - expect_equal(g$body, "name=aspirin") - h <- pc_validate(c(1,2,3,4,5), domain = "compound", from = "cid", - to = "property/molecularformula,molecularweight", - output = "xml") - expect_equal(strsplit(h$qurl, "pug/")[[1]][2], - "compound/cid/property/molecularformula,molecularweight/xml") - expect_equal(h$body, "cid=1,2,3,4,5") - i <- pc_validate("acetic acid", domain = "compound", from = "name", - to = "cids", - output = "json") - expect_equal(strsplit(i$qurl, "pug/")[[1]][2], - "compound/name/cids/json") - expect_equal(i$body, "name=acetic acid") - - - -}) - -test_that("pc_pugrest", { - #pc_pugrest(176, "compound", "cid", "synonyms", "JSON") - #pc_pugrest("EU REGULATION (EC) No 1272/2008", substance, ) - #pc_pugrest("DC Chemicals", "substance", "sourceall", "sids", "JSON") -}) From 575e59faf40a47466b70300eaf3692b7b3f479bc Mon Sep 17 00:00:00 2001 From: stitam Date: Sun, 19 Apr 2020 09:43:35 +0100 Subject: [PATCH 06/11] Add ... argument to get_sid documentation --- R/pubchem.R | 1 + man/get_sid.Rd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/pubchem.R b/R/pubchem.R index 2e099e51..bffcd4a0 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -128,6 +128,7 @@ get_cid <- function(query, from = "name", first = FALSE, #' @param match character; functionality not yet implemented. #' @param verbose logical; should a verbose output be printed on the console? #' @return Returns a tibble of substance id-s. +#' @param ... currently not used. #' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID}, #' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, #' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, diff --git a/man/get_sid.Rd b/man/get_sid.Rd index 675ea376..7189f742 100644 --- a/man/get_sid.Rd +++ b/man/get_sid.Rd @@ -15,6 +15,8 @@ get_sid(query, from, match = NULL, verbose = TRUE, ...) \item{match}{character; functionality not yet implemented.} \item{verbose}{logical; should a verbose output be printed on the console?} + +\item{...}{currently not used.} } \value{ Returns a tibble of substance id-s. From 30bcfa7b77c889d6e5c7c23417919e54faf6a9f9 Mon Sep 17 00:00:00 2001 From: stitam Date: Wed, 29 Apr 2020 11:01:45 +0100 Subject: [PATCH 07/11] Update get_cid() to handle complex queries --- NEWS | 5 +- R/pubchem.R | 245 +++++++++++++++++++--------------- man/get_cid.Rd | 42 ++++-- man/get_sid.Rd | 35 +++-- tests/testthat/test-pubchem.R | 5 + 5 files changed, 203 insertions(+), 129 deletions(-) diff --git a/NEWS b/NEWS index 30fd6d56..5f16f686 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,11 @@ -webchem 0.5.0.9001 +webchem 0.5.0.9006 ====================== NEW FEATURES +* get_cid() now can search by registry IDs (e.g. CAS RN), and can handle more complex requests like searching for similar compounds. +* get_sid() is a new function that can retrieve substance IDs from PubChem based on name, registry ID (e.g. CAS RN), or source ID. * get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar] - * nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"` MINOR IMPROVEMENTS diff --git a/R/pubchem.R b/R/pubchem.R index f4c84762..601342ec 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -1,19 +1,34 @@ -#' Retrieve Pubchem Id (CID) +#' Retrieve Pubchem Compound ID (CID) #' -#' Return CompoundID (CID) for a search query using PUG-REST, -#' see \url{https://pubchem.ncbi.nlm.nih.gov/}. -#' @param query character; search term. +#' Retrieve compound IDs (CIDs) from PubChem. +#' @param query character; search term, one or more compounds. #' @param from character; type of input, can be one of "name" (default), "cid", -#' "sid", "aid", "smiles", "inchi", "inchikey" -#' @param match character; How should multiple hits be handled?, "all" all matches are returned, "best" the best matching is returned, "ask" enters an interactive mode and the user is asked for input, "na" returns NA if multiple hits are found. -#' @param search_substances logical; If TRUE also searches PubChem SIDs +#' "smiles", "inchi", "sdf", "inchikey", "formula", , , +#' . See details for more information. +#' @param match character; How should multiple hits be handled?, "all" all +#' matches are returned, "best" the best matching is returned, "ask" enters an +#' interactive mode and the user is asked for input, "na" returns NA if multiple +#' hits are found. #' @param verbose logical; should a verbose output be printed on the console? #' @param arg character; optinal arguments like "name_type=word" to match #' individual words. #' @param first deprecated. Use `match` instead. #' @param ... currently unused. #' @return a tibble. -#' +#' @details is assembled as "{\code{substructure} | +#' \code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} +#' | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +#' \code{from = "substructure/smiles"}. +#' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID} | +#' \code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, +#' \code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +#' \code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query +#' by CAS RN. +#' @details is either \code{fastformula} or it is assembled as +#' "{\code{fastidentity} | \code{fastsimilarity_2d} | \code{fastsimilarity_3d} | +#' \code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | +#' \code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +#' \code{from = "fastidentity/smiles"}. #' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public #' Information System for #' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37: @@ -45,6 +60,10 @@ #' get_cid("Triclosan", arg = "name_type=word") #' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") #' get_cid("CCCC", from = "smiles") +#' get_cid("C26H52NO6P", from = "formula") +#' get_cid("56-40-6", from = "xref/rn") +#' get_cid(5564, from = "similarity/cid") +#' get_cid("CCO", from = "similarity/smiles") #' #' # multiple inputs #' comp <- c("Triclosan", "Aspirin") @@ -53,18 +72,12 @@ #' } get_cid <- function(query, - from = c("name", "cid", "sid", "aid", "smiles", "inchi", "inchikey"), + from = "name", match = c("all", "first", "ask", "na"), verbose = TRUE, - search_substances = FALSE, arg = NULL, first = NULL, ...) { - - # from can be cid | name | smiles | inchi | sdf | inchikey | formula - # query <- c("Aspirin") - # from = "name" - #deprecate `first` if (!is.null(first) && first == TRUE) { message("`first = TRUE` is deprecated. Use `match = 'first'` instead") @@ -73,89 +86,106 @@ get_cid <- message("`first = FALSE` is deprecated. Use `match = 'all'` instead") match <- "all" } - - from <- match.arg(from) + #input validation + xref <- paste( + "xref", + c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", + "taxonomyid", "mimid", "geneid", "probeid", "patentid"), + sep = "/" + ) + structure_search <- expand.grid( + c("substructure", "superstructure", "similarity", "identity"), + c("smiles", "inchi", "sdf", "cid") + ) + structure_search <- with(structure_search, paste(Var1, Var2, sep = "/")) + fast_search <- expand.grid( + c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d", + "fastsubstructure", "fastsuperstructure"), + c("smiles", "smarts", "inchi", "sdf", "cid") + ) + fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")), + "fastformula") + from_choices <-c("cid", "name", "smiles", "inchi", "sdf", "inchikey", + "formula", structure_search, xref, fast_search) + from <- tolower(from) + from <- match.arg(from, choices = from_choices) match <- match.arg(match) - - foo <- function(query, from, match, scope = "compound", - verbose, arg, ...) { - if (is.na(query)) - return(NA) - prolog <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug" - input <- paste0("/", scope, "/", from) - output <- "/cids/JSON" - if (!is.null(arg)) - arg <- paste0("?", arg) - qurl <- paste0(prolog, input, output, arg) - if (verbose) - message(qurl) - Sys.sleep(rgamma(1, shape = 15, scale = 1/10)) - cont <- try( - content( - POST(qurl, - body = paste0(from, "=", query)), - type = "text", encoding = "UTF-8"), - silent = TRUE - ) - if (inherits(cont, "try-error")) { - warning("Problem with web service encountered... Returning NA.") - return(NA) + foo <- function(query, from, match, verbose, arg, ...) { + if (is.na(query)) return(NA) + if (from %in% structure_search) { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", + from, query, "json", sep = "/") } - cont <- jsonlite::fromJSON(cont) - if (names(cont) == "Fault") { - warning(cont$Fault$Details, ". Returning NA.") - return(NA) + else { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", + from, query, "cids", "json", sep = "/") } - if (scope == "substance") { - cont <- cont$InformationList$Information$CID + if (!is.null(arg)) qurl <- paste0(qurl, "?", arg) + if (verbose) { + message(paste0("Querying ", query, ". "), appendLF = FALSE) + } + Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) + res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) + if (res$status_code != 200) { + if (res$status_code == 202) { + cont <- httr::content(res, type = "text", encoding = "UTF-8") + listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", + "listkey", listkey, "cids", "json", sep = "/") + while (res$status_code == 202) { + Sys.sleep(5 + rgamma(1, shape = 15, scale = 1 / 10)) + res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) + } + if (res$status_code != 200) { + if (verbose) message(httr::message_for_status(res)) + return(NA) + } + } + else{ + if (verbose) message(httr::message_for_status(res)) + return(NA) + } } + if (verbose) message(httr::message_for_status(res)) + cont <- httr::content(res, type = "text", encoding = "UTF-8") + cont <- jsonlite::fromJSON(cont)$IdentifierList$CID out <- unique(unlist(cont)) - out <- matcher(x = out, match = match, verbose = verbose) + out <- matcher(x = out, query = query, match = match, verbose = verbose) out <- as.character(out) names(out) <- NULL return(out) + return(cont) } - - out <- map(query, + out <- map(query, ~foo(query = .x, from = from, match = match, verbose = verbose, arg = arg)) - out <- setNames(out, query) - - if (search_substances) { - out2 <- map(query, - ~foo(query = .x, from = from, match = match, scope = "substance", - verbose = verbose, arg = arg)) - out2 <- setNames(out2, query) - - out <- map2(out, out2, c) - out <- map(out, unique) - } - - out <- + out <- setNames(out, query) + out <- lapply(out, enframe, name = NULL, value = "cid") %>% bind_rows(.id = "query") - return(out) + return(out) } #' Retrieve PubChem Substance ID (SID) #' -#' @description Retrieve one or more substance IDs (SIDs) from PubChem based on -#' substance name, registry ID (e.g. CAS RN), or source ID. Alternatively, -#' retrieve SIDs of all substances provided by a PubChem depositor. -#' @importFrom httr POST content -#' @importFrom jsonlite fromJSON -#' @importFrom tibble as_tibble +#' @description Retrieve substance IDs (SIDs) from PubChem based on substance +#' name, registry ID (e.g. CAS RN), or source ID. Alternatively, retrieve SIDs +#' of all substances provided by a PubChem depositor. #' @param query character; search term, one ore more substances or depositors. #' @param from character; type of input. Valid values are \code{"name"}, -#' \code{}, \code{"sourceid/"} or \code{"sourceall"}. -#' @param match character; functionality not yet implemented. +#' \code{}, \code{"sourceid/"} or \code{"sourceall"}. See +#' details for more information. +#' @param match character; How should multiple hits be handled?, "all" all +#' matches are returned, "best" the best matching is returned, "ask" enters an +#' interactive mode and the user is asked for input, "na" returns NA if multiple +#' hits are found. #' @param verbose logical; should a verbose output be printed on the console? #' @return Returns a tibble of substance id-s. #' @param ... currently not used. -#' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID}, -#' \code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, -#' \code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, -#' \code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"} will query by +#' #' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID} | +#' \code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, +#' \code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +#' \code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query by #' CAS RN. #' @details \code{} is any valid PubChem Data Source ID. #' @details If \code{from = "sourceall"} the query is one or more valid Pubchem @@ -182,6 +212,9 @@ get_cid <- #' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/}, #' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}. #' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} +#' @importFrom httr POST content +#' @importFrom jsonlite fromJSON +#' @importFrom tibble as_tibble #' @examples #' # might fail if API is not available #' \donttest{ @@ -190,7 +223,11 @@ get_cid <- #' get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") #' get_sid("Optopharma Ltd", from = "sourceall") #' } -get_sid <- function(query, from, match = NULL, verbose = TRUE, ...) { +get_sid <- function(query, + from = "name", + match = c("all", "first", "ask", "na"), + verbose = TRUE, + ...) { from <- tolower(from) if (grepl("^sourceid/", from) == FALSE) { xref <- paste("xref", c("registryid", "rn", "pubmedid", "mmdbid", @@ -198,52 +235,52 @@ get_sid <- function(query, from, match = NULL, verbose = TRUE, ...) { "geneid", "probeid", "patentid"), sep = "/") from <- match.arg(from, choices = c("name", xref, "sourceall")) } - if (ping_pubchem() != TRUE) { - stop("The web service is currently unavailable.") - } - empty_df <- function(x) { - return(data.frame(query = x, sid = NA_character_, stringsAsFactors = FALSE)) - } + match <- match.arg(match) foo <- function(query, from, match, verbose, ...) { - if (is.na(query)) { - return(empty_df(query)) - } - if (verbose == TRUE) { + if (is.na(query)) return(NA) + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/", + from, query, "sids", "json", sep = "/") + if (verbose) { message(paste0("Querying ", query, ". "), appendLF = FALSE) } Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/", - from, query, "sids", "json", sep = "/") - res <- httr::POST(qurl, user_agent("webchem"), timeout(10)) + res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) if (res$status_code != 200) { if (res$status_code == 202) { cont <- httr::content(res, type = "text", encoding = "UTF-8") listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance", "listkey", listkey, "sids", "json", sep = "/") - Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) - res <- httr::POST(qurl, user_agent("webchem"), timeout(10)) + while (res$status_code == 202) { + Sys.sleep(5 + rgamma(1, shape = 15, scale = 1 / 10)) + res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) + } if (res$status_code != 200) { - if (verbose == TRUE) message(httr::message_for_status(res)) - return(empty_df(query)) + if (verbose) message(httr::message_for_status(res)) + return(NA) } } else{ - if (verbose == TRUE) message(httr::message_for_status(res)) - return(empty_df(query)) + if (verbose) message(httr::message_for_status(res)) + return(NA) } } - if (verbose == TRUE) message(httr::message_for_status(res)) + if (verbose) message(httr::message_for_status(res)) cont <- httr::content(res, type = "text", encoding = "UTF-8") cont <- jsonlite::fromJSON(cont)$IdentifierList$SID - cont <- data.frame(query = query, - sid = cont, - stringsAsFactors = FALSE) - return(cont) + out <- unique(unlist(cont)) + out <- matcher(x = out, match = match, verbose = verbose) + out <- as.character(out) + names(out) <- NULL + return(out) } - out <- lapply(query, function(x) foo(x, from, match, verbose)) - out <- do.call(rbind, out) - out <- tibble::as_tibble(out) + out <- map(query, + ~foo(query = .x, from = from, match = match, + verbose = verbose, arg = arg)) + out <- setNames(out, query) + out <- + lapply(out, enframe, name = NULL, value = "sid") %>% + bind_rows(.id = "query") return(out) } diff --git a/man/get_cid.Rd b/man/get_cid.Rd index 739c2e8e..aa958fd6 100644 --- a/man/get_cid.Rd +++ b/man/get_cid.Rd @@ -2,31 +2,32 @@ % Please edit documentation in R/pubchem.R \name{get_cid} \alias{get_cid} -\title{Retrieve Pubchem Id (CID)} +\title{Retrieve Pubchem Compound ID (CID)} \usage{ get_cid( query, - from = c("name", "cid", "sid", "aid", "smiles", "inchi", "inchikey"), + from = "name", match = c("all", "first", "ask", "na"), verbose = TRUE, - search_substances = FALSE, arg = NULL, first = NULL, ... ) } \arguments{ -\item{query}{character; search term.} +\item{query}{character; search term, one or more compounds.} \item{from}{character; type of input, can be one of "name" (default), "cid", -"sid", "aid", "smiles", "inchi", "inchikey"} +"smiles", "inchi", "sdf", "inchikey", "formula", , , +. See details for more information.} -\item{match}{character; How should multiple hits be handled?, "all" all matches are returned, "best" the best matching is returned, "ask" enters an interactive mode and the user is asked for input, "na" returns NA if multiple hits are found.} +\item{match}{character; How should multiple hits be handled?, "all" all +matches are returned, "best" the best matching is returned, "ask" enters an +interactive mode and the user is asked for input, "na" returns NA if multiple +hits are found.} \item{verbose}{logical; should a verbose output be printed on the console?} -\item{search_substances}{logical; If TRUE also searches PubChem SIDs} - \item{arg}{character; optinal arguments like "name_type=word" to match individual words.} @@ -38,8 +39,25 @@ individual words.} a tibble. } \description{ -Return CompoundID (CID) for a search query using PUG-REST, -see \url{https://pubchem.ncbi.nlm.nih.gov/}. +Retrieve compound IDs (CIDs) from PubChem. +} +\details{ + is assembled as "{\code{substructure} | +\code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} + | \code{inchi} | \code{sdf} | \code{cid}}", e.g. + \code{from = "substructure/smiles"}. + +\code{} is assembled as "\code{xref}/\{\code{RegistryID} | +\code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, +\code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +\code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query +by CAS RN. + + is either \code{fastformula} or it is assembled as +"{\code{fastidentity} | \code{fastsimilarity_2d} | \code{fastsimilarity_3d} | +\code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | +\code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +\code{from = "fastidentity/smiles"}. } \note{ Please respect the Terms and Conditions of the National Library of @@ -55,6 +73,10 @@ get_cid("Triclosan") get_cid("Triclosan", arg = "name_type=word") get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") get_cid("CCCC", from = "smiles") +get_cid("C26H52NO6P", from = "formula") +get_cid("56-40-6", from = "xref/rn") +get_cid(5564, from = "similarity/cid") +get_cid("CCO", from = "similarity/smiles") # multiple inputs comp <- c("Triclosan", "Aspirin") diff --git a/man/get_sid.Rd b/man/get_sid.Rd index 7189f742..50d7aa7d 100644 --- a/man/get_sid.Rd +++ b/man/get_sid.Rd @@ -4,35 +4,44 @@ \alias{get_sid} \title{Retrieve PubChem Substance ID (SID)} \usage{ -get_sid(query, from, match = NULL, verbose = TRUE, ...) +get_sid( + query, + from = "name", + match = c("all", "first", "ask", "na"), + verbose = TRUE, + ... +) } \arguments{ \item{query}{character; search term, one ore more substances or depositors.} \item{from}{character; type of input. Valid values are \code{"name"}, -\code{}, \code{"sourceid/"} or \code{"sourceall"}.} +\code{}, \code{"sourceid/"} or \code{"sourceall"}. See +details for more information.} -\item{match}{character; functionality not yet implemented.} +\item{match}{character; How should multiple hits be handled?, "all" all +matches are returned, "best" the best matching is returned, "ask" enters an +interactive mode and the user is asked for input, "na" returns NA if multiple +hits are found.} \item{verbose}{logical; should a verbose output be printed on the console?} -\item{...}{currently not used.} +\item{...}{currently not used. +#' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID} | +\code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, +\code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +\code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query by +CAS RN.} } \value{ Returns a tibble of substance id-s. } \description{ -Retrieve one or more substance IDs (SIDs) from PubChem based on -substance name, registry ID (e.g. CAS RN), or source ID. Alternatively, -retrieve SIDs of all substances provided by a PubChem depositor. +Retrieve substance IDs (SIDs) from PubChem based on substance +name, registry ID (e.g. CAS RN), or source ID. Alternatively, retrieve SIDs +of all substances provided by a PubChem depositor. } \details{ -\code{} is assembled as "\code{xref}/\{\code{RegistryID}, -\code{RN}, \code{PubMedID}, \code{MMDBID}, \code{ProteinGI}, -\code{NucleotideGI}, \code{TaxonomyID}, \code{MIMID}, \code{GeneID}, -\code{ProbeID}, \code{PatentID}\}, e.g. \code{from = "xref/RN"} will query by -CAS RN. - \code{} is any valid PubChem Data Source ID. If \code{from = "sourceall"} the query is one or more valid Pubchem diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index fb046dd9..b147b69d 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -16,6 +16,11 @@ test_that("get_cid()", { expect_true(is.na(get_cid(NA)$cid[1])) expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid[1], "12345") + # issue 206, some queries first return a listkey. + expect_equal(get_cid(5564, from="fastsimilarity_2d/cid")[,1], 5564) + expect_equal(get_cid("C26H52NO6P", from = "formula")[,1], 10864091) + expect_equal(capture_messages(get_cid("balloon")), + "Querying balloon. Not Found (HTTP 404). \n") }) test_that("get_sid()", { From ec60d2723988e64fe4b5b4db0a4bf048d84cc35e Mon Sep 17 00:00:00 2001 From: stitam Date: Wed, 29 Apr 2020 11:29:23 +0100 Subject: [PATCH 08/11] Export get_sid() and fix some tests --- NAMESPACE | 1 + R/pubchem.R | 13 ++++++++++--- tests/testthat/test-pubchem.R | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 698458aa..b497aaf8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ export(get_chebiid) export(get_cid) export(get_csid) export(get_etoxid) +export(get_sid) export(get_wdid) export(inchikey) export(is.cas) diff --git a/R/pubchem.R b/R/pubchem.R index 601342ec..27ae803a 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -111,7 +111,10 @@ get_cid <- from <- match.arg(from, choices = from_choices) match <- match.arg(match) foo <- function(query, from, match, verbose, arg, ...) { - if (is.na(query)) return(NA) + if (is.na(query)) { + if (verbose) message(paste0(query, " is invalid. Returning NA.")) + return(NA) + } if (from %in% structure_search) { qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", from, query, "json", sep = "/") @@ -215,6 +218,7 @@ get_cid <- #' @importFrom httr POST content #' @importFrom jsonlite fromJSON #' @importFrom tibble as_tibble +#' @export #' @examples #' # might fail if API is not available #' \donttest{ @@ -237,7 +241,10 @@ get_sid <- function(query, } match <- match.arg(match) foo <- function(query, from, match, verbose, ...) { - if (is.na(query)) return(NA) + if (is.na(query)) { + if (verbose) message(paste0(query, " is invalid. Returning NA.")) + return(NA) + } qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/", from, query, "sids", "json", sep = "/") if (verbose) { @@ -269,7 +276,7 @@ get_sid <- function(query, cont <- httr::content(res, type = "text", encoding = "UTF-8") cont <- jsonlite::fromJSON(cont)$IdentifierList$SID out <- unique(unlist(cont)) - out <- matcher(x = out, match = match, verbose = verbose) + out <- matcher(x = out, query = query, match = match, verbose = verbose) out <- as.character(out) names(out) <- NULL return(out) diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index b147b69d..f6fa99a3 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -17,10 +17,10 @@ test_that("get_cid()", { expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid[1], "12345") # issue 206, some queries first return a listkey. - expect_equal(get_cid(5564, from="fastsimilarity_2d/cid")[,1], 5564) - expect_equal(get_cid("C26H52NO6P", from = "formula")[,1], 10864091) + expect_equal(get_cid(5564, from="fastsimilarity_2d/cid")$cid[1], "5564") + expect_equal(get_cid("C26H52NO6P", from = "formula")$cid[1], "10864091") expect_equal(capture_messages(get_cid("balloon")), - "Querying balloon. Not Found (HTTP 404). \n") + c("Querying balloon. ", "Not Found (HTTP 404).", "\n")) }) test_that("get_sid()", { @@ -33,9 +33,9 @@ test_that("get_sid()", { expect_true("tbl" %in% class(a)) expect_true(87798 %in% a$sid) - expect_equal(is.na(a2[1:2, 2], c(TRUE, TRUE))) + expect_equal(is.na(a2$sid[1:2]), c(TRUE, TRUE)) expect_true(4681 %in% b$sid) - expect_equal(c$sid, c(385515341, 385515340)) + expect_equal(c$sid, c("385515341", "385515340")) }) test_that("pc_prop", { From 5363ace5af7a4b0ca0595fa14be355cb6f6ac111 Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 14 May 2020 11:39:03 +0100 Subject: [PATCH 09/11] Update get_cid() to query substance and assay domains as well --- NAMESPACE | 2 - NEWS | 8 +- R/pubchem.R | 244 +++++++++++++--------------------- man/get_cid.Rd | 67 ++++++++-- man/get_sid.Rd | 90 ------------- tests/testthat/test-pubchem.R | 76 ++++++----- 6 files changed, 200 insertions(+), 287 deletions(-) delete mode 100644 man/get_sid.Rd diff --git a/NAMESPACE b/NAMESPACE index d1ba1785..029d8fc7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,7 +53,6 @@ export(get_chebiid) export(get_cid) export(get_csid) export(get_etoxid) -export(get_sid) export(get_wdid) export(inchikey) export(is.cas) @@ -91,7 +90,6 @@ importFrom(dplyr,left_join) importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) -importFrom(httr,content) importFrom(httr,http_status) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) diff --git a/NEWS b/NEWS index 3e94c4cb..12045d9a 100644 --- a/NEWS +++ b/NEWS @@ -1,14 +1,8 @@ -<<<<<<< HEAD -webchem 0.5.0.9006 -======= -webchem 0.5.0.9005 ->>>>>>> upstream/master -====================== +webchem 0.5.0.9010 NEW FEATURES * get_cid() now can search by registry IDs (e.g. CAS RN), and can handle more complex requests like searching for similar compounds. -* get_sid() is a new function that can retrieve substance IDs from PubChem based on name, registry ID (e.g. CAS RN), or source ID. * Retrieve chemical data from PubChem content pages with pc_sect(). * get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar] * nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"` diff --git a/R/pubchem.R b/R/pubchem.R index b399c109..05905e28 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -2,19 +2,29 @@ #' #' Retrieve compound IDs (CIDs) from PubChem. #' @param query character; search term, one or more compounds. -#' @param from character; type of input, can be one of "name" (default), "cid", -#' "smiles", "inchi", "sdf", "inchikey", "formula", , , -#' . See details for more information. -#' @param match character; How should multiple hits be handled?, "all" all -#' matches are returned, "best" the best matching is returned, "ask" enters an -#' interactive mode and the user is asked for input, "na" returns NA if multiple -#' hits are found. +#' @param from character; type of input. See details for more information. +#' @param domain character; query domain, can be one of \code{"compound"}, +#' \code{"substance"}, \code{"assay"}. +#' @param match character; How should multiple hits be handled?, \code{"all"} +#' all matches are returned, \code{"best"} the best matching is returned, +#' \code{"ask"} enters an interactive mode and the user is asked for input, +#' \code{"na"} returns NA if multiple hits are found. #' @param verbose logical; should a verbose output be printed on the console? #' @param arg character; optinal arguments like "name_type=word" to match #' individual words. #' @param first deprecated. Use `match` instead. #' @param ... currently unused. #' @return a tibble. +#' @details Valid values for the \code{from} argument depend on the +#' \code{domain}: +#' \itemize{ +#' \item{\code{compound}: \code{"name"}, \code{"smiles"}, \code{"inchi"}, +#' \code{"inchikey"}, \code{"formula"}, \code{"sdf"}, , +#' , .} +#' \item{\code{substance}: \code{"name"}, \code{"sid"}, +#' \code{}, \code{"sourceid/"} or \code{"sourceall"}.} +#' \item{\code{assay}: \code{"aid"}, \code{}.} +#' } #' @details is assembled as "{\code{substructure} | #' \code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} #' | \code{inchi} | \code{sdf} | \code{cid}}", e.g. @@ -29,6 +39,18 @@ #' \code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | #' \code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. #' \code{from = "fastidentity/smiles"}. +#' @details \code{} is any valid PubChem Data Source ID. When +#' \code{from = "sourceid/"}, the query is the ID of the substance in +#' the depositor's database. +#' @details If \code{from = "sourceall"} the query is one or more valid Pubchem +#' depositor names. Depositor names are not case sensitive, but are sensitive to +#' spaces, and special characters may need to be escaped, such as "&" should be +#' replaced by "\%26". +#' @details Depositor names and Data Source IDs can be found at +#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. +#' @details \code{} is assembled as "\code{target}/\{\code{gi} | +#' \code{proteinname} | \code{geneid} | \code{genesymbol} | \code{accession}\}", +#' e.g. \code{from = "target/geneid"} will query by GeneID. #' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public #' Information System for #' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37: @@ -49,6 +71,7 @@ #' usage policies of the indicidual data sources #' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. #' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com} +#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} #' @import httr #' @importFrom purrr map map2 #' @importFrom jsonlite fromJSON @@ -60,21 +83,38 @@ #' # might fail if API is not available #' get_cid("Triclosan") #' get_cid("Triclosan", arg = "name_type=word") -#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +#' # from SMILES #' get_cid("CCCC", from = "smiles") +#' # from InChI +#' get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi") +#' # from InChIKey +#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +#' # from formula #' get_cid("C26H52NO6P", from = "formula") +#' # from CAS RN #' get_cid("56-40-6", from = "xref/rn") +#' # similarity #' get_cid(5564, from = "similarity/cid") #' get_cid("CCO", from = "similarity/smiles") +#' # from SID +#' get_cid("126534046", from = "sid", domain = "substance") +#' # sourceid +#' get_cid("VCC957895", from = "sourceid/23706", domain = "substance") +#' # sourceall +#' get_cid("Optopharma Ltd", from = "sourceall", domain = "substance") +#' # from AID (CIDs of substances tested in the assay) +#' get_cid(170004, from = "aid", domain = "assay") +#' # from GeneID (CIDs of substances tested on the gene) +#' get_cid(25086, from = "target/geneid", domain = "assay") #' #' # multiple inputs -#' comp <- c("Triclosan", "Aspirin") -#' get_cid(comp) +#' get_cid(c("Triclosan", "Aspirin")) #' #' } get_cid <- function(query, from = "name", + domain = c("compound", "substance", "assay"), match = c("all", "first", "ask", "na"), verbose = TRUE, arg = NULL, @@ -84,11 +124,13 @@ get_cid <- if (!is.null(first) && first == TRUE) { message("`first = TRUE` is deprecated. Use `match = 'first'` instead") match <- "first" - } else if (!is.null(first) && first==FALSE) { + } else if (!is.null(first) && first == FALSE) { message("`first = FALSE` is deprecated. Use `match = 'all'` instead") match <- "all" } #input validation + from <- tolower(from) + domain <- match.arg(domain) xref <- paste( "xref", c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", @@ -99,7 +141,8 @@ get_cid <- c("substructure", "superstructure", "similarity", "identity"), c("smiles", "inchi", "sdf", "cid") ) - structure_search <- with(structure_search, paste(Var1, Var2, sep = "/")) + structure_search <- paste(structure_search$Var1, structure_search$Var2, + sep = "/") fast_search <- expand.grid( c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d", "fastsubstructure", "fastsuperstructure"), @@ -107,35 +150,55 @@ get_cid <- ) fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")), "fastformula") - from_choices <-c("cid", "name", "smiles", "inchi", "sdf", "inchikey", - "formula", structure_search, xref, fast_search) - from <- tolower(from) - from <- match.arg(from, choices = from_choices) + targets <- paste("target", c("gi", "proteinname", "geneid", "genesymbol", + "accession"), sep = "/") + if (domain == "compound") { + from_choices <- c("cid", "name", "smiles", "inchi", "sdf", "inchikey", + "formula", structure_search, xref, fast_search) + from <- match.arg(from, choices = from_choices) + } + if (domain == "substance") { + if (grepl("^sourceid/", from) == FALSE) { + from <- match.arg(from, choices = c("sid", "name", xref, "sourceall")) + } + } + if (domain == "assay") { + from <- match.arg(from, choices = c("aid", targets)) + } match <- match.arg(match) - foo <- function(query, from, match, verbose, arg, ...) { + foo <- function(query, from, domain, match, verbose, arg, ...) { if (is.na(query)) { if (verbose) message(paste0(query, " is invalid. Returning NA.")) return(NA) } if (from %in% structure_search) { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", - from, query, "json", sep = "/") + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, query, "json", sep = "/") } else { - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", - from, query, "cids", "json", sep = "/") + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, query, "cids", "json", sep = "/") } if (!is.null(arg)) qurl <- paste0(qurl, "?", arg) if (verbose) { message(paste0("Querying ", query, ". "), appendLF = FALSE) } Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) - res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) + if (from == "inchi") { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, "cids", "json", sep = "/") + res <- httr::POST(qurl, body = paste0("inchi=", query), + user_agent("webchem"), handle = handle("")) + } + else { + res <- httr::POST(qurl, user_agent("webchem"), + handle = handle("")) + } if (res$status_code != 200) { if (res$status_code == 202) { cont <- httr::content(res, type = "text", encoding = "UTF-8") listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound", + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/", domain, "listkey", listkey, "cids", "json", sep = "/") while (res$status_code == 202) { Sys.sleep(5 + rgamma(1, shape = 15, scale = 1 / 10)) @@ -153,16 +216,23 @@ get_cid <- } if (verbose) message(httr::message_for_status(res)) cont <- httr::content(res, type = "text", encoding = "UTF-8") - cont <- jsonlite::fromJSON(cont)$IdentifierList$CID + if (domain == "compound") { + cont <- jsonlite::fromJSON(cont)$IdentifierList$CID + } + if (domain == "substance") { + cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID + } + if (domain == "assay") { + cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID + } out <- unique(unlist(cont)) out <- matcher(x = out, query = query, match = match, verbose = verbose) out <- as.character(out) names(out) <- NULL return(out) - return(cont) } out <- map(query, - ~foo(query = .x, from = from, match = match, + ~foo(query = .x, from = from, domain = domain, match = match, verbose = verbose, arg = arg)) out <- setNames(out, query) out <- @@ -171,128 +241,6 @@ get_cid <- return(out) } -#' Retrieve PubChem Substance ID (SID) -#' -#' @description Retrieve substance IDs (SIDs) from PubChem based on substance -#' name, registry ID (e.g. CAS RN), or source ID. Alternatively, retrieve SIDs -#' of all substances provided by a PubChem depositor. -#' @param query character; search term, one ore more substances or depositors. -#' @param from character; type of input. Valid values are \code{"name"}, -#' \code{}, \code{"sourceid/"} or \code{"sourceall"}. See -#' details for more information. -#' @param match character; How should multiple hits be handled?, "all" all -#' matches are returned, "best" the best matching is returned, "ask" enters an -#' interactive mode and the user is asked for input, "na" returns NA if multiple -#' hits are found. -#' @param verbose logical; should a verbose output be printed on the console? -#' @return Returns a tibble of substance id-s. -#' @param ... currently not used. -#' #' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID} | -#' \code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, -#' \code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | -#' \code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query by -#' CAS RN. -#' @details \code{} is any valid PubChem Data Source ID. -#' @details If \code{from = "sourceall"} the query is one or more valid Pubchem -#' depositor names. Depositor names are not case sensitive, but are sensitive to -#' spaces, and special characters may need to be escaped, such as "&" should be -#' replaced by "\%26". -#' @details Depositor names and Data Source IDs can be found at -#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. -#' @references Wang, Y., J. Xiao, T. O. Suzek, et al. (2009). PubChem: A Public -#' Information System for Analyzing Bioactivities of Small Molecules. Nucleic -#' Acids Research 37: 623–633. -#' @references Kim, S., Thiessen, P. A., Bolton, E. E., & Bryant, S. H. (2015). -#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical -#' information in PubChem. Nucleic acids research, gkv396. -#' @references Kim, Sunghwan, Paul A. Thiessen, Evan E. Bolton, et al. (2016). -#' PubChem Substance and Compound Databases. Nucleic Acids Research 44(D1): -#' D1202–D1213. -#' @references Sunghwan Kim, Paul A Thiessen, Tiejun Cheng, Bo Yu, Evan E Bolton -#' (2018) An update on PUG-REST: RESTful interface for programmatic access to -#' PubChem. Nucleic Acids Research 46(W1): W563–W570. -#' @note Please respect the Terms and Conditions of the National Library of -#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data -#' usage policies of National Center for Biotechnology Information, -#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/}, -#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}. -#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} -#' @importFrom httr POST content -#' @importFrom jsonlite fromJSON -#' @importFrom tibble as_tibble -#' @export -#' @examples -#' # might fail if API is not available -#' \donttest{ -#' get_sid("2-(Acetyloxy)benzoic acid", from = "name") -#' get_sid("57-27-2", from = "xref/rn") -#' get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") -#' get_sid("Optopharma Ltd", from = "sourceall") -#' } -get_sid <- function(query, - from = "name", - match = c("all", "first", "ask", "na"), - verbose = TRUE, - ...) { - from <- tolower(from) - if (grepl("^sourceid/", from) == FALSE) { - xref <- paste("xref", c("registryid", "rn", "pubmedid", "mmdbid", - "proteingi", "nucleotidegi", "taxonomyid", "mimid", - "geneid", "probeid", "patentid"), sep = "/") - from <- match.arg(from, choices = c("name", xref, "sourceall")) - } - match <- match.arg(match) - foo <- function(query, from, match, verbose, ...) { - if (is.na(query)) { - if (verbose) message(paste0(query, " is invalid. Returning NA.")) - return(NA) - } - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/", - from, query, "sids", "json", sep = "/") - if (verbose) { - message(paste0("Querying ", query, ". "), appendLF = FALSE) - } - Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) - res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) - if (res$status_code != 200) { - if (res$status_code == 202) { - cont <- httr::content(res, type = "text", encoding = "UTF-8") - listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey - qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance", - "listkey", listkey, "sids", "json", sep = "/") - while (res$status_code == 202) { - Sys.sleep(5 + rgamma(1, shape = 15, scale = 1 / 10)) - res <- httr::POST(qurl, user_agent("webchem"), handle = handle("")) - } - if (res$status_code != 200) { - if (verbose) message(httr::message_for_status(res)) - return(NA) - } - } - else{ - if (verbose) message(httr::message_for_status(res)) - return(NA) - } - } - if (verbose) message(httr::message_for_status(res)) - cont <- httr::content(res, type = "text", encoding = "UTF-8") - cont <- jsonlite::fromJSON(cont)$IdentifierList$SID - out <- unique(unlist(cont)) - out <- matcher(x = out, query = query, match = match, verbose = verbose) - out <- as.character(out) - names(out) <- NULL - return(out) - } - out <- map(query, - ~foo(query = .x, from = from, match = match, - verbose = verbose, arg = arg)) - out <- setNames(out, query) - out <- - lapply(out, enframe, name = NULL, value = "sid") %>% - bind_rows(.id = "query") - return(out) -} - #' Retrieve compound properties from a pubchem CID #' #' Retrieve compound information from pubchem CID, see diff --git a/man/get_cid.Rd b/man/get_cid.Rd index 2978c5f3..0d7f968d 100644 --- a/man/get_cid.Rd +++ b/man/get_cid.Rd @@ -7,6 +7,7 @@ get_cid( query, from = "name", + domain = c("compound", "substance", "assay"), match = c("all", "first", "ask", "na"), verbose = TRUE, arg = NULL, @@ -17,14 +18,15 @@ get_cid( \arguments{ \item{query}{character; search term, one or more compounds.} -\item{from}{character; type of input, can be one of "name" (default), "cid", -"smiles", "inchi", "sdf", "inchikey", "formula", , , -. See details for more information.} +\item{from}{character; type of input. See details for more information.} -\item{match}{character; How should multiple hits be handled?, "all" all -matches are returned, "best" the best matching is returned, "ask" enters an -interactive mode and the user is asked for input, "na" returns NA if multiple -hits are found.} +\item{domain}{character; query domain, can be one of \code{"compound"}, +\code{"substance"}, \code{"assay"}.} + +\item{match}{character; How should multiple hits be handled?, \code{"all"} +all matches are returned, \code{"best"} the best matching is returned, +\code{"ask"} enters an interactive mode and the user is asked for input, +\code{"na"} returns NA if multiple hits are found.} \item{verbose}{logical; should a verbose output be printed on the console?} @@ -42,6 +44,17 @@ a tibble. Retrieve compound IDs (CIDs) from PubChem. } \details{ +Valid values for the \code{from} argument depend on the +\code{domain}: +\itemize{ +\item{\code{compound}: \code{"name"}, \code{"smiles"}, \code{"inchi"}, +\code{"inchikey"}, \code{"formula"}, \code{"sdf"}, , +, .} +\item{\code{substance}: \code{"name"}, \code{"sid"}, +\code{}, \code{"sourceid/"} or \code{"sourceall"}.} +\item{\code{assay}: \code{"aid"}, \code{}.} +} + is assembled as "{\code{substructure} | \code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. @@ -58,6 +71,22 @@ by CAS RN. \code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | \code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. \code{from = "fastidentity/smiles"}. + +\code{} is any valid PubChem Data Source ID. When +\code{from = "sourceid/"}, the query is the ID of the substance in +the depositor's database. + +If \code{from = "sourceall"} the query is one or more valid Pubchem +depositor names. Depositor names are not case sensitive, but are sensitive to +spaces, and special characters may need to be escaped, such as "&" should be +replaced by "\%26". + +Depositor names and Data Source IDs can be found at +\url{https://pubchem.ncbi.nlm.nih.gov/sources/}. + +\code{} is assembled as "\code{target}/\{\code{gi} | +\code{proteinname} | \code{geneid} | \code{genesymbol} | \code{accession}\}", +e.g. \code{from = "target/geneid"} will query by GeneID. } \note{ Please respect the Terms and Conditions of the National Library of @@ -73,16 +102,32 @@ usage policies of the indicidual data sources # might fail if API is not available get_cid("Triclosan") get_cid("Triclosan", arg = "name_type=word") -get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +# from SMILES get_cid("CCCC", from = "smiles") +# from InChI +get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi") +# from InChIKey +get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +# from formula get_cid("C26H52NO6P", from = "formula") +# from CAS RN get_cid("56-40-6", from = "xref/rn") +# similarity get_cid(5564, from = "similarity/cid") get_cid("CCO", from = "similarity/smiles") +# from SID +get_cid("126534046", from = "sid", domain = "substance") +# sourceid +get_cid("VCC957895", from = "sourceid/23706", domain = "substance") +# sourceall +get_cid("Optopharma Ltd", from = "sourceall", domain = "substance") +# from AID (CIDs of substances tested in the assay) +get_cid(170004, from = "aid", domain = "assay") +# from GeneID (CIDs of substances tested on the gene) +get_cid(25086, from = "target/geneid", domain = "assay") # multiple inputs -comp <- c("Triclosan", "Aspirin") -get_cid(comp) +get_cid(c("Triclosan", "Aspirin")) } } @@ -102,4 +147,6 @@ information in PubChem. Nucleic acids research, gkv396. } \author{ Eduard Szoecs, \email{eduardszoecs@gmail.com} + +Tamás Stirling, \email{stirling.tamas@gmail.com} } diff --git a/man/get_sid.Rd b/man/get_sid.Rd deleted file mode 100644 index 50d7aa7d..00000000 --- a/man/get_sid.Rd +++ /dev/null @@ -1,90 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pubchem.R -\name{get_sid} -\alias{get_sid} -\title{Retrieve PubChem Substance ID (SID)} -\usage{ -get_sid( - query, - from = "name", - match = c("all", "first", "ask", "na"), - verbose = TRUE, - ... -) -} -\arguments{ -\item{query}{character; search term, one ore more substances or depositors.} - -\item{from}{character; type of input. Valid values are \code{"name"}, -\code{}, \code{"sourceid/"} or \code{"sourceall"}. See -details for more information.} - -\item{match}{character; How should multiple hits be handled?, "all" all -matches are returned, "best" the best matching is returned, "ask" enters an -interactive mode and the user is asked for input, "na" returns NA if multiple -hits are found.} - -\item{verbose}{logical; should a verbose output be printed on the console?} - -\item{...}{currently not used. -#' @details \code{} is assembled as "\code{xref}/\{\code{RegistryID} | -\code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, -\code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | -\code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query by -CAS RN.} -} -\value{ -Returns a tibble of substance id-s. -} -\description{ -Retrieve substance IDs (SIDs) from PubChem based on substance -name, registry ID (e.g. CAS RN), or source ID. Alternatively, retrieve SIDs -of all substances provided by a PubChem depositor. -} -\details{ -\code{} is any valid PubChem Data Source ID. - -If \code{from = "sourceall"} the query is one or more valid Pubchem -depositor names. Depositor names are not case sensitive, but are sensitive to -spaces, and special characters may need to be escaped, such as "&" should be -replaced by "\%26". - -Depositor names and Data Source IDs can be found at -\url{https://pubchem.ncbi.nlm.nih.gov/sources/}. -} -\note{ -Please respect the Terms and Conditions of the National Library of -Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data -usage policies of National Center for Biotechnology Information, -\url{https://www.ncbi.nlm.nih.gov/home/about/policies/}, -\url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}. -} -\examples{ -# might fail if API is not available -\donttest{ -get_sid("2-(Acetyloxy)benzoic acid", from = "name") -get_sid("57-27-2", from = "xref/rn") -get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") -get_sid("Optopharma Ltd", from = "sourceall") -} -} -\references{ -Wang, Y., J. Xiao, T. O. Suzek, et al. (2009). PubChem: A Public -Information System for Analyzing Bioactivities of Small Molecules. Nucleic -Acids Research 37: 623–633. - -Kim, S., Thiessen, P. A., Bolton, E. E., & Bryant, S. H. (2015). -PUG-SOAP and PUG-REST: web services for programmatic access to chemical -information in PubChem. Nucleic acids research, gkv396. - -Kim, Sunghwan, Paul A. Thiessen, Evan E. Bolton, et al. (2016). -PubChem Substance and Compound Databases. Nucleic Acids Research 44(D1): -D1202–D1213. - -Sunghwan Kim, Paul A Thiessen, Tiejun Cheng, Bo Yu, Evan E Bolton -(2018) An update on PUG-REST: RESTful interface for programmatic access to -PubChem. Nucleic Acids Research 46(W1): W563–W570. -} -\author{ -Tamás Stirling, \email{stirling.tamas@gmail.com} -} diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index 27fc3723..7cffd60e 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -3,48 +3,62 @@ test_that("get_cid()", { skip_on_cran() skip_if_not(up, "PubChem service is down") - + #from name expect_equal(get_cid("Triclosan")$cid[1], "5564") + expect_equal(get_cid("Triclosan", domain = "substance")$cid[1], "5564") + #from smiles + expect_equal(get_cid("CCCC", from = "smiles")$cid, "7843") + #from inchi + expect_equal(get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi")$cid, + "6329") + #from inchikey + expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid[1], + "12345") + #from formula, issue 206, some queries first return a listkey. + expect_equal(get_cid("C26H52NO6P", from = "formula")$cid[1], "10864091") + # from CAS RN + expect_equal(get_cid("56-40-6", from = "xref/rn")$cid[1], "750") + expect_equal(get_cid("56-40-6", from = "xref/rn", + domain = "substance")$cid[1], "5257127") + #from cid, similarity + expect_equal(get_cid(5564, from = "similarity/cid")$cid[1], "5564") + #from smiles, similarity + expect_equal(get_cid("CCO", from = "similarity/smiles")$cid[1], "702") + #from SID + expect_equal(get_cid("126534046", from = "sid", domain = "substance")$cid, + "24971898") + # sourceid + expect_equal(get_cid("VCC957895", from = "sourceid/23706", + domain = "substance")$cid, "19689584") + # sourceall + expect_equal(get_cid("Optopharma Ltd", from = "sourceall", + domain = "substance")$cid[1], "102361739") + #from aid + expect_equal(get_cid(170004, from = "aid", domain = "assay")$cid, "68352") + #from GeneID + expect_equal(get_cid(25086, from = "target/geneid", domain = "assay")$cid[1], + "11580958") + #arg expect_true(nrow(get_cid("Triclosan", arg = "name_type=word")) > 1) + #match expect_true(nrow(get_cid("Triclosan", arg = "name_type=word", - match = "first")) == 1) + match = "first")) == 1) + #multiple compounds expect_true(nrow(get_cid(c("Triclosan", "Aspirin"))) == 2) - expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) - expect_warning( - get_cid("xxxx", verbose = FALSE), - "No CID found that matches the given name. Returning NA." - ) + #invalid input expect_true(is.na(get_cid(NA)$cid[1])) - expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid[1], - "12345") - # issue 206, some queries first return a listkey. - expect_equal(get_cid(5564, from="fastsimilarity_2d/cid")$cid[1], "5564") - expect_equal(get_cid("C26H52NO6P", from = "formula")$cid[1], "10864091") + expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) expect_equal(capture_messages(get_cid("balloon")), c("Querying balloon. ", "Not Found (HTTP 404).", "\n")) }) -test_that("get_sid()", { - skip_on_cran() - - a <- get_sid("2-(Acetyloxy)benzoic acid", from = "name") - a2 <- get_sid(c("balloon", NA, "aspirin"), from = "name") - b <- get_sid("57-27-2", from = "xref/rn") - c <- get_sid(c("VCC957895", "VCC883692"), from = "sourceid/23706") - - expect_true("tbl" %in% class(a)) - expect_true(87798 %in% a$sid) - expect_equal(is.na(a2$sid[1:2]), c(TRUE, TRUE)) - expect_true(4681 %in% b$sid) - expect_equal(c$sid, c("385515341", "385515340")) -}) - test_that("pc_prop", { skip_on_cran() skip_if_not(up, "PubChem service is down") a <- pc_prop("5564", properties = "CanonicalSmiles", verbose = FALSE) - b <- suppressWarnings(pc_prop("xxx", properties = "CanonicalSmiles", verbose = FALSE)) + b <- suppressWarnings(pc_prop("xxx", properties = "CanonicalSmiles", + verbose = FALSE)) c <- pc_prop("5564", properties = c("CanonicalSmiles", "InChiKey"), verbose = FALSE) expect_equal(a$CanonicalSMILES, "C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl") @@ -122,7 +136,8 @@ test_that("pc_sect()", { b <- pc_sect(2231, "depositor-supplied synonyms", "substance") expect_s3_class(b, c("tbl_df", "tbl", "data.frame")) expect_equal(names(b), c("SID", "Name", "Result", "SourceName", "SourceID")) - expect_equivalent(b$Result, c("cholesterol", "57-88-5", "5-cholestene-3beta-ol")) + expect_equivalent(b$Result, c("cholesterol", "57-88-5", + "5-cholestene-3beta-ol")) c <- pc_sect(780286, "modify date", "assay") expect_s3_class(c, c("tbl_df", "tbl", "data.frame")) @@ -132,7 +147,8 @@ test_that("pc_sect()", { d <- pc_sect("1ZHY_A", "Sequence", "protein") expect_s3_class(d, c("tbl_df", "tbl", "data.frame")) expect_equal(names(d), c("pdbID", "Name", "Result", "SourceName", "SourceID")) - expect_equivalent(d$Result[1], ">pdb|1ZHY|A Chain A, 1 Kes1 Protein (Run BLAST)") + expect_equivalent(d$Result[1], + ">pdb|1ZHY|A Chain A, 1 Kes1 Protein (Run BLAST)") e <- pc_sect("US2013040379", "Patent Identifier Synonyms", "patent") expect_s3_class(e, c("tbl_df", "tbl", "data.frame")) From d3bc549333e875c86bfdc6bcb87dd6d958e70860 Mon Sep 17 00:00:00 2001 From: stitam Date: Thu, 14 May 2020 12:30:09 +0100 Subject: [PATCH 10/11] Fix failed tests --- R/flavornet.R | 3 ++- tests/testthat/test-pubchem.R | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/flavornet.R b/R/flavornet.R index 62192e0b..9d28392a 100644 --- a/R/flavornet.R +++ b/R/flavornet.R @@ -27,7 +27,7 @@ fn_percept <- function(CAS, verbose = TRUE, ...) { foo <- function (CAS, verbose){ - qurl = paste0("http://www.flavornet.org/info/",CAS,".html") + qurl <- paste0("http://www.flavornet.org/info/",CAS,".html") if (verbose) message(qurl) Sys.sleep(rgamma(1, shape = 10, scale = 1/10)) @@ -43,5 +43,6 @@ fn_percept <- function(CAS, verbose = TRUE, ...) } percepts <- sapply(CAS, foo, verbose = verbose) percepts <- setNames(percepts, CAS) + suppressWarnings(closeAllConnections()) return(percepts) } \ No newline at end of file diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index 7cffd60e..198697d9 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -30,9 +30,6 @@ test_that("get_cid()", { # sourceid expect_equal(get_cid("VCC957895", from = "sourceid/23706", domain = "substance")$cid, "19689584") - # sourceall - expect_equal(get_cid("Optopharma Ltd", from = "sourceall", - domain = "substance")$cid[1], "102361739") #from aid expect_equal(get_cid(170004, from = "aid", domain = "assay")$cid, "68352") #from GeneID @@ -50,6 +47,10 @@ test_that("get_cid()", { expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) expect_equal(capture_messages(get_cid("balloon")), c("Querying balloon. ", "Not Found (HTTP 404).", "\n")) + skip("This test passes locally, but fails on AppVeyor and Travis") + # sourceall + expect_equal(get_cid("Optopharma Ltd", from = "sourceall", + domain = "substance")$cid[1], "102361739") }) test_that("pc_prop", { From 25eea8726868422dac9ec7a7968d3360223b4358 Mon Sep 17 00:00:00 2001 From: stitam Date: Fri, 15 May 2020 09:04:07 +0100 Subject: [PATCH 11/11] Add URLencoding and uncomment a failed test --- R/pubchem.R | 11 +++++------ man/get_cid.Rd | 4 +--- tests/testthat/test-pubchem.R | 1 - 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/R/pubchem.R b/R/pubchem.R index 05905e28..95683fe2 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -43,9 +43,7 @@ #' \code{from = "sourceid/"}, the query is the ID of the substance in #' the depositor's database. #' @details If \code{from = "sourceall"} the query is one or more valid Pubchem -#' depositor names. Depositor names are not case sensitive, but are sensitive to -#' spaces, and special characters may need to be escaped, such as "&" should be -#' replaced by "\%26". +#' depositor names. Depositor names are not case sensitive. #' @details Depositor names and Data Source IDs can be found at #' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. #' @details \code{} is assembled as "\code{target}/\{\code{gi} | @@ -171,6 +169,10 @@ get_cid <- if (verbose) message(paste0(query, " is invalid. Returning NA.")) return(NA) } + if (verbose) { + message(paste0("Querying ", query, ". "), appendLF = FALSE) + } + if (is.character(query)) query <- URLencode(query) if (from %in% structure_search) { qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", domain, from, query, "json", sep = "/") @@ -180,9 +182,6 @@ get_cid <- domain, from, query, "cids", "json", sep = "/") } if (!is.null(arg)) qurl <- paste0(qurl, "?", arg) - if (verbose) { - message(paste0("Querying ", query, ". "), appendLF = FALSE) - } Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) if (from == "inchi") { qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", diff --git a/man/get_cid.Rd b/man/get_cid.Rd index 0d7f968d..117b182a 100644 --- a/man/get_cid.Rd +++ b/man/get_cid.Rd @@ -77,9 +77,7 @@ by CAS RN. the depositor's database. If \code{from = "sourceall"} the query is one or more valid Pubchem -depositor names. Depositor names are not case sensitive, but are sensitive to -spaces, and special characters may need to be escaped, such as "&" should be -replaced by "\%26". +depositor names. Depositor names are not case sensitive. Depositor names and Data Source IDs can be found at \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index 198697d9..63c9bfe3 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -47,7 +47,6 @@ test_that("get_cid()", { expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) expect_equal(capture_messages(get_cid("balloon")), c("Querying balloon. ", "Not Found (HTTP 404).", "\n")) - skip("This test passes locally, but fails on AppVeyor and Travis") # sourceall expect_equal(get_cid("Optopharma Ltd", from = "sourceall", domain = "substance")$cid[1], "102361739")