diff --git a/.Rbuildignore b/.Rbuildignore index 78b88743..16b2ec5b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -23,3 +23,10 @@ CONTRIBUTING.md README_cache .vscode +^codecov\.yml$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ + +.Rmd.orig + diff --git a/.gitignore b/.gitignore index 9c9e2da7..c4f57f2a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ README_files* README.html .github/*.html README_cache* +docs +inst/doc diff --git a/.travis.yml b/.travis.yml index c3552180..2d15c76a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,11 +8,11 @@ cache: packages env: global: - _R_CRAN_CHECK_INCOMING_=false + - NOT_CRAN=true r_check_args: "--no-build-vignettes --no-manual --timings --as-cran --no-examples" r_check_revdep: false warnings_are_errors: false - apt_packages: - libssl-dev - libcurl4-openssl-dev @@ -30,13 +30,13 @@ r_github_packages: # report to codecov after_success: - - Rscript -e 'covr::codecov(line_exclusions = list("R/ppdb.R"), quiet = FALSE)' + - Rscript -e 'covr::codecov()' # report to maintainer notifications: email: recipients: - - eduardszoecs@gmail.com + - stirling.tamas@gmail.com on_success: change on_failure: change diff --git a/DESCRIPTION b/DESCRIPTION index 6add3ff6..18b801ce 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,5 +38,9 @@ Imports: Suggests: testthat, rcdk, - robotstxt + covr, + robotstxt, + knitr, + rmarkdown RoxygenNote: 7.1.0 +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 1c9ff3dc..eb4fa5b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method(cas,aw_query) S3method(cas,chebi_comp_entity) +S3method(cas,cts_compinfo) S3method(cas,default) +S3method(cas,etox_basic) +S3method(cas,opsin_query) S3method(cas,pan_query) S3method(cas,wd_ident) S3method(inchikey,aw_query) S3method(inchikey,chebi_comp_entity) -S3method(inchikey,cs_compinfo) -S3method(inchikey,cs_extcompinfo) +S3method(inchikey,cts_compinfo) S3method(inchikey,default) 
S3method(inchikey,etox_basic) S3method(inchikey,opsin_query) @@ -16,8 +19,6 @@ S3method(inchikey,pc_prop) S3method(inchikey,wd_ident) S3method(smiles,aw_query) S3method(smiles,chebi_comp_entity) -S3method(smiles,cs_compinfo) -S3method(smiles,cs_extcompinfo) S3method(smiles,cts_compinfo) S3method(smiles,default) S3method(smiles,etox_basic) @@ -33,6 +34,7 @@ export(chebi_comp_entity) export(ci_query) export(cid_compinfo) export(cir) +export(cir_img) export(cir_query) export(cs_check_key) export(cs_compinfo) @@ -67,10 +69,7 @@ export(parse_mol) export(pc_prop) export(pc_sect) export(pc_synonyms) -export(ping_cs) -export(ping_pan) -export(ping_pubchem) -export(ping_pubchem_pw) +export(ping_service) export(pp_query) export(ppdb) export(ppdb_parse) @@ -84,12 +83,15 @@ import(httr) import(jsonlite) import(rvest) import(stringr) +import(tibble) import(xml2) importFrom(data.tree,Do) importFrom(data.tree,FindNode) importFrom(data.tree,as.Node) importFrom(dplyr,bind_rows) +importFrom(dplyr,everything) importFrom(dplyr,left_join) +importFrom(dplyr,select) importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) @@ -107,6 +109,7 @@ importFrom(stats,rgamma) importFrom(stats,setNames) importFrom(tibble,as_tibble) importFrom(tibble,enframe) +importFrom(tibble,tibble) importFrom(utils,URLdecode) importFrom(utils,URLencode) importFrom(utils,adist) diff --git a/NEWS b/NEWS.md similarity index 88% rename from NEWS rename to NEWS.md index 4a60d996..cd7c01eb 100644 --- a/NEWS +++ b/NEWS.md @@ -1,20 +1,20 @@ -webchem 0.5.0.9005 -====================== +# webchem 0.5.0.9010 -NEW FEATURES +## NEW FEATURES +* get_cid() now can search by registry IDs (e.g. CAS RN), and can handle more complex requests like searching for similar compounds. * Retrieve chemical data from PubChem content pages with pc_sect(). * get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. 
[PR #241, added by @andschar] * nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"` -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * all `get_*()` functions now output tibbles with a column for the query and a column for the retrieved ID * changes to arguments in `get_*()` functions to make them more consistent * aw_idx.rda is no longer included in the package as a data set. Instead, it is built by build_aw_idx() to tempdir(). -BUG FIXES +## BUG FIXES * nist_ri() returned malformed tables or errored if there was only one entry for a query * get_csid() now returns all csids when queried from formula @@ -23,14 +23,14 @@ BUG FIXES * get_cid() returned the PubChem ID of sodium when the query was NA [PR #223, fixed by stitam] * aw_query() returned a list for successful queries, NA for unsuccessful queries [PR #222, fixed by stitam] -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS -webchem 0.5.0 -====================== +# webchem 0.5.0 -NEW FEATURES + +## NEW FEATURES * Retrieve data from ChEBI (https://www.ebi.ac.uk/chebi/) webservice with get_chebiid() and chebi_comp_entity(). ChEBI comprises a rich data base on chemicals with bilogical interest [contributed by @andschar]. * Retrieve retention indices from NIST (https://webbook.nist.gov) with nist_ri() [PR #154, contributed by @Aariq] @@ -38,12 +38,12 @@ NEW FEATURES * "first" argument in cts_convert() and cir_query() and "interactive" argument in pc_synonyms() deprecated. Use "choices" instead to return either a list of all results, only the first result, or an interactive menu to choose a result to return. [contributed by @Aariq] * ChemSpider functions now look for an API token stored in .Renviron or .Rprofile by default so you can keep them hidden more easily. -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * as.cas() added. 
* removed documentation files for non-exported functions that were only used internally. -BUG FIXES +## BUG FIXES * cs_prop() failed with duplicated return values [issue #148, reported and fixed by @stanstrup] * pp_query() failed when compound present, but no properties [issue #151, reported and fixed by @stanstrup] @@ -54,24 +54,23 @@ BUG FIXES * fixed functions that communicate with the ChemSpider API [issue #149, issue #160, fixed by @stitam] * get_etoxid() printed incorrect results for certain match types [issue #201, fixed by @stitam] -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS * cs_extcompinfo() cannot be fixed as there is no equivalent in the new ChemSpider API yet. -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS * ppdb_parse() has been removed. webchem no longer offers any support for PPDB. * pp_query() has been removed. Physprop API is no longer active. * cs_prop() has been removed. -webchem 0.4.0 -====================== +# webchem 0.4.0 -NEW FEATURES +## NEW FEATURES -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS -BUG FIXES +## BUG FIXES * extr_num() did not work properly with decimal numbers [issue #136, reported and fixed by @stanstrup] * cs_prop() failed when epi-suite data was not available [issue #139, reported and fixed by @stanstrup] @@ -82,98 +81,98 @@ BUG FIXES * cir_query() failed with identifiers containing spaces (e.g. 'acetic acid') [issue #146, reported by Lars Nielsen] * several other functions failed with identifiers containing spaces & returned wrong distance. -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS + +## DEFUNCT FUNCTIONS -DEFUNCT FUNCTIONS +# webchem 0.3.0 -webchem 0.3.0 -====================== -NEW FEATURES +## NEW FEATURES -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * cs_prop() now also return experimental data for Boiling and Melting Points. * pc_synonyms gained an argument 'interactive' to enter an interactive mode for selecting synonyms [issue #129, requested by @Aariq] * cts_convert now returns NA if no matches are found. 
-BUG FIXES +## BUG FIXES * cs_prop() failed with some CSIDs [isse #127, reported by @Aariq] * wd_ident() failed if multiple entries where found. Now returns the first hit only. * ci_query() did not return fully cleaned smiles and inchi -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS -webchem 0.2.0 -====================== +# webchem 0.2.0 -NEW FEATURES + +## NEW FEATURES * fn_percept() extracts flavor percepts using CAS numbers from www.flavornet.org. Flavornet is a database of 738 compounds with human-detectible odors. [contributed by @Aariq] -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS + +## BUG FIXES -BUG FIXES +## DEPRECATED FUNCTIONS -DEPRECATED FUNCTIONS +## DEFUNCT FUNCTIONS -DEFUNCT FUNCTIONS +# webchem 0.1.1 -webchem 0.1.1 -====================== -NEW FEATURES +## NEW FEATURES * added ping_pubchem() to check whether pubchem is up & running * added cs_web_ping () to check whether the chemspider webpage is functional -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * updated allan wood index -BUG FIXES +## BUG FIXES * pc_prop() returned to many rows if last cid supplied was NA * Switched to https for NCBI, chemspider & chemid (Issue #120, reported by @jranke) * get_wdid() failed if non-ascii characters where returned by wikipedia * rcdk:parse.smiles() now returns NA if a SMILES string could not be parsed. => broke is.smiles -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS -webchem 0.1.0 -====================== +# webchem 0.1.0 -NEW FEATURES + +## NEW FEATURES * added cts_to() and cts_from() to retrieve possible ids that can be queried. * cts_*(), pp_query(), cir_query(), get_cid(), get_etoxid(), etox_*(), pan_query() get_wdid(), aw_query(), get_csid(), cs_prop(), cs_compinfo() and ci_query() can handle multiple inputs. * pc_prop() queries properties and pc_synonmy() synonyms from PUG-REST. * added extractors for webchem objects: cas(), inchikey() and smiles(). 
-MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * rewrite of pubchem functions using PUG-REST * chemspider: better use of NA in input (=return NA) * more robust matching in get_etoxid -BUG FIXES +## BUG FIXES * pan_query() did not return numeric values * get_cid() failed with multiple results -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS * ppdb_query() has been removed due to copyright issues. The new ppdb_parse() parses only a html, but does not interact with the database @@ -186,10 +185,9 @@ The new ppdb_parse() parses only a html, but does not interact with the database -webchem 0.0.5 -====================== +# webchem 0.0.5 -NEW FEATURES +## NEW FEATURES * is.smiles() checks SMILES strings, by parsing via (R)CDK. * get_wdid() and wd_indent() to retrieve information from wikidata. @@ -197,7 +195,7 @@ NEW FEATURES * ci_query() can handle multi inputs (interactive mode, best match, first match and NA). * cs_prop() queries predcitions (ACD and EPiSuite) from ChemSpider -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * webchem uses exclusively xml2 (instead of XML). * All function return source_url for (micro-)attribution of sources @@ -211,7 +209,7 @@ MINOR IMPROVEMENTS * aw_query() returns multiple inchikey if found. * pan() now returns chemical name and matched synonym. -BUG FIXES +## BUG FIXES * utility functions are not vectorized and throw an error. * chemid() did mot work with inchikey as input. @@ -221,7 +219,7 @@ BUG FIXES * ci_query() failed if multi hits were found. Now returns first hit. * aw_fuery() failed if inchikey was not found. 
-DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS * pan_query() replaces pan() * aw_query() replaces alanwood() @@ -230,7 +228,7 @@ DEPRECATED FUNCTIONS * ci_query() replaces chemid() * pp_query() replaces physprop() -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS * csid_compinfo() * csid_extcompinfo() @@ -238,10 +236,10 @@ DEFUNCT FUNCTIONS -webchem 0.0.4 -====================== +# webchem 0.0.4 + -NEW FEATURES +## NEW FEATURES * chemid() to query ChemIDplus http://chem.sis.nlm.nih.gov/chemidplus/. * is.inchikey() and is.cas() to check if a string is valid inchikey or CAS registry number. @@ -261,7 +259,7 @@ NEW FEATURES * webchem has now a zenodo doi, please cite if you use it. -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * cts_compinfo() checks if input is a inchikey (via exported function is.inchikey()). * cts_compinfo() is now more robust and verbose, if problems are encountered @@ -270,14 +268,14 @@ MINOR IMPROVEMENTS * physprop() also returns boiling and melting points. Moreover, values are now numeric. -BUG FIXES +## BUG FIXES * alanwood() returns only results for first match in case of multiple links found * physprop() stopped working after change of SRC to https, fixed now. * changed etox_* functions to https -DEPRECATED FUNCTIONS +## DEPRECATED FUNCTIONS * ppdb() replaces ppdb_query() and accepts individual index as created by ppdb_buildidx(). * cir() replaces cir_query(). @@ -285,43 +283,43 @@ DEPRECATED FUNCTIONS * cs_extcompinfo() replaces csid_extcompinfo() -DEFUNCT FUNCTIONS +## DEFUNCT FUNCTIONS * allanwood() -webchem 0.0.3 -====================== +# webchem 0.0.3 -NEW FEATURES + +## NEW FEATURES * Query SRC PHYSPROP Database with physprop(). * Query the ETOX ID with get_etoxid(); query basic information with etox_basic(); quality targets with etox_targets() and test results with etox_tests(). 
* Query PPDB with ppdb_query() -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * added exceptions/checks to tests * improved robustness of cir_query() -BUG FIXES +## BUG FIXES * Correct the spelling of Alan Wood and rename function allanwood() to alanwood() [contribution of @jranke] -webchem 0.0.2 -====================== +# webchem 0.0.2 + -NEW FEATURES +## NEW FEATURES * Query the PAN Pesticides Database with pan(). * Query Allan Woods Compendium of Pesticide Common Names with allanwood(). -MINOR IMPROVEMENTS +## MINOR IMPROVEMENTS * Added checks for user input. * Fixed documentation, added example for bulk processing. @@ -330,7 +328,7 @@ MINOR IMPROVEMENTS * Added unit tests. * All functions return silently NA, if API is not reachable. -BUG FIXES +## BUG FIXES * cts_convert() does not ignore 'first' argument. * get_csid() did not return NA, if there was a problem with the API. diff --git a/R/alanwood.R b/R/alanwood.R index 09d7a5ca..9a0c8b7e 100644 --- a/R/alanwood.R +++ b/R/alanwood.R @@ -34,6 +34,7 @@ aw_query <- function(query, type = c("commonname", "cas"), verbose = TRUE, force_build = FALSE) { aw_idx <- build_aw_idx(verbose, force_build) foo <- function(query, type = c("commonname", "cas"), verbose) { + on.exit(suppressWarnings(closeAllConnections())) type <- match.arg(type) # search links in indexes if (type == "commonname") { @@ -147,6 +148,7 @@ aw_query <- function(query, type = c("commonname", "cas"), verbose = TRUE, #' @source \url{http://www.alanwood.net/pesticides} #' @export build_aw_idx <- function(verbose = TRUE, force_build = FALSE) { + on.exit(suppressWarnings(closeAllConnections())) message(msg = "build_aw_idx() will not be exported in future releases.") suppressWarnings(try(load(paste0(tempdir(), "/data/aw_idx.rda")), silent = TRUE)) diff --git a/R/chemid.R b/R/chemid.R index 3f8fbeef..42b2f40e 100644 --- a/R/chemid.R +++ b/R/chemid.R @@ -53,6 +53,7 @@ ci_query <- function(query, type = c('name', 'rn', 'inchikey'), type <- match.arg(type) match <- 
match.arg(match) foo <- function(query, type, match, verbose){ + on.exit(suppressWarnings(closeAllConnections())) if (is.na(query)) { message('query is NA! Returning NA.\n') return(NA) @@ -219,4 +220,4 @@ ci_query <- function(query, type = c('name', 'rn', 'inchikey'), out <- setNames(out, query) class(out) <- c('ci_query', 'list') return(out) -} \ No newline at end of file +} diff --git a/R/cir.R b/R/cir.R index a26afca6..6254128e 100644 --- a/R/cir.R +++ b/R/cir.R @@ -114,6 +114,10 @@ #' @export cir_query <- function(identifier, representation = 'smiles', resolver = NULL, first = FALSE, choices = NULL, verbose = TRUE, ...){ + if (first == TRUE) { + message("`first` is deprecated. Using `choices = 1` instead.") + choices = 1 + } foo <- function(identifier, representation, resolver, first, verbose) { if (is.na(identifier)) { return(NA) @@ -163,3 +167,244 @@ cir_query <- function(identifier, representation = 'smiles', resolver = NULL, out <- unlist(out) return(out) } + +#' Query Chemical Identifier Resolver Images +#' +#' An interface to the Chemical Identifier Resolver (CIR) +#' (\url{http://cactus.nci.nih.gov/chemical/structure_documentation}). +#' +#' @param query character; Search term. Can be any common chemical identifier +#' (e.g. CAS, INCHI(KEY), SMILES etc.) +#' @param dir character; Directory to save the image. +#' Must be supplied; the function stops if \code{dir} is \code{NULL}. +#' @param format character; Output format of the image. Can be one of "png", +#' "gif". +#' @param width integer; Width of the image. +#' @param height integer; Height of the image. +#' @param linewidth integer; Width of lines. +#' @param symbolfontsize integer; Fontsize of atoms in the image. +#' @param bgcolor character; E.g. transparent, white, \%23AADDEE +#' @param antialiasing logical; Should antialiasing be used? +#' @param atomcolor character; Color of the atoms in the image. +#' @param bondcolor character; Color of the atom bond lines.
+#' @param csymbol character; Can be one of "special" (default - i.e. only +#' hydrogen atoms in functional groups or defining stereochemistry) or "all". +#' @param hsymbol character; Can be one of "special" (default - i.e. none are +#' shown) or "all" (all are printed). +#' @param hcolor character; Color of the hydrogen atoms. +#' @param header character; Should a header text be added to the image? Can be +#' any string. +#' @param footer character; Should a footer text be added to the image? Can be +#' any string. +#' @param verbose logical; Should a verbose output be printed on the console? +#' @param frame integer; Should a frame be plotted? Can be one of NULL (default) +#' or 1. +#' @param ... currently not used. +#' +#' @return data.frame (query, path, url); images are written to \code{dir} +#' @details +#' The \code{query} can be any of the following identifiers: Chemical Names, +#' IUPAC names, +#' CAS Numbers, SMILES strings, IUPAC InChI/InChIKeys, NCI/CADD Identifiers, +#' CACTVS HASHISY, NSC number, PubChem SID, ZINC Code, ChemSpider ID, +#' ChemNavigator SID, eMolecule VID. +#' +#' For an image with transparent background use "transparent" as color name and +#' switch off antialiasing (i.e. antialiasing = 0). +#' +# followed this blog post +# https://cactus.nci.nih.gov/blog/?p=136 +#' +#' @note Requests are rate limited: a hard-coded 1.5 s pause precedes each query. +#' +#' @references +#' \code{cir} relies on the great CIR web service created by the CADD +#' Group at NCI/NIH!
\cr +#' \url{http://cactus.nci.nih.gov/chemical/structure_documentation}, \cr +#' \url{http://cactus.nci.nih.gov/blog/?cat=10}, \cr +#' \url{http://cactus.nci.nih.gov/blog/?p=1386}, \cr +#' \url{http://cactus.nci.nih.gov/blog/?p=1456}, \cr +#' +#' @author Andreas Scharmueller, \email{andschar@@protonmail.com} +#' +#' @examples +#' \donttest{ +#' # might fail if API is not available +#' cir_img("CCO", dir = tempdir()) # SMILES +#' +#' # multiple query strings and different formats +#' query = c("Glyphosate", "Isoproturon", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N") +#' cir_img(query, dir = tempdir(), bgcolor = "transparent", antialiasing = 0) +#' +#' # all parameters +#' query = "Triclosan" +#' cir_img(query, +#' dir = tempdir(), +#' format = "gif", +#' width = 600, +#' height = 600, +#' linewidth = 5, +#' symbolfontsize = 30, +#' bgcolor = "red", +#' antialiasing = FALSE, +#' atomcolor = "green", +#' bondcolor = "yellow", +#' csymbol = "all", +#' hsymbol = "all", +#' hcolor = "purple", +#' header = "My funky chemical structure..", +#' footer = "..is just so awesome!", +#' frame = 1) +#'} +#' @export +#' +cir_img <- function(query, + dir = NULL, + format = c("png", "gif"), + width = 500, + height = 500, + linewidth = 2, + symbolfontsize = 16, + bgcolor = NULL, + antialiasing = TRUE, + atomcolor = NULL, + bondcolor = NULL, + csymbol = c("special", "all"), + hsymbol = c("special", "all"), + hcolor = NULL, + header = NULL, + footer = NULL, + frame = NULL, + verbose = TRUE, + ...) { + # check + if (is.null(dir)) + stop('Please provide a directory (dir =) to save the images.') + format <- match.arg(format) + csymbol <- match.arg(csymbol, c("special", "all")) + hsymbol <- match.arg(hsymbol, c("special", "all")) + foo <- function(query, + dir, + format, + width, + height, + linewidth, + symbolfontsize, + bgcolor, + antialiasing, + atomcolor, + bondcolor, + csymbol, + hsymbol, + hcolor, + header, + footer, + frame, + verbose, + ...)
{ + # prolog + baseurl <- "https://cactus.nci.nih.gov/chemical/structure" + qurl <- paste(baseurl, query, "image", sep = "/") + # options + if (!is.null(format)) + format <- paste0("format=", format) + if (!is.null(width)) + width <- paste0("width=", width) + if (!is.null(height)) + height <- paste0("height=", height) + if (!is.null(linewidth)) + linewidth <- paste0("linewidth=", linewidth) + if (!is.null(symbolfontsize)) + symbolfontsize <- paste0("symbolfontsize=", symbolfontsize) + if (!is.null(bgcolor)) + bgcolor <- paste0("bgcolor=", bgcolor) + if (!is.null(antialiasing)) + antialiasing <- paste0("antialiasing=", as.numeric(antialiasing)) + if (!is.null(atomcolor)) + atomcolor <- paste0("atomcolor=", atomcolor) + if (!is.null(bondcolor)) + bondcolor <- paste0("bondcolor=", bondcolor) + if (!is.null(csymbol)) + csymbol <- paste0("csymbol=", csymbol) + if (!is.null(hsymbol)) + hsymbol <- paste0("hsymbol=", hsymbol) + if (!is.null(hcolor)) + hcolor <- paste0("hcolor=", hcolor) + if (!is.null(header)) + header <- paste0("header=\"", header, "\"") + if (!is.null(footer)) + footer <- paste0("footer=\"", footer, "\"") + if (!is.null(frame)) + frame <- paste0("frame=", frame) + opts <- c(format, + width, + height, + linewidth, + symbolfontsize, + bgcolor, + antialiasing, + atomcolor, + bondcolor, + csymbol, + hsymbol, + hcolor, + header, + footer, + frame) + opts <- paste0(opts, collapse = "&") + opts <- paste0("?", opts) + # url + qurl <- URLencode(paste0(qurl, opts)) + # query + if (verbose) + message("Querying: ", query, "\n", qurl) + Sys.sleep(1.5) + path <- file.path(dir, paste0(query, ".", sub('format=', '', format))) + message("Image saved under: ", path) + # return image + h <- try( + GET(qurl, + timeout(5), + write_disk(path, overwrite = TRUE)) + ) + if (inherits(h, "try-error")) { + warning("Problem with web service encountered... 
Returning NA.") + return(data.frame(query = query, stringsAsFactors = FALSE)) + } else { + # return paths data.frame + data.frame(query = query, + path = path, + url = qurl, + stringsAsFactors = FALSE) + } + } + out <- lapply(query, + foo, + dir = dir, + format = format, + width = width, + height = height, + linewidth = linewidth, + symbolfontsize = symbolfontsize, + bgcolor = bgcolor, + antialiasing = antialiasing, + atomcolor = atomcolor, + bondcolor = bondcolor, + csymbol = csymbol, + hsymbol = hsymbol, + hcolor = hcolor, + header = header, + footer = footer, + frame = frame, + verbose = verbose) + dplyr::bind_rows(out) +} + + + + + + + + + diff --git a/R/cts.R b/R/cts.R index de181b2d..be7621ec 100644 --- a/R/cts.R +++ b/R/cts.R @@ -69,7 +69,7 @@ cts_compinfo <- function(inchikey, verbose = TRUE){ #' @param choices to return only the first result, use 'choices = 1'. To choose a result from an interative menu, provide a number of choices to choose from or "all". #' @param verbose logical; should a verbose output be printed on the console? #' @param ... currently not used. -#' @return a list of characters. If first = TRUE a vector. +#' @return a list of character vectors or if \code{choices} is used, then a single named vector. #' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com} #' @details See also \url{http://cts.fiehnlab.ucdavis.edu/} #' for possible values of from and to. 
@@ -83,19 +83,21 @@ cts_compinfo <- function(inchikey, verbose = TRUE){ #' @examples #' \donttest{ #' # might fail if API is not available -#' cts_convert('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'inchikey', 'Chemical Name') +#' cts_convert("triclosan", "Chemical Name", "inchikey") #' #' ### multiple inputs -#' comp <- c('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'BSYNRYMUTXBXSQ-UHFFFAOYSA-N') -#' cts_convert(comp, 'inchikey', 'Chemical Name') +#' comp <- c("triclosan", "hexane") +#' cts_convert(comp, "Chemical Name", "cas") #' } cts_convert <- function(query, from, to, first = FALSE, choices = NULL, verbose = TRUE, ...){ if(!missing("first")) stop('"first" is deprecated. Use "choices = 1" instead.') if (length(from) > 1 | length(to) > 1) { - stop('Cannot handle multiple input strings.') + stop('Cannot handle multiple input or output types. Please provide only one argument for `from` and `to`.') } + foo <- function(query, from, to , first, verbose){ + if (is.na(query)) return(NA) baseurl <- "http://cts.fiehnlab.ucdavis.edu/service/convert" qurl <- paste0(baseurl, '/', from, '/', to, '/', query) qurl <- URLencode(qurl) diff --git a/R/etox.R b/R/etox.R index e728eaea..d9b1d91d 100644 --- a/R/etox.R +++ b/R/etox.R @@ -6,6 +6,7 @@ #' @import xml2 httr #' @importFrom stats rgamma #' @importFrom dplyr bind_rows +#' @importFrom tibble tibble #' @param query character; The searchterm #' @param from character; Type of input, can be one of "name" (chemical name), #' "cas" (CAS Number), "ec" (European Community number for regulatory purposes), @@ -16,8 +17,7 @@ #' name) ID, "ask" is a interactive mode and the user is asked for input, "na" #' returns \code{NA} if multiple hits are found. #' @param verbose logical; print message during processing to console? 
-#' @return a dataframe with 4 columns: etoxID, matched substance, string -#' distance to match and the queried string +#' @return a tibble with 3 columns: the query, the match, and the etoxID #' @note Before using this function, please read the disclaimer #' \url{https://webetox.uba.de/webETOX/disclaimer.do}. #' @seealso \code{\link{etox_basic}} for basic information, @@ -56,6 +56,12 @@ get_etoxid <- function(query, from <- match.arg(from) match <- match.arg(match) foo <- function(query, from, match, verbose) { + on.exit(suppressWarnings(closeAllConnections())) + + if (is.na(query)) { + empty <- list(query = NA, match = NA, etoxid = NA) + return(empty) + } if (verbose) message("Searching ", query) baseurl <- "https://webetox.uba.de/webETOX/public/search/stoff.do" @@ -188,6 +194,7 @@ etox_basic <- function(id, verbose = TRUE) { } # id <- c("20179", "9051") foo <- function(id, verbose) { + on.exit(suppressWarnings(closeAllConnections())) if (is.na(id)) { message('ID is NA! Returning NA.\n') return(NA) @@ -295,6 +302,7 @@ etox_targets <- function(id, verbose = TRUE) { stop("id must be a vector!") } foo <- function(id, verbose) { + on.exit(suppressWarnings(closeAllConnections())) if (is.na(id)) { message('ID is NA! Returning NA.\n') return(NA) @@ -382,6 +390,7 @@ etox_tests <- function(id, verbose = TRUE) { stop("id must be a vector!") } foo <- function(id, verbose){ + on.exit(suppressWarnings(closeAllConnections())) if (is.na(id)) { message('ID is NA! Returning NA.\n') return(NA) diff --git a/R/extractors.R b/R/extractors.R index 0763a674..00dccb16 100644 --- a/R/extractors.R +++ b/R/extractors.R @@ -12,11 +12,7 @@ cas <- function(x, ...){ # CAS --------------------------------------------------------------------- #' @export cas.default <- function(x, ...) { - sapply(x, function(y) { - if (length(y) == 1 && is.na(y)) - return(NA) - y$cas - }) + stop(paste("No cas method for class", class(x))) } #' @export cas.chebi_comp_entity <- function(x, ...) 
{ @@ -26,23 +22,42 @@ cas.chebi_comp_entity <- function(x, ...) { }) } +#' @export +cas.opsin_query <- function(x, ...) { + stop("CAS is not returned by this datasource!") +} + #' @export cas.pan_query <- function(x, ...) { sapply(x, function(y) y$`CAS Number`) } + +#' @export +cas.aw_query <- function(x, ...) { + sapply(x, function(y) y$cas) +} + #' @export cas.wd_ident <- function(x, ...) { x$cas } + +#' @export +cas.cts_compinfo <- function(x, ...) { + stop("CAS is not returned by this data source") +} + +#' @export cas.etox_basic <- function(x, ...) { sapply(x, function(y) { if (length(y) == 1 && is.na(y)) return(NA) - unique(y[[1]]$cas) + unique(y$cas) }) } + # InChIKey ---------------------------------------------------------------- #' @rdname extractors #' @export @@ -52,7 +67,7 @@ inchikey <- function(x, ...){ #' @export inchikey.default <- function(x, ...) { - sapply(x, function(y) y$inchikey) + stop(paste("No inchikey method for class", class(x))) } #' @export @@ -71,14 +86,7 @@ inchikey.chebi_comp_entity <- function(x, ...) { }) } -#' @export -inchikey.cs_compinfo <- function(x, ...) { - x$inchikey -} -#' @export -inchikey.cs_extcompinfo <- function(x, ...) { - x$inchikey -} + #' @export inchikey.etox_basic <- function(x, ...) { stop("InChIkey is not returned by this datasource!") @@ -104,6 +112,11 @@ inchikey.wd_ident <- function(x, ...) { x$inchikey } +#' @export +inchikey.cts_compinfo <- function(x, ...) { + sapply(x, function(x) x$inchikey) +} + # SMILES ------------------------------------------------------------------ #' @rdname extractors #' @export @@ -113,7 +126,7 @@ smiles <- function(x, ...){ #' @export smiles.default <- function(x, ...) { - sapply(x, function(y) y$smiles) + stop(paste("no smiles method for class", class(x))) } #' @export smiles.chebi_comp_entity <- function(x, ...) { @@ -123,14 +136,7 @@ smiles.chebi_comp_entity <- function(x, ...) { }) } -#' @export -smiles.cs_compinfo <- function(x, ...) 
{ - x$smiles -} -#' @export -smiles.cs_extcompinfo <- function(x, ...) { - x$smiles -} + + #' @export smiles.cts_compinfo <- function(x, ...) { stop("SMILES is not returned by this datasource!") diff --git a/R/flavornet.R b/R/flavornet.R index 62192e0b..24f037c2 100644 --- a/R/flavornet.R +++ b/R/flavornet.R @@ -15,7 +15,7 @@ #' @author Eric Scott, \email{eric.scott@@tufts.edu} #' #' @examples -#' \donttest{ +#' \dontrun{ #' # might fail if website is not available #' fn_percept("123-32-0") #' @@ -27,7 +27,8 @@ fn_percept <- function(CAS, verbose = TRUE, ...) { foo <- function (CAS, verbose){ - qurl = paste0("http://www.flavornet.org/info/",CAS,".html") + on.exit(suppressWarnings(closeAllConnections())) + qurl <- paste0("http://www.flavornet.org/info/",CAS,".html") if (verbose) message(qurl) Sys.sleep(rgamma(1, shape = 10, scale = 1/10)) diff --git a/R/nist.R b/R/nist.R index 02a03af0..e58054a8 100644 --- a/R/nist.R +++ b/R/nist.R @@ -16,6 +16,7 @@ get_ri_xml <- type, polarity, temp_prog) { + on.exit(suppressWarnings(closeAllConnections())) from_str <- (switch( from, diff --git a/R/opsin.R b/R/opsin.R index bcc705f0..409d9514 100644 --- a/R/opsin.R +++ b/R/opsin.R @@ -4,11 +4,14 @@ #' \url{http://opsin.ch.cam.ac.uk/instructions.html}. #' #' @import jsonlite httr xml2 +#' @import tibble +#' @importFrom dplyr select everything +#' @importFrom purrr map_dfr #' @importFrom utils URLencode URLdecode #' @param query character; chemical name that should be queryed. #' @param verbose logical; should a verbose output be printed on the console? #' @param ... currently not used. -#' @return a data.frame with five columnns: "inchi", "stdinchi", "stdinchikey", "smiles", "message" +#' @return a tibble with seven columns: "query", "inchi", "stdinchi", "stdinchikey", "smiles", "message", and "status" #' #' @references Lowe, D. M., Corbett, P. T., Murray-Rust, P., & Glen, R. C. (2011). #' Chemical Name to Structure: OPSIN, an Open Source Solution.
Journal of Chemical Information and Modeling, @@ -23,33 +26,42 @@ opsin_query <- function(query, verbose = TRUE, ...){ # query <- 'cyclopropane' + foo <- function(query, verbose){ - query <- URLencode(query) + on.exit(suppressWarnings(closeAllConnections())) + + empty <- c(query, rep(NA, 6)) + names(empty) <- c("query", "inchi", "stdinchi", "stdinchikey", "smiles", "message", "status") + empty <- as_tibble(t(empty)) + if (is.na(query)) { + return(empty) + } + query_u <- URLencode(query) baseurl <- "http://opsin.ch.cam.ac.uk/opsin/" out <- 'json' - qurl <- paste0(baseurl, query, '.', out) + qurl <- paste0(baseurl, query_u, '.', out) if (verbose) - message('Querying ', URLdecode(query)) + message('Querying ', URLdecode(query_u)) Sys.sleep( rgamma(1, shape = 5, scale = 1/10)) h <- try(GET(qurl), silent = TRUE) if (inherits(h, "try-error")) { warning('Problem with web service encountered... Returning NA.') - return(rep(NA, 5)) + return(empty) } cont <- content(h, as = 'text') if (substr(cont, 1, 14) == ' + + + + triclosan + ALL + 200 + ALL + + + ' + + Sys.sleep(rgamma(1, shape = 5, scale = 1/10)) + res <- try(POST(baseurl, + add_headers(headers), + body = body, + user_agent("webchem (https://github.com/ropensci/webchem)"))) + if (inherits(res, "try-error")) + return(FALSE) + res$status_code == 200 +} + # pubchem ----------------------------------------------------------------- #' @import httr -#' @rdname ping +#' @noRd #' @return TRUE if pubchem is reachable -#' @export #' @examples #' \dontrun{ #' # might fail if API is not available @@ -31,9 +181,8 @@ ping_pubchem <- function(...) { # pubchem PUG-VIEW----------------------------------------------------------------- #' @import httr -#' @rdname ping +#' @noRd #' @return TRUE if pubchem PUG-VIEW is reachable -#' @export #' @examples #' \dontrun{ #' # might fail if API is not available @@ -46,41 +195,3 @@ ping_pubchem_pw <- function(...) 
{ user_agent("webchem (https://github.com/ropensci/webchem)")) res$status_code == 200 } - - - -# ChemSpider webpage ----------------------------------------------------------- -#' @import httr -#' @rdname ping -#' @return TRUE if ChemSpider is reachable -#' @export -#' @examples -#' \dontrun{ -#' # might fail if API is not available -#' ping_cs() -#' } -ping_cs <- function(...) { - res <- GET('https://www.chemspider.com/Chemical-Structure.5363.html', ...) - stopifnot(is(res, "response")) - res$status_code == 200 -} - - - - -# PAN --------------------------------------------------------------------- -#' @import httr -#' @rdname ping -#' @return TRUE if PAN is reachable -#' @export -#' @examples -#' \dontrun{ -#' # might fail if API is not available -#' ping_pan() -#' } -ping_pan <- function(...) { - res <- try(GET('http://www.pesticideinfo.org/List_Chemicals.jsp?', timeout(1))) - if (inherits(res, 'try-error')) - return(FALSE) - res$status_code == 200 -} diff --git a/R/pubchem.R b/R/pubchem.R index d23682ae..2f22b7a9 100644 --- a/R/pubchem.R +++ b/R/pubchem.R @@ -1,19 +1,54 @@ -#' Retrieve Pubchem Id (CID) +#' Retrieve Pubchem Compound ID (CID) #' -#' Return CompoundID (CID) for a search query using PUG-REST, -#' see \url{https://pubchem.ncbi.nlm.nih.gov/}. -#' @param query character; search term. -#' @param from character; type of input, can be one of "name" (default), "cid", -#' "sid", "aid", "smiles", "inchi", "inchikey" -#' @param match character; How should multiple hits be handled?, "all" all matches are returned, "best" the best matching is returned, "ask" enters an interactive mode and the user is asked for input, "na" returns NA if multiple hits are found. -#' @param search_substances logical; If TRUE also searches PubChem SIDs +#' Retrieve compound IDs (CIDs) from PubChem. +#' @param query character; search term, one or more compounds. +#' @param from character; type of input. See details for more information. 
+#' @param domain character; query domain, can be one of \code{"compound"}, +#' \code{"substance"}, \code{"assay"}. +#' @param match character; How should multiple hits be handled?, \code{"all"} +#' all matches are returned, \code{"first"} the first match is returned, +#' \code{"ask"} enters an interactive mode and the user is asked for input, +#' \code{"na"} returns NA if multiple hits are found. #' @param verbose logical; should a verbose output be printed on the console? #' @param arg character; optinal arguments like "name_type=word" to match #' individual words. #' @param first deprecated. Use `match` instead. #' @param ... currently unused. #' @return a tibble. -#' +#' @details Valid values for the \code{from} argument depend on the +#' \code{domain}: +#' \itemize{ +#' \item{\code{compound}: \code{"name"}, \code{"smiles"}, \code{"inchi"}, +#' \code{"inchikey"}, \code{"formula"}, \code{"sdf"}, <xref>, +#' <structure search>, <fast search>.} +#' \item{\code{substance}: \code{"name"}, \code{"sid"}, +#' \code{<xref>}, \code{"sourceid/<source id>"} or \code{"sourceall"}.} +#' \item{\code{assay}: \code{"aid"}, \code{<assay target>}.} +#' } +#' @details <structure search> is assembled as "{\code{substructure} | +#' \code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} +#' | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +#' \code{from = "substructure/smiles"}. +#' @details \code{<xref>} is assembled as "\code{xref}/\{\code{RegistryID} | +#' \code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI} | +#' \code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +#' \code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query +#' by CAS RN. +#' @details <fast search> is either \code{fastformula} or it is assembled as +#' "{\code{fastidentity} | \code{fastsimilarity_2d} | \code{fastsimilarity_3d} | +#' \code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | +#' \code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +#' \code{from = "fastidentity/smiles"}.
+#' @details \code{<source id>} is any valid PubChem Data Source ID. When +#' \code{from = "sourceid/<source id>"}, the query is the ID of the substance in +#' the depositor's database. +#' @details If \code{from = "sourceall"} the query is one or more valid PubChem +#' depositor names. Depositor names are not case sensitive. +#' @details Depositor names and Data Source IDs can be found at +#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. +#' @details \code{<assay target>} is assembled as "\code{target}/\{\code{gi} | +#' \code{proteinname} | \code{geneid} | \code{genesymbol} | \code{accession}\}", +#' e.g. \code{from = "target/geneid"} will query by GeneID. #' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public #' Information System for #' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37: @@ -34,6 +69,7 @@ #' usage policies of the indicidual data sources #' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}. #' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com} +#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com} #' @import httr #' @importFrom purrr map map2 #' @importFrom jsonlite fromJSON @@ -45,102 +81,164 @@ #' # might fail if API is not available #' get_cid("Triclosan") #' get_cid("Triclosan", arg = "name_type=word") -#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +#' # from SMILES #' get_cid("CCCC", from = "smiles") +#' # from InChI +#' get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi") +#' # from InChIKey +#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +#' # from formula +#' get_cid("C26H52NO6P", from = "formula") +#' # from CAS RN +#' get_cid("56-40-6", from = "xref/rn") +#' # similarity +#' get_cid(5564, from = "similarity/cid") +#' get_cid("CCO", from = "similarity/smiles") +#' # from SID +#' get_cid("126534046", from = "sid", domain = "substance") +#' # sourceid +#' get_cid("VCC957895", from = "sourceid/23706", domain = "substance") +#' # sourceall +#' get_cid("Optopharma Ltd", from = "sourceall",
domain = "substance") +#' # from AID (CIDs of substances tested in the assay) +#' get_cid(170004, from = "aid", domain = "assay") +#' # from GeneID (CIDs of substances tested on the gene) +#' get_cid(25086, from = "target/geneid", domain = "assay") #' #' # multiple inputs -#' comp <- c("Triclosan", "Aspirin") -#' get_cid(comp) +#' get_cid(c("Triclosan", "Aspirin")) #' #' } get_cid <- function(query, - from = c("name", "cid", "sid", "aid", "smiles", "inchi", "inchikey"), + from = "name", + domain = c("compound", "substance", "assay"), match = c("all", "first", "ask", "na"), verbose = TRUE, - search_substances = FALSE, arg = NULL, first = NULL, ...) { - - # from can be cid | name | smiles | inchi | sdf | inchikey | formula - # query <- c("Aspirin") - # from = "name" - #deprecate `first` if (!is.null(first) && first == TRUE) { message("`first = TRUE` is deprecated. Use `match = 'first'` instead") match <- "first" - } else if (!is.null(first) && first==FALSE) { + } else if (!is.null(first) && first == FALSE) { message("`first = FALSE` is deprecated. 
Use `match = 'all'` instead") match <- "all" } - - from <- match.arg(from) + #input validation + from <- tolower(from) + domain <- match.arg(domain) + xref <- paste( + "xref", + c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi", + "taxonomyid", "mimid", "geneid", "probeid", "patentid"), + sep = "/" + ) + structure_search <- expand.grid( + c("substructure", "superstructure", "similarity", "identity"), + c("smiles", "inchi", "sdf", "cid") + ) + structure_search <- paste(structure_search$Var1, structure_search$Var2, + sep = "/") + fast_search <- expand.grid( + c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d", + "fastsubstructure", "fastsuperstructure"), + c("smiles", "smarts", "inchi", "sdf", "cid") + ) + fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")), + "fastformula") + targets <- paste("target", c("gi", "proteinname", "geneid", "genesymbol", + "accession"), sep = "/") + if (domain == "compound") { + from_choices <- c("cid", "name", "smiles", "inchi", "sdf", "inchikey", + "formula", structure_search, xref, fast_search) + from <- match.arg(from, choices = from_choices) + } + if (domain == "substance") { + if (grepl("^sourceid/", from) == FALSE) { + from <- match.arg(from, choices = c("sid", "name", xref, "sourceall")) + } + } + if (domain == "assay") { + from <- match.arg(from, choices = c("aid", targets)) + } match <- match.arg(match) - - foo <- function(query, from, match, scope = "compound", - verbose, arg, ...) 
{ - if (is.na(query)) - return(NA) - prolog <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug" - input <- paste0("/", scope, "/", from) - output <- "/cids/JSON" - if (!is.null(arg)) - arg <- paste0("?", arg) - qurl <- paste0(prolog, input, output, arg) - if (verbose) - message(qurl) - Sys.sleep(rgamma(1, shape = 15, scale = 1/10)) - cont <- try( - content( - POST(qurl, - body = paste0(from, "=", query)), - type = "text", encoding = "UTF-8"), - silent = TRUE - ) - if (inherits(cont, "try-error")) { - warning("Problem with web service encountered... Returning NA.") + foo <- function(query, from, domain, match, verbose, arg, ...) { + if (is.na(query)) { + if (verbose) message(paste0(query, " is invalid. Returning NA.")) return(NA) } - cont <- jsonlite::fromJSON(cont) - if (names(cont) == "Fault") { - warning(cont$Fault$Details, ". Returning NA.") - return(NA) + if (verbose) { + message(paste0("Querying ", query, ". "), appendLF = FALSE) + } + if (is.character(query)) query <- URLencode(query) + if (from %in% structure_search) { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, query, "json", sep = "/") + } + else { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, query, "cids", "json", sep = "/") + } + if (!is.null(arg)) qurl <- paste0(qurl, "?", arg) + Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10)) + if (from == "inchi") { + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug", + domain, from, "cids", "json", sep = "/") + res <- httr::POST(qurl, body = paste0("inchi=", query), + user_agent("webchem")) + } + else { + res <- httr::POST(qurl, user_agent("webchem")) + } + if (res$status_code != 200) { + if (res$status_code == 202) { + cont <- httr::content(res, type = "text", encoding = "UTF-8") + listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey + qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/", domain, + "listkey", listkey, "cids", "json", sep = "/") + while (res$status_code == 202) { + Sys.sleep(5 
+ rgamma(1, shape = 15, scale = 1 / 10)) + res <- httr::POST(qurl, user_agent("webchem")) + } + if (res$status_code != 200) { + if (verbose) message(httr::message_for_status(res)) + return(NA) + } + } + else{ + if (verbose) message(httr::message_for_status(res)) + return(NA) + } + } + if (verbose) message(httr::message_for_status(res)) + cont <- httr::content(res, type = "text", encoding = "UTF-8") + if (domain == "compound") { + cont <- jsonlite::fromJSON(cont)$IdentifierList$CID } - if (scope == "substance") { - cont <- cont$InformationList$Information$CID + if (domain == "substance") { + cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID + } + if (domain == "assay") { + cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID } out <- unique(unlist(cont)) - out <- matcher(x = out, match = match, verbose = verbose) + out <- matcher(x = out, query = query, match = match, verbose = verbose) out <- as.character(out) names(out) <- NULL return(out) } - - out <- map(query, - ~foo(query = .x, from = from, match = match, + out <- map(query, + ~foo(query = .x, from = from, domain = domain, match = match, verbose = verbose, arg = arg)) - out <- setNames(out, query) - - if (search_substances) { - out2 <- map(query, - ~foo(query = .x, from = from, match = match, scope = "substance", - verbose = verbose, arg = arg)) - out2 <- setNames(out2, query) - - out <- map2(out, out2, c) - out <- map(out, unique) - } - - out <- + out <- setNames(out, query) + out <- lapply(out, enframe, name = NULL, value = "cid") %>% bind_rows(.id = "query") - return(out) + return(out) } - - #' Retrieve compound properties from a pubchem CID #' #' Retrieve compound information from pubchem CID, see @@ -277,7 +375,7 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) { #' @param arg character; optinal arguments like "name_type=word" to match #' individual words. #' @param ... optional arguments -#' @return a character vector. 
+#' @return a list of character vectors (one per query). If \code{choices} is used, a single named vector is returned instead. #' #' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public #' Information System for @@ -315,6 +413,7 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE, if (!missing("interactive")) stop("'interactive' is deprecated. Use 'choices' instead.") foo <- function(query, from, verbose, ...) { + if (is.na(query)) return(NA) prolog <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug" input <- paste0("/compound/", from) output <- "/synonyms/JSON" @@ -466,9 +565,7 @@ pc_page <- function(id, Sys.sleep(0.3 + stats::rexp(1, rate = 10 / 0.3)) res <- httr::POST( qurl, - user_agent("webchem (https://github.com/ropensci/webchem)"), - handle = handle("") - ) + user_agent("webchem (https://github.com/ropensci/webchem)")) if (res$status_code < 300) { if (verbose == TRUE) message(httr::message_for_status(res)) cont <- httr::content(res, type = "text", encoding = "UTF-8") diff --git a/R/wikidata.R b/R/wikidata.R index c6fd51ff..02a441a6 100644 --- a/R/wikidata.R +++ b/R/wikidata.R @@ -114,6 +114,7 @@ get_wdid <- #' Retrieve Indentifiers from Wikidata #' #' @import jsonlite +#' @import httr #' @importFrom stats rgamma #' #' @param id character; identifier, as returned by \code{\link{get_wdid}} @@ -169,7 +170,8 @@ wd_ident <- function(id, verbose = TRUE){ Sys.sleep( rgamma(1, shape = 15, scale = 1/10)) if (verbose) message('Querying ', qurl) - tmp <- fromJSON(qurl) + res <- GET(qurl) + tmp <- fromJSON(content(res, as = "text")) vars_out <- tmp$head$vars out <- tmp$results$bindings diff --git a/README.Rmd b/README.Rmd index 40fae2a5..f77c7542 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,13 +21,12 @@ knitr::opts_chunk$set( [![Build Status](https://travis-ci.org/ropensci/webchem.png)](https://travis-ci.org/ropensci/webchem) -[![Build 
status](https://ci.appveyor.com/api/projects/status/e3sa6e918jlemv46/branch/master)](https://ci.appveyor.com/project/EDiLD/webchem) -[![Coverage Status](https://codecov.io/github/ropensci/webchem/coverage.svg?branch=tests)](https://codecov.io/gh/ropensci/webchem/branch/tests) +[![Build status](https://ci.appveyor.com/api/projects/status/8m7wpfnxn41kqjnl?svg=true)](https://ci.appveyor.com/project/ropensci/webchem) +[![Coverage Status](https://codecov.io/github/ropensci/webchem/coverage.svg?branch=master)](https://codecov.io/gh/ropensci/webchem/branch/master) [![Open Issues](https://img.shields.io/github/issues/ropensci/webchem.svg)](https://github.com/ropensci/webchem/issues) [![](https://cranlogs.r-pkg.org/badges/webchem)](https://cran.r-project.org/package=webchem) [![CRAN status](https://www.r-pkg.org/badges/version/webchem)](https://CRAN.R-project.org/package=webchem) [![DOI](https://zenodo.org/badge/17223/ropensci/webchem.svg)](https://zenodo.org/badge/latestdoi/17223/ropensci/webchem) - `webchem` is a R package to retrieve chemical information from the web. @@ -36,26 +35,30 @@ This package interacts with a suite of web APIs to retrieve chemical information The functions in the package that hit a specific API have a prefix and suffix separated by an underscore (`prefix_suffix()`). They follow the format of `source_functionality`, with the exception of functions that retrieve database identifiers which follow the format of `get_identifier`. e.g.`cs_compinfo` uses ChemSpider to retrieve compound informations and `get_csid()` retrieves ChemSpider IDs. +## Fill out the survey! + +Do you use chemical information databases in your work? Help us help you by filling out our short survey at https://forms.gle/V7dfGGn73dkesn5L6. + +The `webchem` survey allows us to learn which databases you use and how you interact with chemical data. This is extremely valuable information for us and guides our development efforts. The survey takes about 5 minutes to fill out. 
-## Currently implemented in `webchem` +## Chemical databases currently accessed by webchem -Source | Function(s) | API Docs | API key ------- | --------- | -------- | -------- -[Chemical Identifier Resolver (CIR)](http://cactus.nci.nih.gov/chemical/structure) | `cir_query()` | [link](http://cactus.nci.nih.gov/chemical/structure_documentation) | none -[ChemSpider](http://www.chemspider.com/) | `cs_datasources()`,`get_csid()`, `cs_element_csid()`, `cs_convert()`, `cs_compinfo()`, `cs_extcompinfo()`, `cs_prop()`| [link](https://developer.rsc.org/compounds-v1/apis) | required [(link)](https://developer.rsc.org/) -[PubChem](https://pubchem.ncbi.nlm.nih.gov/) | `get_cid()`, `pc_prop()`, `pc_synonyms()` | [link](https://pubchem.ncbi.nlm.nih.gov/) | none -[Chemical Translation Service (CTS)](http://cts.fiehnlab.ucdavis.edu/) | `cts_convert()`, `cts_compinfo()` | none | none -[PAN Pesticide Database](http://www.pesticideinfo.org/) | `pan_query()` | none | none -[Alan Wood's Compendium of Pesticide Common Names](http://www.alanwood.net/pesticides/) | `aw_query()` | none | none -[ETOX](http://webetox.uba.de/webETOX/index.do) | `get_etoxid()`, `etox_basic()`. `etox_targets()`, `etox_tests()` | none | none -[ChemIDplus](http://chem.sis.nlm.nih.gov/chemidplus/) | `ci_query()` | none | none -[Wikidata](https://www.wikidata.org/wiki/Wikidata:WikiProject_Chemistry) | `get_wdid()`, `wd_ident()` | [link](https://www.mediawiki.org/wiki/API:Main_page) | none -[OPSIN](http://opsin.ch.cam.ac.uk/instructions.html) | `opsin_query()` | [link](http://opsin.ch.cam.ac.uk/instructions.html) | none -[Flavornet](http://www.flavornet.org) | `fn_percept()` | none | none -[NIST](https://webbook.nist.gov) | `nist_ri()` | none | none -[ChEBI](https://www.ebi.ac.uk/chebi/) | `get_chebiid()`, `chebi_comp_entity()` | [link](https://www.ebi.ac.uk/chebi/webServices.do) | none +At least some of the data in the following sources is accessible through `webchem` functions.
To learn more about what is available, browse the documentation [here](https://docs.ropensci.org/webchem/reference/index.html). -Moreover, there are some functions to check indentifiers: `is.inchikey()`, `is.cas()` and `is.smiles()`. +- [Chemical Identifier Resolver (CIR)](http://cactus.nci.nih.gov/chemical/structure) +- [ChemSpider](http://www.chemspider.com/) (requires an [API token](https://developer.rsc.org/)) +- [PubChem](https://pubchem.ncbi.nlm.nih.gov/) +- [Chemical Translation Service (CTS)](http://cts.fiehnlab.ucdavis.edu/) +- [PAN Pesticide Database](http://www.pesticideinfo.org/) +- [Alan Wood's Compendium of Pesticide Common Names](http://www.alanwood.net/pesticides/) +- [ETOX](http://webetox.uba.de/webETOX/index.do) +- [ChemIDplus](http://chem.sis.nlm.nih.gov/chemidplus/) +- [Wikidata](https://www.wikidata.org/wiki/Wikidata:WikiProject_Chemistry) +- [OPSIN](http://opsin.ch.cam.ac.uk/instructions.html) +- [Flavornet](http://www.flavornet.org) +- [NIST](https://webbook.nist.gov) (currently gas chromatography retention indices only) +- [ChEBI](https://www.ebi.ac.uk/chebi/) +- [U.S. EPA Substance Registry Service (SRS)](https://cdxnodengn.epa.gov/cdx-srs-rest/) #### API keys @@ -71,6 +74,7 @@ install.packages("webchem") #### Install from Github (development version) + ```{r install_github, eval=FALSE} install.packages("devtools") library("devtools") @@ -78,325 +82,14 @@ install_github("ropensci/webchem") ``` -## Quickstart -```{r load} -library("webchem") -``` - -#### Chemical Identifier Resolver (CIR) - -CAS numbers and molecular weight for [Triclosan](http://en.wikipedia.org/wiki/Triclosan). -Use `choices = 1` to return only the first hit. -```{r cir_query1} -cir_query('Triclosan', 'cas') -cir_query('Triclosan', 'cas', choices = 1) -cir_query('Triclosan', 'mw') -``` - -Query SMILES and InChIKey from CAS (Triclosan). -Inputs might by ambiguous and we can specify where to search using `resolver=`.
-```{r cir_query2} -cir_query('3380-34-5', 'smiles') -cir_query('3380-34-5', 'stdinchikey', resolver = 'cas_number') -``` - -Query the number of rings using the InChiKey (Triclosan) -```{r cir_query3} -cir_query('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'ring_count') -``` - - -#### ChemSpider -Retrieve the ChemSpider ID of Triclosan - -```{r get_csid} -(id <- get_csid('Triclosan')) -``` - -Use this ID to query information from ChemSpider - - -```{r cs_extcompinfo} -# cs_compinfo(id, fields = c("Formula", "MolecularWeight")) -``` - -Note that the URL of the source if also returned (`source_url`) and can be used for (micro-)attribution. - -Or to convert to a Mol-Object - - -```{r cs_csid_mol} -# mol <- cs_convert(id, from = 'csid', to = 'mol') -# head(parse_mol(mol$ab)) -``` -Note that the Molfile is parsed into a R object (via `parse_mol()`) and that an API-key is needed - - -`cs_convert()` handles a lot of input / output formats: - -```{r cs_inchikey_csid} -cs_convert('XEFQLINVKFYRCS-UHFFFAOYAS', from = 'inchikey', to = 'csid') -cs_convert('XEFQLINVKFYRCS-UHFFFAOYAS', from = 'inchikey', to = 'inchi') -cs_convert('c1cc(c(cc1Cl)O)Oc2ccc(cc2Cl)Cl', from = 'smiles', to = 'inchi') -``` - -And get EPISuit predictions from ChemSpider - -```{r cs_prop, eval=FALSE} -cs_prop('5363')[['5363']]$epi[ , c(1:4)] -``` - -#### PubChem - -Retrieve PubChem CID -```{r get_cid} -get_cid(c('Triclosan', 'Aspirin')) -get_cid('3380-34-5') -``` - -Use this CID to retrieve some chemical properties: -```{r pc_prop} -pc_prop(c(5564,2244), properties = c('InChIKey', 'MolecularFormula', 'MolecularWeight')) -``` - -and synonyms - -```{r pc_synonyms} -pc_synonyms(5564, from = 'cid')[[1]][1:5] -pc_synonyms('Triclosan', from = 'name')[[1]][1:5] -``` - - - -#### Chemical Translation Service (CTS) - -CTS allows to convert from nearly every possible identifier to nearly every possible identifier: -```{r cts_convert} -cts_convert(query = '3380-34-5', from = 'CAS', to = 'ChemSpider') -(inchk <- cts_convert(query = 
'50-00-0', from = 'CAS', to = 'inchikey')) -``` - -Moreover, we can a lot of information stored in the CTS database using InChIkey -```{r cts_compinfo} -info <- cts_compinfo(inchikey = inchk[[1]]) -info[[1]][1:5] -``` - - -#### PAN Pesticide Database -`pan_query()` returns a list of 75 entries, here I extract only 4 of those: -```{r pan} -pan_list <- pan_query('lambda-Cyhalothrin', match = 'best') -pan_list[[1]][c("CAS Number", "Chemical Class", "Water Solubility (Avg, mg/L)", "Adsorption Coefficient (Koc)" )] -``` - - - -#### Alan Wood's Compendium of Pesticide Common Names - -`aw_query()` returns a list of 9 entries and can query common names and cas numbers: -```{r alanwood} -aw_query('Fluazinam', type = 'commonname') -aw_query('79622-59-6', type = 'cas')[[1]]$cname -``` - -#### ETOX -ETOX: Information System Ecotoxicology and Environmental Quality Targets is a database run by the Federal Environment Agency of Germany and provides data on synonyms, identifiers, Quality Targest and Effects. - -First we need to query a substance ID: - -```{r get_etoxid} -ids <- get_etoxid('Triclosan', match = 'best') -ids -``` -`get_etoxid` tries to find the best match for you (check the matched and distance attributes), if multiple hits are found. -Other options are `match = 'ask'` to enter a interactive mode, `'na'` to return `NA`, `'all'` to return all hits and `'first'` to return the first hit. - -```{r} -get_etoxid('Triclosan', match = 'all') -``` - - - -With this substance ID we can query further information from ETOX, e.g.: - -```{r etox_basic} -etox_basic(ids$etoxid)[[1]] -``` - -Which returns CAS, EC and GSBL numbers, as well as a synonym list. 
- -We can also retrieve Quality Targets: - -```{r etox_targets} -targets <- etox_targets(ids$etoxid)[[1]] -targets$res[ , c('Substance', 'Country_or_Region', 'Designation', 'Value_Target_LR', 'Unit')] -``` - -and results of ecotox tests: -```{r etox_tests} -tests <- etox_tests(ids$etoxid)[[1]] -tests$res[ , c('Organism', 'Effect', 'Duration', 'Time_Unit','Endpoint', 'Value', 'Unit')] -``` - - - - - - - - -#### ChemIDplus - -```{r chemid} -out <- ci_query(query = 'Triclosan', type = 'name', match = 'best') -out[['Triclosan']]$physprop -``` - - - -#### Wikidata -```{r wikidata} -ids <- get_wdid(query = 'Triclosan') -ids - -# query identifiers from Wikidata -wd_ident(ids$id)[1:5] -``` - - -#### OPSIN -```{r opsin} -opsin_query(c('Cyclopropane', 'Octane')) -``` - - - -#### Flavornet - -```{r flavornet} -fn_percept(CAS = c("75-07-0", "123-32-0")) -``` - -#### NIST - -Identification of gas chromatography peaks is often aided by retention idices. NIST provides tables of retention indices reported in the literature organized by retention index type (Kovats, linear, normal alkane, and Lee), column polarity, and temperature program. - - -```{r nist} -RIs <- - nist_ri( - query = "78-70-6", - from = "cas", - type = "kovats", - polarity = "non-polar", - temp_prog = "ramp" - ) -head(RIs) -``` - -#### ChEBI - -Chemical Entities of Biological Interest (ChEBI) is a freely available dictionary of molecular entities focused on 'small' chemical compounds. `get_chebiid()` returns a list of data.frames which matching query results. The data.frames contain the __chebiid__, the __chebiiasciiname__, a __searchscore__ and __entity stars__ (either 2 or 3, depending on whether the entity was checked thoroughly). - -```{r chebi-lite} -ids <- get_chebiid(c('Isoproturon', 'RZVAJINKPMORJF-UHFFFAOYSA-N'), verbose = FALSE) -ids -``` - -The __chebiid__ can then be used to query the complete ChEBI entity using `chebi_comp_entity()`. 
The complete entity contains several different data structures which are returned in a list. The data structures are explained in greater detail at the [ChEBI website](https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:6049). Here, the list elements are showcased: - -```{r chebi-comp} -# ecample entities -isoproturon <- chebi_comp_entity(ids$Isoproturon$chebiid[1]) -paracetamol <- chebi_comp_entity(ids$`RZVAJINKPMORJF-UHFFFAOYSA-N`$chebiid[1]) - -# properties: a data.frame with general properties -lapply(isoproturon, '[[', 'properties') -# chem_structure: a list of chemical structure formats (e.g. mol) -lapply(isoproturon, '[[', 'chem_structure') -# synonyms: a data.frame of synonyms (collected from different sources) -lapply(isoproturon, '[[', 'synonyms') -# iupacnames: a data.frame of IUPAC names (collected from different sources) -lapply(isoproturon, '[[', 'iupacnames') -# formulae: a data.frame of chemical formulae (collected from different sources) -lapply(isoproturon, '[[', 'formulae') -# regnumbers: a data.frame of registry numbers (e.g. CAS, Beilstein, Reaxys) (collected from different sources) -lapply(isoproturon, '[[', 'regnumbers') -# chebiid_snd: a data.frame with secondary ChEBI ids -lapply(paracetamol, '[[', 'chebiid_snd') -# citations: Publications which cite the entity along with hyperlinks to the PubMed entry via Europe PMC -head(lapply(paracetamol, '[[', 'citations')[[1]]) -# parents: parent ontologies of the entity -lapply(paracetamol, '[[', 'parents') -# children: child ontologies of the entity -lapply(paracetamol, '[[', 'children') -# dblinks: Links to other data bases -lapply(paracetamol, '[[', 'dblinks') -# comments: General comment(s) -lapply(paracetamol, '[[', 'comments') -# Metabolites of Species -head(lapply(paracetamol, '[[', 'origins')[[1]]) -``` - -#### Extractor functions - -The sources provide a lot of informations that can be retrieved using the functions described above. Often only specific inforamtion is needed. 
-Therefore, we added extractor functions for common identifiers. - -```{r extractors, error=TRUE} -wi <- wd_ident("Q408646") -wi -cas(wi) -inchikey(wi) -smiles(wi) - -# smiles(etox_basic(5564)) -``` - - - -#### Misc functions - -##### Check if a string is a valid CAS registry number - -```{r is.cas} -is.cas('64-17-5') -is.cas('64-17-6') -``` - - -##### Check if a string is a valid InChIKey - -Using a pure R implementation: -```{r is.inchikey} -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N') -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKXA-N') -``` - -Using the ChemSpider API -```{r is.inchikey_cs} -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N', type = 'chemspider') -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKXA-N', type = 'chemspider') -``` - -##### Check if a string is a valid SMILES - -```{r is.smiles, eval=FALSE} -is.smiles('Clc(c(Cl)c(Cl)c1C(=O)O)c(Cl)c1Cl') -# 'J' is not found in the periodic table -is.smiles('Clc(c(Cl)c(Cl)c1C(=O)O)c(Cl)c1ClJ') -``` - - - - ### Acknowledgements -Without the fantastic web services `webchem` wouldn't be here. -Therefore, kudos to the web service providers and developers! +Without the fantastic web services `webchem` wouldn't be here. Therefore, kudos to the web service providers and developers! Please remember to acknowledge these data resources in your work using `webchem`. ### Related Projects + +You can find some related packages in the [ChemPhys CRAN Task View](https://cran.r-project.org/web/views/ChemPhys.html) + If you're more familiar with Python you should check out [Matt Swains](https://github.com/mcs07) repositories: [ChemSpiPy](https://github.com/mcs07/ChemSpiPy), [PubChemPy](https://github.com/mcs07/PubChemPy) and [CirPy](https://github.com/mcs07/CIRpy) provide similar functionality as `webchem`. ### Want to contribute? @@ -407,7 +100,7 @@ Check out our [contribution guide here](https://github.com/ropensci/webchem/blob - Please [report any issues, bugs or feature requests](https://github.com/ropensci/webchem/issues). 
- License: MIT -- Get citation information for `webchem` in R doing `citation("webchem")` +- Get citation information for `webchem` in R with `citation("webchem")` [![ropensci](http://ropensci.org/public_images/github_footer.png)](http://ropensci.org) diff --git a/README.md b/README.md index 53b084e1..48c4cc59 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,13 @@ Status](https://travis-ci.org/ropensci/webchem.png)](https://travis-ci.org/ropen [![Build status](https://ci.appveyor.com/api/projects/status/e3sa6e918jlemv46/branch/master)](https://ci.appveyor.com/project/EDiLD/webchem) [![Coverage -Status](https://codecov.io/github/ropensci/webchem/coverage.svg?branch=tests)](https://codecov.io/gh/ropensci/webchem/branch/tests) +Status](https://codecov.io/github/ropensci/webchem/coverage.svg?branch=master)](https://codecov.io/gh/ropensci/webchem/branch/master) [![Open Issues](https://img.shields.io/github/issues/ropensci/webchem.svg)](https://github.com/ropensci/webchem/issues) [![](https://cranlogs.r-pkg.org/badges/webchem)](https://cran.r-project.org/package=webchem) [![CRAN status](https://www.r-pkg.org/badges/version/webchem)](https://CRAN.R-project.org/package=webchem) [![DOI](https://zenodo.org/badge/17223/ropensci/webchem.svg)](https://zenodo.org/badge/latestdoi/17223/ropensci/webchem) - `webchem` is a R package to retrieve chemical information from the web. @@ -29,29 +28,46 @@ suffix separated by an underscore (`prefix_suffix()`). They follow the format of `source_functionality`, with the exception of functions that retrieve database identifiers which follow the format of `get_identifier`. e.g.`cs_compinfo` uses ChemSpider to retrieve compound -informations and `get_csid()` retrieves ChemSpider -IDs. 
- -## Currently implemented in `webchem` - -| Source | Function(s) | API Docs | API key | -| --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ | --------------------------------------------- | -| [Chemical Identifier Resolver (CIR)](http://cactus.nci.nih.gov/chemical/structure) | `cir_query()` | [link](http://cactus.nci.nih.gov/chemical/structure_documentation) | none | -| [ChemSpider](http://www.chemspider.com/) | `cs_datasources()`,`get_csid()`, `cs_element_csid()`, `cs_convert()`, `cs_compinfo()`, `cs_extcompinfo()`, `cs_prop()` | [link](https://developer.rsc.org/compounds-v1/apis) | required [(link)](https://developer.rsc.org/) | -| [PubChem](https://pubchem.ncbi.nlm.nih.gov/) | `get_cid()`, `pc_prop()`, `pc_synonyms()` | [link](https://pubchem.ncbi.nlm.nih.gov/) | none | -| [Chemical Translation Service (CTS)](http://cts.fiehnlab.ucdavis.edu/) | `cts_convert()`, `cts_compinfo()` | none | none | -| [PAN Pesticide Database](http://www.pesticideinfo.org/) | `pan_query()` | none | none | -| [Alan Wood’s Compendium of Pesticide Common Names](http://www.alanwood.net/pesticides/) | `aw_query()` | none | none | -| [ETOX](http://webetox.uba.de/webETOX/index.do) | `get_etoxid()`, `etox_basic()`. 
`etox_targets()`, `etox_tests()` | none | none | -| [ChemIDplus](http://chem.sis.nlm.nih.gov/chemidplus/) | `ci_query()` | none | none | -| [Wikidata](https://www.wikidata.org/wiki/Wikidata:WikiProject_Chemistry) | `get_wdid()`, `wd_ident()` | [link](https://www.mediawiki.org/wiki/API:Main_page) | none | -| [OPSIN](http://opsin.ch.cam.ac.uk/instructions.html) | `opsin_query()` | [link](http://opsin.ch.cam.ac.uk/instructions.html) | none | -| [Flavornet](http://www.flavornet.org) | `fn_percept()` | none | none | -| [NIST](https://webbook.nist.gov) | `nist_ri()` | none | none | -| [ChEBI](https://www.ebi.ac.uk/chebi/) | `get_chebiid()`, `chebi_comp_entity()` | [link](https://www.ebi.ac.uk/chebi/webServices.do) | none | - -Moreover, there are some functions to check indentifiers: -`is.inchikey()`, `is.cas()` and `is.smiles()`. +information and `get_csid()` retrieves ChemSpider IDs. + +## Fill out the survey\! + +Do you use chemical information databases in your work? Help us help you +by filling out our short survey at +. + +The `webchem` survey allows us to learn which databases you use and how +you interact with chemical data. This is extremely valuable information +for us and guides our development efforts. The survey takes about 5 +minutes to fill out. + +## Chemical databases currently accessed by webchem + +At least some of the data in the following sources is accessible through +`webchem` functions. To learn more about what is available, browse the +documentation +[here](https://docs.ropensci.org/webchem/reference/index.html). 
+ + - [Chemical Identifier Resolver + (CIR)](http://cactus.nci.nih.gov/chemical/structure) + - [ChemSpider](http://www.chemspider.com/) (requires an [API + token](https://developer.rsc.org/)) + - [PubChem](https://pubchem.ncbi.nlm.nih.gov/) + - [Chemical Translation Service + (CTS)](http://cts.fiehnlab.ucdavis.edu/) + - [PAN Pesticide Database](http://www.pesticideinfo.org/) + - [Alan Wood’s Compendium of Pesticide Common + Names](http://www.alanwood.net/pesticides/) + - [ETOX](http://webetox.uba.de/webETOX/index.do) + - [ChemIDplus](http://chem.sis.nlm.nih.gov/chemidplus/) + - [Wikidata](https://www.wikidata.org/wiki/Wikidata:WikiProject_Chemistry) + - [OPSIN](http://opsin.ch.cam.ac.uk/instructions.html) + - [Flavornet](http://www.flavornet.org) + - [NIST](https://webbook.nist.gov) (currently gas chromatography + retention indices only) + - [ChEBI](https://www.ebi.ac.uk/chebi/) + - [U.S. EPA Substance Registry Service + (SRS)](https://cdxnodengn.epa.gov/cdx-srs-rest/) #### API keys @@ -74,777 +90,18 @@ library("devtools") install_github("ropensci/webchem") ``` -## Quickstart - -``` r -library("webchem") -``` - -#### Chemical Identifier Resolver (CIR) - -CAS numbers and molecular weight for -[Triclosan](http://en.wikipedia.org/wiki/Triclosan). Use `choices = 1` -to return only the first hit. - -``` r -cir_query('Triclosan', 'cas') -#> $Triclosan -#> [1] "3380-34-5" "112099-35-1" "88032-08-0" -cir_query('Triclosan', 'cas', choices = 1) -#> Triclosan1 Triclosan2 Triclosan3 -#> "3380-34-5" "112099-35-1" "88032-08-0" -cir_query('Triclosan', 'mw') -#> $Triclosan -#> [1] 289.5451 -``` - -Query SMILES and InChIKey from CAS (Triclosan). Inputs might by -ambiguous and we can specify where to search using `resolver=`. 
- -``` r -cir_query('3380-34-5', 'smiles') -#> $`3380-34-5` -#> [1] "Oc1cc(Cl)ccc1Oc2ccc(Cl)cc2Cl" -cir_query('3380-34-5', 'stdinchikey', resolver = 'cas_number') -#> $`3380-34-5` -#> [1] "InChIKey=XEFQLINVKFYRCS-UHFFFAOYSA-N" -``` - -Query the number of rings using the InChiKey (Triclosan) - -``` r -cir_query('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'ring_count') -#> $`XEFQLINVKFYRCS-UHFFFAOYSA-N` -#> [1] 2 -``` - -#### ChemSpider - -Retrieve the ChemSpider ID of Triclosan - -``` r -(id <- get_csid('Triclosan')) -#> $Triclosan -#> [1] 5363 -``` - -Use this ID to query information from ChemSpider - -``` r -# cs_compinfo(id, fields = c("Formula", "MolecularWeight")) -``` - -Note that the URL of the source if also returned (`source_url`) and can -be used for (micro-)attribution. - -Or to convert to a Mol-Object - -``` r -# mol <- cs_convert(id, from = 'csid', to = 'mol') -# head(parse_mol(mol$ab)) -``` - -Note that the Molfile is parsed into a R object (via `parse_mol()`) and -that an API-key is needed - -`cs_convert()` handles a lot of input / output formats: - -``` r -cs_convert('XEFQLINVKFYRCS-UHFFFAOYAS', from = 'inchikey', to = 'csid') -#> $`XEFQLINVKFYRCS-UHFFFAOYAS` -#> [1] 5363 -cs_convert('XEFQLINVKFYRCS-UHFFFAOYAS', from = 'inchikey', to = 'inchi') -#> $`XEFQLINVKFYRCS-UHFFFAOYAS` -#> [1] "InChI=1/C12H7Cl3O2/c13-7-1-3-11(9(15)5-7)17-12-4-2-8(14)6-10(12)16/h1-6,16H" -cs_convert('c1cc(c(cc1Cl)O)Oc2ccc(cc2Cl)Cl', from = 'smiles', to = 'inchi') -#> $`c1cc(c(cc1Cl)O)Oc2ccc(cc2Cl)Cl` -#> [1] "InChI=1/C12H7Cl3O2/c13-7-1-3-11(9(15)5-7)17-12-4-2-8(14)6-10(12)16/h1-6,16H" -``` - -And get EPISuit predictions from ChemSpider - -``` r -cs_prop('5363')[['5363']]$epi[ , c(1:4)] -``` - -#### PubChem - -Retrieve PubChem CID - -``` r -get_cid(c('Triclosan', 'Aspirin')) -#> $Triclosan -#> [1] 5564 -#> -#> $Aspirin -#> [1] 2244 -get_cid('3380-34-5') -#> $`3380-34-5` -#> [1] 5564 -``` - -Use this CID to retrieve some chemical -properties: - -``` r -pc_prop(c(5564,2244), properties = 
c('InChIKey', 'MolecularFormula', 'MolecularWeight')) -#> CID MolecularFormula MolecularWeight InChIKey -#> 1 5564 C12H7Cl3O2 289.50 XEFQLINVKFYRCS-UHFFFAOYSA-N -#> 2 2244 C9H8O4 180.16 BSYNRYMUTXBXSQ-UHFFFAOYSA-N -``` - -and synonyms - -``` r -pc_synonyms(5564, from = 'cid')[[1]][1:5] -#> [1] "5564" -#> [2] "triclosan" -#> [3] "3380-34-5" -#> [4] "5-CHLORO-2-(2,4-DICHLOROPHENOXY)PHENOL" -#> [5] "2,4,4'-Trichloro-2'-hydroxydiphenyl ether" -pc_synonyms('Triclosan', from = 'name')[[1]][1:5] -#> [1] "5564" -#> [2] "triclosan" -#> [3] "3380-34-5" -#> [4] "5-CHLORO-2-(2,4-DICHLOROPHENOXY)PHENOL" -#> [5] "2,4,4'-Trichloro-2'-hydroxydiphenyl ether" -``` - -#### Chemical Translation Service (CTS) - -CTS allows to convert from nearly every possible identifier to nearly -every possible identifier: - -``` r -cts_convert(query = '3380-34-5', from = 'CAS', to = 'ChemSpider') -#> $`3380-34-5` -#> [1] "31465" -(inchk <- cts_convert(query = '50-00-0', from = 'CAS', to = 'inchikey')) -#> $`50-00-0` -#> [1] "WSFSSNUMVMOOMR-UHFFFAOYSA-N" -``` - -Moreover, we can a lot of information stored in the CTS database using -InChIkey - -``` r -info <- cts_compinfo(inchikey = inchk[[1]]) -info[[1]][1:5] -#> $inchikey -#> [1] "WSFSSNUMVMOOMR-UHFFFAOYSA-N" -#> -#> $inchicode -#> [1] "InChI=1S/CH2O/c1-2/h1H2" -#> -#> $molweight -#> [1] 30.02602 -#> -#> $exactmass -#> [1] 30.01056 -#> -#> $formula -#> [1] "CH2O" -``` - -#### PAN Pesticide Database - -`pan_query()` returns a list of 75 entries, here I extract only 4 of -those: - -``` r -pan_list <- pan_query('lambda-Cyhalothrin', match = 'best') -pan_list[[1]][c("CAS Number", "Chemical Class", "Water Solubility (Avg, mg/L)", "Adsorption Coefficient (Koc)" )] -#> $`CAS Number` -#> [1] "91465-08-6" -#> -#> $`Chemical Class` -#> [1] "Pyrethroid" -#> -#> $`Water Solubility (Avg, mg/L)` -#> [1] NA -#> -#> $`Adsorption Coefficient (Koc)` -#> [1] 157000 -``` - -#### Alan Wood’s Compendium of Pesticide Common Names - -`aw_query()` returns a list of 9 
entries and can query common names and -cas numbers: - -``` r -aw_query('Fluazinam', type = 'commonname') -#> $Fluazinam -#> $Fluazinam$cname -#> [1] "Fluazinam" -#> -#> $Fluazinam$status -#> [1] "ISO 1750 (published)" -#> -#> $Fluazinam$pref_iupac_name -#> [1] "3-chloro-N-[3-chloro-2,6-dinitro-4-(trifluoromethyl)phenyl]-5-(trifluoromethyl)pyridin-2-amine" -#> -#> $Fluazinam$iupac_name -#> [1] "3-chloro-N-(3-chloro-5-trifluoromethyl-2-pyridyl)-α,α,α-trifluoro-2,6-dinitro-p-toluidine" -#> -#> $Fluazinam$cas -#> [1] "79622-59-6" -#> -#> $Fluazinam$formula -#> [1] "C13H4Cl2F6N4O4" -#> -#> $Fluazinam$activity -#> [1] "fungicides" -#> -#> $Fluazinam$subactivity -#> [1] "pyridine fungicides" -#> -#> $Fluazinam$inchikey -#> [1] "UZCGKGPEKUCDTF-UHFFFAOYSA-N" -#> -#> $Fluazinam$inch -#> [1] "InChI=1S/C13H4Cl2F6N4O4/c14-6-1-4(12(16,17)18)3-22-11(6)23-9-7(24(26)27)2-5(13(19,20)21)8(15)10(9)25(28)29/h1-3H,(H,22,23)" -#> -#> $Fluazinam$source_url -#> [1] "http://www.alanwood.net/pesticides/fluazinam.html" -#> -#> -#> attr(,"class") -#> [1] "aw_query" "list" -aw_query('79622-59-6', type = 'cas')[[1]]$cname -#> [1] "fluazinam" -``` - -#### ETOX - -ETOX: Information System Ecotoxicology and Environmental Quality Targets -is a database run by the Federal Environment Agency of Germany and -provides data on synonyms, identifiers, Quality Targest and Effects. - -First we need to query a substance ID: - -``` r -ids <- get_etoxid('Triclosan', match = 'best') -ids -#> etoxid match distance query -#> 1 20179 Triclosan ( 20179 ) 0 Triclosan -``` - -`get_etoxid` tries to find the best match for you (check the matched and -distance attributes), if multiple hits are found. Other options are -`match = 'ask'` to enter a interactive mode, `'na'` to return `NA`, -`'all'` to return all hits and `'first'` to return the first hit. 
- -``` r -get_etoxid('Triclosan', match = 'all') -#> [[1]] -#> [1] "/webETOX/public/search/stoff.do?orderBy=name" -#> [2] "89236" -#> [3] "20179" -#> attr(,"matched") -#> [1] NA "Methyltriclosan ( 89236 )" -#> [3] "Triclosan ( 20179 )" -#> attr(,"distance") -#> [1] "all" -``` - -With this substance ID we can query further information from ETOX, e.g.: - -``` r -etox_basic(ids$etoxid)[[1]] -#> $cas -#> [1] "3380-34-5" -#> -#> $ec -#> character(0) -#> -#> $gsbl -#> [1] "117338" -#> -#> $synonyms -#> name language -#> 3 5-chloro-2-(2,4-dichlorophenoxy)phenol English -#> 4 Phenol, 5-chloro-2-(2,4-dichlorophenoxy)- English -#> 8 2,4,4'-Trichlor-2'-hydroxydiphenylether German -#> 9 2,4,4-Trichlor-2'-hydroxydiphenylether German -#> 10 2,4,4'-Trichloro-2'-hydroxydiphenylether German -#> 12 Chlor-2-(2,4-dichlorphenoxy)phenol, 5- universal -#> 13 Trichloro-2'-hydroxydiphenylether, 2,4,4'- universal -#> 14 5-Chlor-2-(2,4-dichlorphenoxy)-phenol universal -#> 15 Chlor-2-(2,4-dichlorphenoxy)-phenol, 5- universal -#> 16 5-Chlor-2-(2,4-dichlorphenoxy)phenol universal -#> 17 triclosán Spanish -#> 18 triklosaani Finnish -#> 19 triclosano Italian -#> 20 triklosan Swedish -#> -#> $source_url -#> [1] "https://webetox.uba.de/webETOX/public/basics/stoff.do?language=en&id=20179" -``` - -Which returns CAS, EC and GSBL numbers, as well as a synonym list. 
- -We can also retrieve Quality Targets: - -``` r -targets <- etox_targets(ids$etoxid)[[1]] -targets$res[ , c('Substance', 'Country_or_Region', 'Designation', 'Value_Target_LR', 'Unit')] -#> Substance Country_or_Region Designation Value_Target_LR Unit -#> 1 Triclosan AUS PNEC 0.050 µg/l -#> 2 Triclosan CHE AA-QS_freshwater 0.020 µg/l -#> 3 Triclosan CHE MAC-QS 0.020 µg/l -#> 4 Triclosan DEU AA-EQS 0.020 µg/l -#> 5 Triclosan DEU MAC-EQS 0.200 µg/l -#> 6 Triclosan DEU QS_fw, eco 0.020 µg/l -#> 7 Triclosan DEU MAC-QS_fw, eco 0.160 µg/l -#> 8 Triclosan DEU QS_sw, eco 0.002 µg/l -#> 9 Triclosan DEU MAC-QS_sw, eco 0.016 µg/l -#> 10 Triclosan DEU AA-EQS 0.020 µg/l -#> 11 Triclosan DEU AA-EQS 0.002 µg/l -#> 12 Triclosan DEU MAC-EQS 0.200 µg/l -#> 13 Triclosan DEU MAC-EQS 0.020 µg/l -``` - -and results of ecotox tests: - -``` r -tests <- etox_tests(ids$etoxid)[[1]] -tests$res[ , c('Organism', 'Effect', 'Duration', 'Time_Unit','Endpoint', 'Value', 'Unit')] -#> Organism Effect Duration -#> 1 Anabaena flos-aquae not reported 4 -#> 2 Brachionus calyciflorus not reported 2 -#> 3 Brachionus calyciflorus not reported 2 -#> 4 Brachionus calyciflorus not reported 2 -#> 5 Brachydanio rerio Embryo-Larval-Toxicity 10 -#> 6 Ceriodaphnia dubia Lethality 7 -#> 7 Ceriodaphnia dubia Mortality 2 -#> 8 Ceriodaphnia dubia Mortality 7 -#> 9 Ceriodaphnia dubia not reported 7 -#> 10 Ceriodaphnia dubia Reproduction 7 -#> 11 Ceriodaphnia dubia Reproduction 7 -#> 12 Daphnia magna Mortality 21 -#> 13 Daphnia magna Reproduction 21 -#> 14 Desmodesmus subspicatus Cell Proliferation 4 -#> 15 Dunaliella tertiolecta Cell Proliferation 4 -#> 16 Dunaliella tertiolecta Cell Proliferation 4 -#> 17 Oncorhynchus mykiss Embryo-Larval-Toxicity 4 -#> 18 Pimephales promelas Mortality 4 -#> 19 Pseudokirchneriella subcapitata Wachstumshemmung 3 -#> 20 Scenedesmus subspicatus Biomass 3 -#> 21 Scenedesmus subspicatus not reported 4 -#> 22 Scenedesmus subspicatus not reported 4 -#> 23 Scenedesmus subspicatus not reported 
4 -#> 24 Scenedesmus subspicatus Reproduction 3 -#> 25 Hyalella azteca Mortality 10 -#> Time_Unit Endpoint Value Unit -#> 1 d NOEC 0.810 -#> 2 d NOEC 50.000 µg/l -#> 3 d NOEC 50.000 µg/l -#> 4 d NOEC 50.000 µg/l -#> 5 d NOEC 200.000 µg/l -#> 6 d NOEC 339.000 µg/l -#> 7 d EC50 120.000 µg/l -#> 8 d NOEC 50.000 µg/l -#> 9 d NOEC 4.000 µg/l -#> 10 d NOEC 6.000 µg/l -#> 11 d NOEC 182.000 µg/l -#> 12 d NOEC 132.000 µg/l -#> 13 d NOEC 40.000 µg/l -#> 14 d ErC50 1.610 µg/l -#> 15 d NOEC 1.600 µg/l -#> 16 d EbC50 3.550 µg/l -#> 17 d NOEC 34.100 µg/l -#> 18 d LC50 260.000 µg/l -#> 19 d NOEC 0.200 µg/l -#> 20 d NOEC 0.500 µg/l -#> 21 d NOEC 0.690 µg/l -#> 22 d NOEC 0.742 µg/l -#> 23 d NOEC 2.380 µg/l -#> 24 d NOEC 0.500 µg/l -#> 25 d NOEC 5.000 µg/l -``` - -#### ChemIDplus - -``` r -out <- ci_query(query = 'Triclosan', type = 'name', match = 'best') -out[['Triclosan']]$physprop -#> Physical Property Value Units Temp (deg C) -#> 1 Melting Point NA deg C NA -#> 2 log P (octanol-water) 4.76e+00 (none) NA -#> 3 Water Solubility 1.00e+01 mg/L 20 -#> 4 Vapor Pressure 6.45e-07 mm Hg 25 -#> 5 Henry's Law Constant 4.99e-09 atm-m3/mole 25 -#> 6 Atmospheric OH Rate Constant 1.61e-11 cm3/molecule-sec 25 -#> Source -#> 1 EXP -#> 2 EXP -#> 3 EXP -#> 4 EST -#> 5 EST -#> 6 EST -``` - -#### Wikidata - -``` r -ids <- get_wdid(query = 'Triclosan') -ids -#> id match distance query -#> 1 Q56228675 Triclosan 0 Triclosan - -# query identifiers from Wikidata -wd_ident(ids$id)[1:5] -#> smiles cas cid einecs csid -#> 1 -``` - -#### OPSIN - -``` r -opsin_query(c('Cyclopropane', 'Octane')) -#> inchi -#> Cyclopropane InChI=1/C3H6/c1-2-3-1/h1-3H2 -#> Octane InChI=1/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3 -#> stdinchi -#> Cyclopropane InChI=1S/C3H6/c1-2-3-1/h1-3H2 -#> Octane InChI=1S/C8H18/c1-3-5-7-8-6-4-2/h3-8H2,1-2H3 -#> stdinchikey smiles message status -#> Cyclopropane LVZWSLJZHVFIQJ-UHFFFAOYSA-N C1CC1 SUCCESS -#> Octane TVMXDCGIABBOFY-UHFFFAOYSA-N CCCCCCCC SUCCESS -#> query -#> Cyclopropane Cyclopropane 
-#> Octane Octane -``` - -#### Flavornet - -``` r -fn_percept(CAS = c("75-07-0", "123-32-0")) -#> 75-07-0 -#> "pungent, ether" -#> 123-32-0 -#> "cocoa, roasted nut, roast beef, medicine" -``` - -#### NIST - -Identification of gas chromatography peaks is often aided by retention -idices. NIST provides tables of retention indices reported in the -literature organized by retention index type (Kovats, linear, normal -alkane, and Lee), column polarity, and temperature program. - -``` r -RIs <- - nist_ri( - query = "78-70-6", - from = "cas", - type = "kovats", - polarity = "non-polar", - temp_prog = "ramp" - ) -head(RIs) -#> # A tibble: 6 x 16 -#> query type phase RI length gas substrate diameter thickness -#> -#> 1 78-7… Capi… DB-5 1098 30 Heli… 0.26 0.25 -#> 2 78-7… Capi… DB-1 1086 30 Heli… 0.25 0.25 -#> 3 78-7… Capi… DB-5… 1101 30 Heli… 0.25 0.25 -#> 4 78-7… Capi… HP-5… 1104 30 Heli… 0.25 0.25 -#> 5 78-7… Capi… HP-5… 1106 60 Heli… 0.25 0.25 -#> 6 78-7… Capi… DB-5 1099 30 Heli… 0.25 0.25 -#> # … with 7 more variables: temp_start , temp_end , -#> # temp_rate , hold_start , hold_end , reference , -#> # comment -``` - -#### ChEBI - -Chemical Entities of Biological Interest (ChEBI) is a freely available -dictionary of molecular entities focused on ‘small’ chemical compounds. -`get_chebiid()` returns a list of data.frames which matching query -results. The data.frames contain the **chebiid**, the -**chebiiasciiname**, a **searchscore** and **entity stars** (either 2 or -3, depending on whether the entity was checked -thoroughly). 
- -``` r -ids <- get_chebiid(c('Isoproturon', 'RZVAJINKPMORJF-UHFFFAOYSA-N'), verbose = FALSE) -ids -#> chebiid chebiasciiname -#> Isoproturon1 CHEBI:6049 isoproturon -#> Isoproturon2 CHEBI:83468 isoproturon-monodemethyl -#> Isoproturon3 CHEBI:83514 isoproturon-didemethyl -#> Isoproturon4 CHEBI:43405 para-isopropylaniline -#> RZVAJINKPMORJF-UHFFFAOYSA-N CHEBI:46195 paracetamol -#> searchscore entitystar -#> Isoproturon1 0.54 3 -#> Isoproturon2 0.27 3 -#> Isoproturon3 0.27 3 -#> Isoproturon4 0.06 3 -#> RZVAJINKPMORJF-UHFFFAOYSA-N 0.02 3 -#> query -#> Isoproturon1 Isoproturon -#> Isoproturon2 Isoproturon -#> Isoproturon3 Isoproturon -#> Isoproturon4 Isoproturon -#> RZVAJINKPMORJF-UHFFFAOYSA-N RZVAJINKPMORJF-UHFFFAOYSA-N -``` - -The **chebiid** can then be used to query the complete ChEBI entity -using `chebi_comp_entity()`. The complete entity contains several -different data structures which are returned in a list. The data -structures are explained in greater detail at the [ChEBI -website](https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:6049). -Here, the list elements are showcased: - -``` r -# ecample entities -isoproturon <- chebi_comp_entity(ids$Isoproturon$chebiid[1]) -paracetamol <- chebi_comp_entity(ids$`RZVAJINKPMORJF-UHFFFAOYSA-N`$chebiid[1]) - -# properties: a data.frame with general properties -lapply(isoproturon, '[[', 'properties') -#> $`CHEBI:6049` -#> chebiid chebiasciiname -#> 1 CHEBI:6049 isoproturon -#> definition -#> 1 A member of the class of phenylureas that is 1,1-dimethylurea substituted by a p-cumenyl group at position 3. A selective, systemic herbicide used to control annual grasses and broadleaf weeds in cereals, its use within the EU has been banned after September 2017 on the grounds of potential groundwater contamination and risks to aquatic life; there have also been concerns about its endocrine-disrupting properties. 
-#> status smiles -#> 1 CHECKED CC(C)c1ccc(NC(=O)N(C)C)cc1 -#> inchi -#> 1 InChI=1S/C12H18N2O/c1-9(2)10-5-7-11(8-6-10)13-12(15)14(3)4/h5-9H,1-4H3,(H,13,15) -#> inchikey charge mass monoisotopicmass entitystar -#> 1 PUIYMUZLKQOUOZ-UHFFFAOYSA-N 0 206.28410 206.14191 3 -# chem_structure: a list of chemical structure formats (e.g. mol) -lapply(isoproturon, '[[', 'chem_structure') -#> $`CHEBI:6049` -#> $`CHEBI:6049`[[1]] -#> $`CHEBI:6049`[[1]]$structure -#> $`CHEBI:6049`[[1]]$structure[[1]] -#> [1] "\n Mrv0541 11101412262D \n\n 15 15 0 0 0 0 999 V2000\n 7.5478 -4.8268 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 7.5478 -4.0026 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 8.2674 -3.5905 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 8.9830 -4.0026 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 8.9830 -4.8268 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 8.2674 -5.2389 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 9.6957 -5.2389 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 10.4117 -4.8310 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 11.1278 -5.2431 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 11.8396 -4.8352 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 10.4076 -4.0068 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n 11.1236 -6.0632 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 6.8318 -3.5947 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 6.8318 -2.7746 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 6.1157 -4.0068 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 2 0 0 0 0\n 7 8 1 0 0 0 0\n 2 3 1 0 0 0 0\n 8 9 1 0 0 0 0\n 3 4 2 0 0 0 0\n 9 10 1 0 0 0 0\n 4 5 1 0 0 0 0\n 8 11 2 0 0 0 0\n 5 6 2 0 0 0 0\n 9 12 1 0 0 0 0\n 6 1 1 0 0 0 0\n 2 13 1 0 0 0 0\n 13 14 1 0 0 0 0\n 5 7 1 0 0 0 0\n 13 15 1 0 0 0 0\nM END\n" -#> -#> -#> $`CHEBI:6049`[[1]]$type -#> $`CHEBI:6049`[[1]]$type[[1]] -#> [1] "mol" -#> -#> -#> $`CHEBI:6049`[[1]]$dimension -#> $`CHEBI:6049`[[1]]$dimension[[1]] -#> [1] "2D" -#> -#> -#> $`CHEBI:6049`[[1]]$defaultStructure -#> $`CHEBI:6049`[[1]]$defaultStructure[[1]] -#> [1] "true" -# synonyms: a data.frame of synonyms (collected from different sources) -lapply(isoproturon, '[[', 'synonyms') -#> $`CHEBI:6049` -#> data type source 
-#> 1 1,1-dimethyl-3-(p-isopropylphenyl)-urea SYNONYM NIST Chemistry WebBook -#> 2 3-p-cumenyl-1,1-dimethylurea SYNONYM Alan Wood's Pesticides -#> 3 N-4-isopropylphenyl-N,N-dimethylurea SYNONYM NIST Chemistry WebBook -# iupacnames: a data.frame of IUPAC names (collected from different sources) -lapply(isoproturon, '[[', 'iupacnames') -#> $`CHEBI:6049` -#> data type source -#> 1 1,1-dimethyl-3-[4-(propan-2-yl)phenyl]urea IUPAC NAME IUPAC -# formulae: a data.frame of chemical formulae (collected from different sources) -lapply(isoproturon, '[[', 'formulae') -#> $`CHEBI:6049` -#> data source -#> 1 C12H18N2O ChEBI -# regnumbers: a data.frame of registry numbers (e.g. CAS, Beilstein, Reaxys) (collected from different sources) -lapply(isoproturon, '[[', 'regnumbers') -#> $`CHEBI:6049` -#> data type source -#> 1 2214033 Reaxys Registry Number Reaxys -#> 2 34123-59-6 CAS Registry Number KEGG COMPOUND -#> 3 34123-59-6 CAS Registry Number NIST Chemistry WebBook -#> 4 34123-59-6 CAS Registry Number ChemIDplus -# chebiid_snd: a data.frame with secondary ChEBI ids -lapply(paracetamol, '[[', 'chebiid_snd') -#> $`CHEBI:46195` -#> chebiids -#> 1 CHEBI:46191 -#> 2 CHEBI:2386 -# citations: Publications which cite the entity along with hyperlinks to the PubMed entry via Europe PMC -head(lapply(paracetamol, '[[', 'citations')[[1]]) -#> data type source -#> 1 11084378 PubMed citation Europe PMC -#> 2 11304127 PubMed citation Europe PMC -#> 3 16716555 PubMed citation Europe PMC -#> 4 18953082 PubMed citation Europe PMC -#> 5 21108564 PubMed citation Europe PMC -#> 6 22770225 PubMed citation Europe PMC -# parents: parent ontologies of the entity -lapply(paracetamol, '[[', 'parents') -#> $`CHEBI:46195` -#> chebiName chebiId type -#> 1 cyclooxygenase 2 inhibitor CHEBI:50629 has role -#> 2 cyclooxygenase 1 inhibitor CHEBI:50630 has role -#> 3 non-narcotic analgesic CHEBI:35481 has role -#> 4 antipyretic CHEBI:35493 has role -#> 5 non-steroidal anti-inflammatory drug CHEBI:35475 has role -#> 
6 phenols CHEBI:33853 is a -#> 7 4-aminophenol CHEBI:17602 has functional parent -#> 8 xenobiotic CHEBI:35703 has role -#> 9 hepatotoxic agent CHEBI:50908 has role -#> 10 human blood serum metabolite CHEBI:85234 has role -#> 11 cyclooxygenase 3 inhibitor CHEBI:73263 has role -#> 12 environmental contaminant CHEBI:78298 has role -#> 13 acetamides CHEBI:22160 is a -#> status cyclicRelationship -#> 1 CHECKED false -#> 2 CHECKED false -#> 3 CHECKED false -#> 4 CHECKED false -#> 5 CHECKED false -#> 6 CHECKED false -#> 7 CHECKED false -#> 8 CHECKED false -#> 9 CHECKED false -#> 10 CHECKED false -#> 11 CHECKED false -#> 12 CHECKED false -#> 13 CHECKED false -# children: child ontologies of the entity -lapply(paracetamol, '[[', 'children') -#> $`CHEBI:46195` -#> chebiName chebiId -#> 1 acetaminophen glutathione conjugate CHEBI:32639 -#> 2 2-methoxyacetaminophen glucuronide CHEBI:133005 -#> 3 S-(5-acetamido-2-hydroxyphenyl)-N-acetyl-L-cysteine CHEBI:133435 -#> 4 methacetin CHEBI:139354 -#> 5 phenacetin CHEBI:8050 -#> 6 S-(5-acetamido-2-hydroxyphenyl)cysteine CHEBI:133066 -#> 7 acetaminophen O-beta-D-glucosiduronic acid CHEBI:32636 -#> 8 3-nitroacetaminophen CHEBI:139475 -#> 9 3-nitroacetaminophen-TMS CHEBI:139476 -#> 10 paracetamol sulfate CHEBI:32635 -#> type status cyclicRelationship -#> 1 has functional parent CHECKED false -#> 2 has functional parent CHECKED false -#> 3 has functional parent CHECKED false -#> 4 has functional parent CHECKED false -#> 5 has functional parent CHECKED false -#> 6 has functional parent CHECKED false -#> 7 has functional parent CHECKED false -#> 8 has functional parent CHECKED false -#> 9 has functional parent CHECKED false -#> 10 has functional parent CHECKED false -# dblinks: Links to other data bases -lapply(paracetamol, '[[', 'dblinks') -#> $`CHEBI:46195` -#> data type -#> 1 52 Drug Central accession -#> 2 Acetaminophen Wikipedia accession -#> 3 C06804 KEGG COMPOUND accession -#> 4 CPD-7669 MetaCyc accession -#> 5 D00217 KEGG DRUG 
accession -#> 6 DB00316 DrugBank accession -#> 7 HMDB0001859 HMDB accession -#> 8 LSM-5533 LINCS accession -#> 9 TYL PDBeChem accession -# comments: General comment(s) -lapply(paracetamol, '[[', 'comments') -#> $`CHEBI:46195` -#> text -#> 1 Stravs M, Schymanski E, Singer H, Department of Environmental Chemistry, Eawag -#> date -#> 1 2014-10-29 -# Metabolites of Species -head(lapply(paracetamol, '[[', 'origins')[[1]]) -#> speciesText speciesAccession SourceType SourceAccession -#> 1 Mus musculus NCBI:txid10090 MetaboLights MTBLS292 -#> 2 Homo sapiens NCBI:txid9606 PubMed Id 19309105 -#> 3 Homo sapiens NCBI:txid9606 PubMed Id 18502700 -#> 4 Homo sapiens NCBI:txid9606 PubMed Id 12097436 -#> 5 Homo sapiens NCBI:txid9606 PubMed Id 21359215 -#> 6 Homo sapiens NCBI:txid9606 MetaboLights MTBLS90 -#> componentText componentAccession -#> 1 -#> 2 urine BTO:0001419 -#> 3 cerebrospinal fluid UBERON:0001359 -#> 4 saliva UBERON:0001836 -#> 5 blood UBERON:0000178 -#> 6 blood serum BTO:0000133 -``` - -#### Extractor functions - -The sources provide a lot of informations that can be retrieved using -the functions described above. Often only specific inforamtion is -needed. Therefore, we added extractor functions for common identifiers. 
- -``` r -wi <- wd_ident("Q408646") -wi -#> smiles cas cid einecs csid -#> 1 C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl 3380-34-5 5564 222-182-2 5363 -#> inchi -#> 1 1S/C12H7Cl3O2/c13-7-1-3-11(9(15)5-7)17-12-4-2-8(14)6-10(12)16/h1-6,16H -#> inchikey drugbank zvg chebi chembl unii -#> 1 XEFQLINVKFYRCS-UHFFFAOYSA-N 08604 490400 164200 CHEMBL849 4NM5039Y5X -#> source_url query -#> 1 https://www.wikidata.org/wiki/Q408646 Q408646 -cas(wi) -#> [1] "3380-34-5" -inchikey(wi) -#> [1] "XEFQLINVKFYRCS-UHFFFAOYSA-N" -smiles(wi) -#> [1] "C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl" - -# smiles(etox_basic(5564)) -``` - -#### Misc functions - -##### Check if a string is a valid CAS registry number - -``` r -is.cas('64-17-5') -#> [1] TRUE -is.cas('64-17-6') -#> [1] FALSE -``` - -##### Check if a string is a valid InChIKey - -Using a pure R implementation: - -``` r -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N') -#> [1] TRUE -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKXA-N') -#> [1] FALSE -``` - -Using the ChemSpider API - -``` r -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N', type = 'chemspider') -#> [1] TRUE -is.inchikey('BQJCRHHNABKAKU-KBQPJGBKXA-N', type = 'chemspider') -#> [1] FALSE -``` - -##### Check if a string is a valid SMILES - -``` r -is.smiles('Clc(c(Cl)c(Cl)c1C(=O)O)c(Cl)c1Cl') -# 'J' is not found in the periodic table -is.smiles('Clc(c(Cl)c(Cl)c1C(=O)O)c(Cl)c1ClJ') -``` - ### Acknowledgements Without the fantastic web services `webchem` wouldn’t be here. -Therefore, kudos to the web service providers and developers\! +Therefore, kudos to the web service providers and developers\! Please +remember to acknowledge these data resources in your work using +`webchem`. 
### Related Projects +You can find some related packages in the [ChemPhys CRAN Task +View](https://cran.r-project.org/web/views/ChemPhys.html) + If you’re more familiar with Python you should check out [Matt Swains](https://github.com/mcs07) repositories: [ChemSpiPy](https://github.com/mcs07/ChemSpiPy), @@ -862,7 +119,7 @@ here](https://github.com/ropensci/webchem/blob/master/CONTRIBUTING.md). - Please [report any issues, bugs or feature requests](https://github.com/ropensci/webchem/issues). - License: MIT - - Get citation information for `webchem` in R doing + - Get citation information for `webchem` in R with `citation("webchem")` [![ropensci](http://ropensci.org/public_images/github_footer.png)](http://ropensci.org) diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 00000000..194e8d07 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,52 @@ +url: https://docs.ropensci.org/webchem + +reference: +- title: Retrieve and translate chemical identifiers +- contents: + - starts_with("get") + - cts_convert + - cs_convert + - cts_from + - cts_to + - wd_ident + - pc_synonyms + - opsin_query + - etox_basic +- title: Retrieve chemical properties +- contents: + - aw_query + - ci_query + - cir_query + - chebi_comp_entity + - cs_compinfo + - cs_extcompinfo + - cts_compinfo + - etox_targets + - etox_tests + - fn_percept + - nist_ri + - pan_query + - pc_prop + - pc_sect + - srs_query +- title: Data +- contents: + - has_keyword("datasets") +- title: Utility functions +- contents: + - as.cas + - starts_with("is.") + - build_aw_idx + - cs_check_key + - cs_control + - extractors + - ping_service + - cs_datasources + - parse_mol +- title: Deprecated and defunct +- contents: + - webchem-defunct + - webchem-deprecated +- title: Package +- contents: + - webchem diff --git a/appveyor.yml b/appveyor.yml index 74386601..8c3bae51 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,8 +1,13 @@ +platform: x64 + environment: - R_BUILD_ARGS: --no-manual - R_CHECK_ARGS: 
--no-build-vignettes --no-manual --timings --as-cran --no-multiarch --no-examples - R_INSTALL_ARGS: --no-multiarch - PKGTYPE: binary + R_ARCH: x64 + R_BUILD_ARGS: --no-manual + R_CHECK_ARGS: --no-build-vignettes --no-manual --timings --as-cran --no-multiarch --no-examples + R_INSTALL_ARGS: --no-multiarch + PKGTYPE: binary + NOT_CRAN: true + CURL_SSL_BACKEND: openssl init: ps: | @@ -14,9 +19,9 @@ install: ps: Bootstrap build_script: - - set JAVA_HOME=C:\Program Files (x86)\Java\jdk1.8.0\jre - - set PATH=C:\Program Files (x86)\Java\jdk1.8.0\bin;C:\Program Files (x86)\Java\jdk1.8.0\jre\bin\server;%PATH% - - ls "C:\Program Files (x86)\Java\jdk1.8.0\jre" +# - set JAVA_HOME=C:\Program Files (x86)\Java\jdk1.8.0\jre +# - set PATH=C:\Program Files (x86)\Java\jdk1.8.0\bin;C:\Program Files (x86)\Java\jdk1.8.0\jre\bin\server;%PATH% +# - ls "C:\Program Files (x86)\Java\jdk1.8.0\jre" - travis-tool.sh install_deps test_script: @@ -24,11 +29,13 @@ test_script: on_failure: - travis-tool.sh dump_logs + - 7z a failure.zip *.Rcheck\* + - appveyor PushArtifact failure.zip notifications: - provider: Email to: - - eduardszoecs@gmail.com + - stirling.tamas@gmail.com on_build_success: false on_build_failure: true on_build_status_changed: true diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..04c55859 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,14 @@ +comment: false + +coverage: + status: + project: + default: + target: auto + threshold: 1% + informational: true + patch: + default: + target: auto + threshold: 1% + informational: true diff --git a/man/cir_img.Rd b/man/cir_img.Rd new file mode 100644 index 00000000..803f4afb --- /dev/null +++ b/man/cir_img.Rd @@ -0,0 +1,134 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cir.R +\name{cir_img} +\alias{cir_img} +\title{Query Chemical Identifier Resolver Images} +\usage{ +cir_img( + query, + dir = NULL, + format = c("png", "gif"), + width = 500, + height = 500, + linewidth = 2, + 
symbolfontsize = 16, + bgcolor = NULL, + antialiasing = TRUE, + atomcolor = NULL, + bondcolor = NULL, + csymbol = c("special", "all"), + hsymbol = c("special", "all"), + hcolor = NULL, + header = NULL, + footer = NULL, + frame = NULL, + verbose = TRUE, + ... +) +} +\arguments{ +\item{query}{character; Search term. Can be any common chemical identifier +(e.g. CAS, INCHI(KEY), SMILES etc.)} + +\item{dir}{character; Directory to save the image.} + +\item{format}{character; Output format of the image. Can be one of "png", +"gif".} + +\item{width}{integer; Width of the image.} + +\item{height}{integer; Height of the image.} + +\item{linewidth}{integer; Width of lines.} + +\item{symbolfontsize}{integer; Fontsize of atoms in the image.} + +\item{bgcolor}{character; E.g. transparent, white, \%23AADDEE} + +\item{antialiasing}{logical; Should antialiasing be used?} + +\item{atomcolor}{character; Color of the atoms in the image.} + +\item{bondcolor}{character; Color of the atom bond lines.} + +\item{csymbol}{character; Can be one of "special" (default - i.e. only +hydrogen atoms in functional groups or defining stereochemistry) or "all".} + +\item{hsymbol}{character; Can be one of "special" (default - i.e. none are +shown) or "all" (all are printed).} + +\item{hcolor}{character; Color of the hydrogen atoms.} + +\item{header}{character; Should a header text be added to the image? Can be +any string.} + +\item{footer}{character; Should a footer text be added to the image? Can be +any string.} + +\item{frame}{integer; Should a frame be plotted? Can be one of NULL (default) +or 1.} + +\item{verbose}{logical; Should a verbose output be printed on the console?} + +\item{...}{currently not used.} +} +\value{ +data.frame and image written to disk +} +\description{ +An interface to the Chemical Identifier Resolver (CIR). + (\url{http://cactus.nci.nih.gov/chemical/structure_documentation}). 
+} +\details{ +CIR can resolve can be of the following \code{identifier}: Chemical Names, + IUPAC names, + CAS Numbers, SMILES strings, IUPAC InChI/InChIKeys, NCI/CADD Identifiers, + CACTVS HASHISY, NSC number, PubChem SID, ZINC Code, ChemSpider ID, + ChemNavigator SID, eMolecule VID. + + For an image with transparent background use ‘transparent’ as color name and + switch off antialiasing (i.e. antialiasing = 0). +} +\note{ +You can only make 1 request per second (this is a hard-coded feature). +} +\examples{ +\donttest{ +# might fail if API is not available +cir_img("CCO") # SMILES + +# multiple query strings and different formats +query = c("Glyphosate", "Isoproturon", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N") +cir_img(query, bgcolor = "transparent", antialising = 0) + +# all parameters +query = "Triclosan" +cir_img(query, + format = "gif", + width = 600, + height = 600, + linewidth = 5, + symbolfontsize = 30, + bgcolor = "red", + antialising = 0, + atomcolor = "green", + bondcolor = "yellow", + csymbol = "all", + hsymbol = "all", + hcolor = "purple", + header = "My funky chemical structure..", + footer = "..is just so awesome!", + frame = 1) +} +} +\references{ +\code{cir} relies on the great CIR web service created by the CADD +Group at NCI/NIH! \cr +\url{http://cactus.nci.nih.gov/chemical/structure_documentation}, \cr +\url{http://cactus.nci.nih.gov/blog/?cat=10}, \cr +\url{http://cactus.nci.nih.gov/blog/?p=1386}, \cr +\url{http://cactus.nci.nih.gov/blog/?p=1456}, \cr +} +\author{ +Andreas Scharmueller, \email{andschar@protonmail.com} +} diff --git a/man/cts_convert.Rd b/man/cts_convert.Rd index d45ebc53..a64819ea 100644 --- a/man/cts_convert.Rd +++ b/man/cts_convert.Rd @@ -31,7 +31,7 @@ cts_convert( \item{...}{currently not used.} } \value{ -a list of characters. If first = TRUE a vector. +a list of character vectors or if \code{choices} is used, then a single named vector. 
} \description{ Convert Ids using Chemical Translation Service (CTS), see \url{http://cts.fiehnlab.ucdavis.edu/} @@ -43,11 +43,11 @@ for possible values of from and to. \examples{ \donttest{ # might fail if API is not available -cts_convert('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'inchikey', 'Chemical Name') +cts_convert("triclosan", "Chemical Name", "inchikey") ### multiple inputs -comp <- c('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'BSYNRYMUTXBXSQ-UHFFFAOYSA-N') -cts_convert(comp, 'inchikey', 'Chemical Name') +comp <- c("triclosan", "hexane") +cts_convert(comp, "Chemical Name", "cas") } } \references{ diff --git a/man/fn_percept.Rd b/man/fn_percept.Rd index 95959441..3f8878d3 100644 --- a/man/fn_percept.Rd +++ b/man/fn_percept.Rd @@ -21,7 +21,7 @@ Retreive flavor percepts from \url{http://www.flavornet.org}. Flavornet is a da perceptible to humans detected using gas chromatography ofactometry (GCO). } \examples{ -\donttest{ +\dontrun{ # might fail if website is not available fn_percept("123-32-0") diff --git a/man/get_cid.Rd b/man/get_cid.Rd index da80b526..117b182a 100644 --- a/man/get_cid.Rd +++ b/man/get_cid.Rd @@ -2,30 +2,33 @@ % Please edit documentation in R/pubchem.R \name{get_cid} \alias{get_cid} -\title{Retrieve Pubchem Id (CID)} +\title{Retrieve Pubchem Compound ID (CID)} \usage{ get_cid( query, - from = c("name", "cid", "sid", "aid", "smiles", "inchi", "inchikey"), + from = "name", + domain = c("compound", "substance", "assay"), match = c("all", "first", "ask", "na"), verbose = TRUE, - search_substances = FALSE, arg = NULL, first = NULL, ... ) } \arguments{ -\item{query}{character; search term.} +\item{query}{character; search term, one or more compounds.} -\item{from}{character; type of input, can be one of "name" (default), "cid", -"sid", "aid", "smiles", "inchi", "inchikey"} +\item{from}{character; type of input. 
See details for more information.} -\item{match}{character; How should multiple hits be handled?, "all" all matches are returned, "best" the best matching is returned, "ask" enters an interactive mode and the user is asked for input, "na" returns NA if multiple hits are found.} +\item{domain}{character; query domain, can be one of \code{"compound"}, +\code{"substance"}, \code{"assay"}.} -\item{verbose}{logical; should a verbose output be printed on the console?} +\item{match}{character; How should multiple hits be handled?, \code{"all"} +all matches are returned, \code{"best"} the best matching is returned, +\code{"ask"} enters an interactive mode and the user is asked for input, +\code{"na"} returns NA if multiple hits are found.} -\item{search_substances}{logical; If TRUE also searches PubChem SIDs} +\item{verbose}{logical; should a verbose output be printed on the console?} \item{arg}{character; optinal arguments like "name_type=word" to match individual words.} @@ -38,8 +41,50 @@ individual words.} a tibble. } \description{ -Return CompoundID (CID) for a search query using PUG-REST, -see \url{https://pubchem.ncbi.nlm.nih.gov/}. +Retrieve compound IDs (CIDs) from PubChem. +} +\details{ +Valid values for the \code{from} argument depend on the +\code{domain}: +\itemize{ +\item{\code{compound}: \code{"name"}, \code{"smiles"}, \code{"inchi"}, +\code{"inchikey"}, \code{"formula"}, \code{"sdf"}, , +, .} +\item{\code{substance}: \code{"name"}, \code{"sid"}, +\code{}, \code{"sourceid/"} or \code{"sourceall"}.} +\item{\code{assay}: \code{"aid"}, \code{}.} +} + + is assembled as "{\code{substructure} | +\code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles} + | \code{inchi} | \code{sdf} | \code{cid}}", e.g. + \code{from = "substructure/smiles"}. 
+ +\code{} is assembled as "\code{xref}/\{\code{RegistryID} | +\code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI}, +\code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} | +\code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query +by CAS RN. + + is either \code{fastformula} or it is assembled as +"{\code{fastidentity} | \code{fastsimilarity_2d} | \code{fastsimilarity_3d} | +\code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} | +\code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g. +\code{from = "fastidentity/smiles"}. + +\code{} is any valid PubChem Data Source ID. When +\code{from = "sourceid/"}, the query is the ID of the substance in +the depositor's database. + +If \code{from = "sourceall"} the query is one or more valid Pubchem +depositor names. Depositor names are not case sensitive. + +Depositor names and Data Source IDs can be found at +\url{https://pubchem.ncbi.nlm.nih.gov/sources/}. + +\code{} is assembled as "\code{target}/\{\code{gi} | +\code{proteinname} | \code{geneid} | \code{genesymbol} | \code{accession}\}", +e.g. \code{from = "target/geneid"} will query by GeneID. 
} \note{ Please respect the Terms and Conditions of the National Library of @@ -55,12 +100,32 @@ usage policies of the indicidual data sources # might fail if API is not available get_cid("Triclosan") get_cid("Triclosan", arg = "name_type=word") -get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +# from SMILES get_cid("CCCC", from = "smiles") +# from InChI +get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi") +# from InChIKey +get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey") +# from formula +get_cid("C26H52NO6P", from = "formula") +# from CAS RN +get_cid("56-40-6", from = "xref/rn") +# similarity +get_cid(5564, from = "similarity/cid") +get_cid("CCO", from = "similarity/smiles") +# from SID +get_cid("126534046", from = "sid", domain = "substance") +# sourceid +get_cid("VCC957895", from = "sourceid/23706", domain = "substance") +# sourceall +get_cid("Optopharma Ltd", from = "sourceall", domain = "substance") +# from AID (CIDs of substances tested in the assay) +get_cid(170004, from = "aid", domain = "assay") +# from GeneID (CIDs of substances tested on the gene) +get_cid(25086, from = "target/geneid", domain = "assay") # multiple inputs -comp <- c("Triclosan", "Aspirin") -get_cid(comp) +get_cid(c("Triclosan", "Aspirin")) } } @@ -80,4 +145,6 @@ information in PubChem. Nucleic acids research, gkv396. 
} \author{ Eduard Szoecs, \email{eduardszoecs@gmail.com} + +Tamás Stirling, \email{stirling.tamas@gmail.com} } diff --git a/man/get_etoxid.Rd b/man/get_etoxid.Rd index e6bafdf4..daabc09b 100644 --- a/man/get_etoxid.Rd +++ b/man/get_etoxid.Rd @@ -27,8 +27,7 @@ returns \code{NA} if multiple hits are found.} \item{verbose}{logical; print message during processing to console?} } \value{ -a dataframe with 4 columns: etoxID, matched substance, string -distance to match and the queried string +a tibble with 3 columns: the query, the match, and the etoxID } \description{ Query ETOX: Information System Ecotoxicology and Environmental Quality diff --git a/man/opsin_query.Rd b/man/opsin_query.Rd index 65e61cf0..d58f77c3 100644 --- a/man/opsin_query.Rd +++ b/man/opsin_query.Rd @@ -14,7 +14,7 @@ opsin_query(query, verbose = TRUE, ...) \item{...}{currently not used.} } \value{ -a data.frame with five columnns: "inchi", "stdinchi", "stdinchikey", "smiles", "message" +a tibble with six columnns: "query", inchi", "stdinchi", "stdinchikey", "smiles", "message", and "status" } \description{ Query the OPSIN (Open Parser for Systematic IUPAC nomenclature) web service diff --git a/man/pc_synonyms.Rd b/man/pc_synonyms.Rd index c7726841..25c1c62c 100644 --- a/man/pc_synonyms.Rd +++ b/man/pc_synonyms.Rd @@ -34,7 +34,7 @@ individual words.} \item{...}{optional arguments} } \value{ -a character vector. +a list of character vectors (one per query). If \code{choices} is used, a single named vector is returned instead. } \description{ Search synonyms using PUG-REST, diff --git a/man/ping.Rd b/man/ping.Rd deleted file mode 100644 index 2badc405..00000000 --- a/man/ping.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ping.R -\name{ping} -\alias{ping} -\alias{ping_pubchem} -\alias{ping_pubchem_pw} -\alias{ping_cs} -\alias{ping_pan} -\title{Ping an API used in webchem to see if it's working.} -\usage{ -ping_pubchem(...) 
- -ping_pubchem_pw(...) - -ping_cs(...) - -ping_pan(...) -} -\arguments{ -\item{...}{Curl options passed on to \code{\link[httr]{GET}} or \code{\link[httr]{POST}}} -} -\value{ -A logical, TRUE or FALSE - -TRUE if pubchem is reachable - -TRUE if pubchem PUG-VIEW is reachable - -TRUE if ChemSpider is reachable - -TRUE if PAN is reachable -} -\description{ -Ping an API used in webchem to see if it's working. -} -\examples{ -\dontrun{ - # might fail if API is not available - ping_pubchem() - } -\dontrun{ - # might fail if API is not available - ping_pubchem_pw() - } -\dontrun{ - # might fail if API is not available - ping_cs() - } -\dontrun{ - # might fail if API is not available - ping_pan() - } -} diff --git a/man/ping_service.Rd b/man/ping_service.Rd new file mode 100644 index 00000000..5d9045be --- /dev/null +++ b/man/ping_service.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ping.R +\name{ping_service} +\alias{ping_service} +\title{Ping an API used in webchem to see if it's working.} +\usage{ +ping_service( + service = c("aw", "chebi", "ci", "cs", "cs_web", "cir", "cts", "etox", "fn", "nist", + "opsin", "pan", "pc", "srs", "wd") +) +} +\arguments{ +\item{service}{character; the same abbreviations used as prefixes in \code{webchem} functions, with the exception of \code{"cs_web"}, which only checks if the ChemSpider website is up, and thus doesn't require an API key.} +} +\value{ +A logical, TRUE if the service is available or FALSE if it isn't +} +\description{ +Ping an API used in webchem to see if it's working. +} +\examples{ +\dontrun{ +ping_service("pan") +} +} diff --git a/tests/testthat.R b/tests/testthat.R index c3ce1058..32577da7 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -6,6 +6,6 @@ library('webchem') # Because we use travis CI we will hear about any test failures as soon as they # happen. 
So, let's skip all tests on CRAN: # -# if (identical(Sys.getenv("NOT_CRAN"), "true")) { -# test_check("webchem") -# } +if (identical(Sys.getenv("NOT_CRAN"), "true")) { + test_check("webchem") +} diff --git a/tests/testthat/test-alanwood.R b/tests/testthat/test-alanwood.R index 9f09fe86..6a746de8 100644 --- a/tests/testthat/test-alanwood.R +++ b/tests/testthat/test-alanwood.R @@ -1,16 +1,30 @@ -context("alanwood") +up <- ping_service("aw") +test_that("examples in the article are unchanged", { + skip_on_cran() + skip_if_not(up, "Alanwood service is down") + + data("lc50", package = "webchem") + aw_data <- aw_query(lc50$cas[1:3], type = "cas") + igroup <- sapply(aw_data, function(y) y$subactivity[1]) + expect_is(igroup, "character") + expect_equal(names(igroup), c("50-29-3", "52-68-6", "55-38-9")) + expect_equal(unname(igroup), c("organochlorine insecticides", + "phosphonate insecticides", + "phenyl organothiophosphate insecticides")) +}) test_that("alanwood, commonname", { skip_on_cran() + skip_if_not(up, "Alanwood service is down") comps <- c("Fluazinam", "S-Metolachlor", "balloon", NA) o1 <- aw_query(comps, type = "commonname") - expect_is(o1, "list") + expect_type(o1, "list") expect_equal(length(o1), 4) - expect_is(o1[[1]], "list") - expect_is(o1[[2]], "list") + expect_type(o1[[1]], "list") + expect_type(o1[[2]], "list") expect_equal(o1[[3]], NA) expect_equal(o1[[4]], NA) expect_equal(o1[["Fluazinam"]]$cas, "79622-59-6") @@ -20,30 +34,23 @@ test_that("alanwood, commonname", { }) -test_that("alanwood, cas", { +test_that("alanwood, invalid input", { skip_on_cran() + skip_if_not(up, "Alanwood service is down") - comps <- c("79622-59-6", "87392-12-9", "balloon", NA) - o1 <- aw_query(comps, type = "cas") - + comps <- c("balloon", NA) + o1 <- aw_query(comps) expect_is(o1, "list") - expect_equal(length(o1), 4) - expect_is(o1[[1]], "list") - expect_is(o1[[2]], "list") - expect_equal(o1[[3]], NA) - expect_equal(o1[[4]], NA) - expect_equal(o1[[1]]$cas, "79622-59-6") - 
expect_equal(length(o1[[2]]$inchikey), 2) - expect_equal(length(o1[[2]]$inchi), 2) - expect_equal(length(o1[[1]]), 11) - expect_true(is.na(aw_query("12071-83-9", type = "cas")[[1]]$inchi)) + expect_equal(o1[[1]], NA) + expect_equal(o1[[2]], NA) }) test_that("alanwood, build_index", { skip_on_cran() + skip_if_not(up, "Alanwood service is down") idx <- suppressWarnings(build_aw_idx(verbose = FALSE, force_build = TRUE)) - expect_is(idx, "data.frame") + expect_s3_class(idx, "data.frame") expect_equal(ncol(idx), 4) expect_equal(names(idx), c("names", "links", "linknames", "source")) expect_equal(unique(idx$source), c("rn", "cn")) diff --git a/tests/testthat/test-chebi.R b/tests/testthat/test-chebi.R index 4c5af9a7..b4a16ece 100644 --- a/tests/testthat/test-chebi.R +++ b/tests/testthat/test-chebi.R @@ -1,7 +1,33 @@ -context("chebi") +up <- ping_service("chebi") +test_that("examples in the article are unchanged", { + skip_on_cran() + skip_if_not(up, "CHEBI service is down") + + data("lc50", package = "webchem") + cas_rns <- lc50[order(lc50$value)[1:3], "cas"] + chebiids <- get_chebiid(cas_rns) + comp <- chebi_comp_entity(chebiids$chebiid) + pars <- lapply(comp, function(x) { + with(x, parents[parents$type == "has role", ]) + }) + + expect_equal(cas_rns, c("563-12-2", "96182-53-5", "3383-96-8")) + expect_equal(chebiids$chebiid, c("CHEBI:38663", "CHEBI:38951", "CHEBI:38954")) + expect_equal(chebiids$chebiasciiname, c("ethion", "tebupirimfos", "temephos")) + expect_equal(pars$`CHEBI:38663`$chebiName, + c("insecticide", "environmental contaminant", + "EC 3.1.1.7 (acetylcholinesterase) inhibitor", "acaricide", + "agrochemical")) + expect_equal(pars$`CHEBI:38951`$chebiName, + "EC 3.1.1.7 (acetylcholinesterase) inhibitor") + expect_equal(pars$`CHEBI:38954`$chebiName, + c("EC 3.1.1.7 (acetylcholinesterase) inhibitor", "acaricide", + "agrochemical", "ectoparasiticide")) +}) test_that("chebi returns correct results", { skip_on_cran() + skip_if_not(up, "CHEBI service is down") a <- 
get_chebiid("Glyphosate", from = "ALL") b <- get_chebiid(c("triclosan", "glyphosate", "balloon", NA)) A <- chebi_comp_entity("CHEBI:27744") diff --git a/tests/testthat/test-chemid.R b/tests/testthat/test-chemid.R index e3d6ae74..c481bba7 100644 --- a/tests/testthat/test-chemid.R +++ b/tests/testthat/test-chemid.R @@ -1,45 +1,40 @@ -context("chemid") - - +up <- ping_service("ci") test_that("chemid returns correct results", { skip_on_cran() + skip_if_not(up, "CHEMID service is down") + skip("failing tests below") - o1 <- ci_query(c('xxxxx', NA, 'Aspirin', 'Triclosan'), type = 'name', match = 'best') o2 <- ci_query('50-00-0', type = 'rn') o3 <- ci_query('WSFSSNUMVMOOMR-UHFFFAOYSA-N', type = 'inchikey') + expect_type(o2, 'list') + expect_type(o3, 'list') - # test multiple matches - m1 <- ci_query('Tetracyclin', type = 'name', match = 'first') - m2 <- ci_query('Tetracyclin', type = 'name', match = 'best') - m3 <- ci_query('Tetracyclin', type = 'name', match = 'na') - - b1 <- ci_query('Tetracyclin', type = 'name') # BUG: Failed because of multiple matches - expect_equal(b1[[1]]$name[1], "Tetracycline") - - b2 <- ci_query('Edetic acid', type = 'name', match = 'best') - expect_equal(b2[[1]]$name[1], "Edetic acid") - expect_equal(attr(b2[[1]],'distance'), 0) - + o1 <- ci_query(c('xxxxx', NA, 'Aspirin', 'Triclosan'), type = 'name', match = 'best') expect_is(o1, 'list') - expect_is(o2, 'list') - expect_is(o3, 'list') - expect_is(m1, 'list') - expect_is(m2, 'list') - expect_is(m3, 'list') expect_true(length(o1) == 4) expect_true(is.na(o1[[1]])) expect_true(is.na(o1[[2]])) - expect_equal(o1[[3]]$name[2], "Aspirin") - expect_equal(length(o1[[3]]), 9) - expect_true(is.data.frame(o1[[3]]$physprop)) + expect_length(o1[[3]], 9) + expect_s3_class(o1[[3]]$physprop, "data.frame") + + b1 <- ci_query('Tetracyclin', type = 'name') # BUG: Failed because of multiple matches + expect_equal(b1[[1]]$name[1], "Tetracycline") + b2 <- ci_query('Edetic acid', type = 'name', match = 'best') + 
expect_equal(b2[[1]]$name[1], "Edetic acid") + expect_equal(attr(b2[[1]],'distance'), 0) + # test multiple matches + m1 <- ci_query('Tetracyclin', type = 'name', match = 'first') + m2 <- ci_query('Tetracyclin', type = 'name', match = 'best') + m3 <- ci_query('Tetracyclin', type = 'name', match = 'na') + + expect_type(m1, 'list') + expect_type(m2, 'list') + expect_type(m3, 'list') expect_equal(m1[[1]]$cas, "60-54-8") expect_equal(m2[[1]]$cas, "60-54-8") expect_equal(m3[[1]], NA) }) - - - diff --git a/tests/testthat/test-chemspider.R b/tests/testthat/test-chemspider.R index 97f408bc..72bf55af 100644 --- a/tests/testthat/test-chemspider.R +++ b/tests/testthat/test-chemspider.R @@ -1,20 +1,44 @@ -context("chemspider") +#test might still fail if website is up but API service is down. +up <- ping_service("cs_web") +test_that("examples in the article are unchanged", { + skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + + #values come from test-pubchem + smiles <- c("CC1=CC(=C(C=C1)O)C", "CC1=C(C=CC(=C1)Cl)O", NA, + "CCNC1=NC(=NC(=N1)Cl)NC(C)C", "C1=CC=CC=C1", + "CC(C)NC1=NC(=NC(=N1)N)Cl") + csids <- get_csid(smiles, from = "smiles") + inchikeys <- cs_convert(csids$csid, from = "csid", to = "inchikey") + + expect_equal(csids$csid, c(13839123, 14165, NA, 2169, 236, 21157)) + expect_equal(inchikeys, + c("KUFFULVDNCHOFZ-UHFFFAOYAC", "RHPUJHQBPORFGV-UHFFFAOYAB",NA, + "MXWJVTOOROXGIU-UHFFFAOYAJ", "UHOVQNZJYSORNB-UHFFFAOYAH", + "DFWFIQKMSFGDCQ-UHFFFAOYAI")) +}) test_that("cs_check_key() can find API key in my local .Renviron", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() expect_type(cs_check_key(), "character") }) test_that("cs_datasources()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") a <- cs_datasources() expect_is(a, "character") }) test_that("cs_control()", { - skip_on_cran() - expect_is(cs_control(), "list") + 
expect_type(cs_control(), "list") expect_true("datasources" %in% names(cs_control())) expect_true("order_by" %in% names(cs_control())) expect_true("order_direction" %in% names(cs_control())) @@ -45,46 +69,66 @@ test_that("cs_control()", { expect_true(cs_control(isotopic = "unlabeled")$isotopic == "unlabeled") }) -test_that("get_csid()", { +test_that("get_csid() works with defaults", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- get_csid("Triclosan") b <- get_csid("Naproxene") ab <- get_csid(c("Triclosan", "Naproxene")) abcd <- get_csid(c("ethanol", "balloon", NA, "acetic acid")) - c1 <- get_csid("Oxygen", order_by = "recordId") - #c2 <- get_csid("Oxygen", control = cs_control(order_by = "massDefect")) - c3 <- get_csid("Oxygen", order_by = "molecularWeight") - c4 <- get_csid("Oxygen", order_by = "referenceCount") - c5 <- get_csid("Oxygen", order_by = "dataSourceCount") - c6 <- get_csid("Oxygen", order_by = "pubMedCount") - c7 <- get_csid("Oxygen", order_by = "rscCount") - c8 <- get_csid("Oxygen", order_direction = "ascending") - c9 <- get_csid("Oxygen", order_direction = "descending") - f <- get_csid("C47H93N2O6P", from = "formula", - order_by = "dataSourceCount", - order_direction = "descending" - ) expect_is(a, "data.frame") expect_equal(a$csid, 5363) expect_equal(b$csid, 137720) expect_equal(ab$csid, c(5363, 137720)) - expect_equal(c1$csid, c(952, 140526)) - #expect_equal(c2$Oxygen,c(952,140526)) does not work. - #seems to be an API error. 
- expect_equal(c3$csid, c(140526, 952)) - expect_equal(c4$csid, c(952, 140526)) - expect_equal(c5$csid, c(140526, 952)) - expect_equal(c6$csid, c(952, 140526)) - expect_equal(c7$csid, c(952, 140526)) - expect_equal(c8$csid, c(952, 140526)) - expect_equal(c9$csid, c(140526, 952)) expect_equal(abcd$csid, c(682, NA, NA, 171)) - expect_equal(f$csid, c(24846874, 59696525, 68025876, 71044200, 24608396)) +}) + +test_that("get_csid() works with arguments passed to cs_control()", { + skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + + c1 <- head(get_csid("iron oxide", from = "name", order_by = "recordId")) + expect_equal(c1$csid, c(14147, 14237, 55474, 82623, 392353, 396260)) + + # c2 <- head(get_csid("C6H12O6", from = "formula", order_by = "massDefect")) + # not a column in the web interface, so not sure what to expect + + c3 <- head(get_csid("iron oxide", from = "name", order_by = "molecularWeight")) + expect_equal(c3$csid, c(14237, 396260, 392353, 82623, 14147, 452497)) + + c4 <- head(get_csid("C6H12O6", from = "formula", order_by = "referenceCount", + order_direction = "descending")) + expect_equal(c4$csid, c(23139, 1070, 868, 388747, 161434, 96749)) + + c5 <- head(get_csid("C6H12O6", from = "formula", order_by = "dataSourceCount", + order_direction = "descending")) + expect_equal(c5$csid, c(5764, 10239179, 83142, 96749, 58238, 71358)) + + c6 <- head(get_csid("C6H12O6", from = "formula", order_by = "pubMedCount", + order_direction = "descending")) + expect_equal(c6$csid, c(96749, 5589, 71358, 58238, 9484839, 9312824)) + + c7 <- head(get_csid("C6H12O6", from = "formula", order_by = "rscCount", + order_direction = "descending")) + expect_equal(c7$csid, c(96749, 5589, 71358, 58238, 9312824, 9484839)) + + c8 <- head(get_csid("iron oxide", from = "name", order_by = "molecularWeight", + order_direction = "descending")) + expect_equal(c8$csid, c(4937312, 55474, 14147, 452497, 82623, 392353)) }) 
test_that("cs_smiles_csid()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_smiles_csid("CC(O)=O") expect_is(a, "integer") @@ -93,6 +137,10 @@ test_that("cs_smiles_csid()", { test_that("cs_inchi_csid()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_inchi_csid(inchi = "InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)") expect_is(a, "integer") @@ -101,6 +149,10 @@ test_that("cs_inchi_csid()", { test_that("cs_inchikey_csid()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_inchikey_csid("QTBSBXVTEAMEQO-UHFFFAOYSA-N") expect_is(a, "integer") @@ -109,6 +161,10 @@ test_that("cs_inchikey_csid()", { test_that("cs_convert_multiple()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_convert_multiple("CC(=O)O", "smiles", "inchi") a_rev <- cs_convert_multiple(a, "inchi", "smiles") b <- cs_convert_multiple("InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)", "inchi", @@ -133,14 +189,18 @@ test_that("cs_convert_multiple()", { test_that("cs_convert()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_convert(171, "csid", "inchi") a_rev <- cs_convert(a, "inchi", "csid") a2 <- cs_convert(c(171, 172), "csid", "inchi") a2_rev <- cs_convert(a2, "inchi", "csid") - b <- cs_convert(171, "csid", "inchikey") - b_rev <- cs_convert(b, "inchikey", "csid") - b2 <- cs_convert(c(171, 172), "csid", "inchikey") - b2_rev <- cs_convert(b2, "inchikey", "csid") + b_rev <- cs_convert("QTBSBXVTEAMEQO-UHFFFAOYAR", "inchikey", "csid") + b2_rev <- cs_convert( + c("QTBSBXVTEAMEQO-UHFFFAOYAR", "IKHGUXGNUITLKF-UHFFFAOYSA-N"), + "inchikey", "csid") c <- cs_convert(171, "csid", "smiles") c_rev <- cs_convert(c, "smiles", 
"csid") c2 <- cs_convert(c(171, 172), "csid", "smiles") @@ -161,16 +221,16 @@ test_that("cs_convert()", { g2 <- cs_convert(a2, "inchi", "mol") h <- cs_convert("QTBSBXVTEAMEQO-UHFFFAOYSA-N", "inchikey", "mol") h_rev <- cs_convert(h, "mol", "inchikey") - h2 <- cs_convert(b2, "inchikey", "mol") + h2 <- cs_convert( + c("QTBSBXVTEAMEQO-UHFFFAOYAR", "IKHGUXGNUITLKF-UHFFFAOYSA-N"), + "inchikey", "mol") h2_rev <- cs_convert(h2, "mol", "inchikey") expect_equal(a, "InChI=1/C2H4O2/c1-2(3)4/h1H3,(H,3,4)") expect_equal(a_rev, 171) expect_length(a2, 2) expect_length(a2_rev, 2) - expect_equal(b, "QTBSBXVTEAMEQO-UHFFFAOYAR") expect_equal(b_rev, 171) - expect_length(b2, 2) expect_length(b2_rev, 2) expect_length(c2, 2) expect_length(c2_rev, 2) @@ -198,6 +258,10 @@ test_that("cs_convert()", { test_that("cs_compinfo()", { skip_on_cran() + skip_on_appveyor() + skip_on_travis() + skip_if_not(up, "ChemSpider service is down, skipping tests") + a <- cs_compinfo(171, c("SMILES", "Formula", "InChI", "InChIKey", "StdInChI", "StdInChIKey", "AverageMass", "MolecularWeight", "MonoisotopicMass", "NominalMass", "CommonName", @@ -236,4 +300,4 @@ test_that("cs_compinfo()", { # expect_equal(tt2[["average_mass"]], # "289.5418") # expect_equal(ncol(tt2), 14) -# }) \ No newline at end of file +# }) diff --git a/tests/testthat/test-cir.R b/tests/testthat/test-cir.R index 7db028c6..7a61b425 100644 --- a/tests/testthat/test-cir.R +++ b/tests/testthat/test-cir.R @@ -1,27 +1,37 @@ -context("cir") - +up <- ping_service("cir") test_that("cir_query()", { skip_on_cran() + skip_if_not(up, "CIR server is down") - Sys.sleep(5) expect_equal(cir_query('Triclosan', 'mw', verbose = FALSE)[[1]], 289.5451) - Sys.sleep(5) expect_equal(cir_query('xxxxxxx', 'mw', verbose = FALSE)[[1]], NA) - Sys.sleep(5) expect_equal(cir_query("3380-34-5", 'stdinchikey', resolver = 'cas_number', verbose = FALSE)[[1]], "InChIKey=XEFQLINVKFYRCS-UHFFFAOYSA-N") - Sys.sleep(5) expect_true(length(cir_query('Triclosan', 'cas', verbose = 
FALSE)[[1]]) > 1) - Sys.sleep(5) - expect_equal(length(cir_query('Triclosan', 'cas', first = TRUE, verbose = FALSE)[[1]]), 1) - Sys.sleep(5) - expect_equal(length(cir_query(c('Triclosan', 'Aspirin'), 'cas', verbose = FALSE)), 2) + expect_message(cir_query("acetic acid", "mw", first = TRUE)) + expect_length(cir_query('Triclosan', 'cas', choices = 1, verbose = FALSE)[[1]], 1) + expect_length(cir_query(c('Triclosan', 'Aspirin'), 'cas', verbose = FALSE), 2) + + skip("I have no clue why this one fails on R CMD check. It works when run in the console!") + expect_equivalent(cir_query('acetic acid', 'mw', choices = 1), c(`acetic acid` = 60.0524)) - Sys.sleep(5) - expect_equal(cir_query('acetic acid', 'mw', first = TRUE), c(`acetic acid` = 60.0524)) }) test_that("cir_query() doesn't mistake NA for sodium", { - Sys.sleep(5) + skip_on_cran() + skip_if_not(up, "CIR server is down") + expect_true(is.na(cir_query(as.character(NA), 'cas'))) }) + +test_that("cir_img()", { + skip_on_cran() + + Sys.sleep(5) + expect_error(cir_img('Glyphosate', dir = NULL)) + Sys.sleep(5) + expect_s3_class(cir_img('Isoproturon', dir = tempdir()), 'data.frame') + Sys.sleep(5) + expect_equal(nrow(cir_img(c('Metamitron', 'Diclofenac'), dir = tempdir())), 2) +}) + diff --git a/tests/testthat/test-cts.R b/tests/testthat/test-cts.R index 7d62618f..802ac2a2 100644 --- a/tests/testthat/test-cts.R +++ b/tests/testthat/test-cts.R @@ -1,55 +1,36 @@ -context("cts") - -require(RCurl) -chk_cts <- function(){ - qurl <- 'http://cts.fiehnlab.ucdavis.edu/service/compound/XEFQLINVKFYRCS-UHFFFAOYSA-N' - Sys.sleep(0.5) - cont <- try(getURL(qurl, .encoding = 'UTF-8', .opts = list(timeout = 3)), - silent = TRUE) - if (inherits(cont, 'try-error')) - skip("Server is down!") -} - -# chk_cir <- function(){ -# qurl <- 'http://cactus.nci.nih.gov/chemical/structure/Triclosan/cas/xml' -# Sys.sleep(0.5) -# cont <- try(getURL(qurl, .encoding = 'UTF-8', .opts = list(timeout = 3)), -# silent = TRUE) -# if (inherits(cont, 'try-error')) 
-# skip("Server is down!") -# } - - +up <- ping_service("cts") test_that("cts_compinfo()", { skip_on_cran() - chk_cts() + skip_if_not(up, "CTS service down") + expect_error(cts_compinfo('xxx')) - o1 <- cts_compinfo("XEFQLINVKFYRCS-UHFFFAOYSA-N", verbose = FALSE) - o2 <- cts_compinfo(c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "XEFQLINVKFYRCS-UHFFFAOYSA-X"), verbose = FALSE) - expect_equal(cts_compinfo("XEFQLINVKFYRCS-UHFFFAOYSA-X", verbose = FALSE)[[1]], NA) + o1 <- suppressWarnings(cts_compinfo("XEFQLINVKFYRCS-UHFFFAOYSA-N", verbose = FALSE)) + o2 <- suppressWarnings(cts_compinfo(c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "XEFQLINVKFYRCS-UHFFFAOYSA-X"), verbose = FALSE)) + expect_equal(suppressWarnings(cts_compinfo("XEFQLINVKFYRCS-UHFFFAOYSA-X", verbose = FALSE))[[1]], NA) expect_warning(cts_compinfo("XEFQLINVKFYRCS-UHFFFAOYSA-X", verbose = FALSE)) - expect_equal(length(o1[[1]]), 10) + expect_length(o1[[1]], 10) expect_equal(round(o1[[1]][["molweight"]], 3), 289.542) - expect_equal(length(o2), 2) + expect_length(o2, 2) expect_true(is.na(o2[[2]])) }) test_that("cts_convert()", { skip_on_cran() + skip_if_not(up, "CTS service down") - chk_cts() - comp <- c('XEFQLINVKFYRCS-UHFFFAOYSA-N', 'BSYNRYMUTXBXSQ-UHFFFAOYSA-N') + comp <- c('Triclosan', 'Hexane') expect_error(cts_convert(comp, c('Chemical Name', 'CAS'), 'CAS')) expect_error(cts_convert('Triclosan', 'CAS')) - o1 <- cts_convert(comp, 'Chemical Name', 'inchikey', first = TRUE, verbose = FALSE) - expect_equal(o1[[1]], 'XEFQLINVKFYRCS-UHFFFAOYSA-N') - expect_equal(length(o1), 2) - expect_true(is.na(cts_convert('xxxx', 'inchikey', 'Chemical Name')[[1]])) + expect_true(is.na(suppressWarnings(cts_convert('xxxx', 'Chemical Name', 'inchikey'))[[1]])) + o1 <- cts_convert(comp, 'Chemical Name', 'inchikey', choices = 1, verbose = FALSE) + expect_length(o1, 2) - # cts_convert('acetic acid', 'Chemical Name', 'CAS', first = TRUE) + expect_equal(o1[[1]], 'XEFQLINVKFYRCS-UHFFFAOYSA-N') + # cts_convert('acetic acid', 'Chemical Name', 'CAS', choices = 
1) + expect_equivalent(cts_convert(NA, from = "Chemical Name", to = "inchikey"), NA) }) @@ -65,10 +46,10 @@ test_that("cts_convert()", { test_that("fromto", { skip_on_cran() - + skip_if_not(up, "CTS service down") to <- cts_to() from <- cts_from() - expect_true(is.character(to)) - expect_true(is.character(from)) + expect_type(to, "character") + expect_type(from, "character") }) diff --git a/tests/testthat/test-etox.R b/tests/testthat/test-etox.R index 80aa3210..bc2e672c 100644 --- a/tests/testthat/test-etox.R +++ b/tests/testthat/test-etox.R @@ -1,7 +1,49 @@ -context("etox") +up <- ping_service("etox") +test_that("examples in the article are unchanged", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") + + data("jagst", package = "webchem") + subs <- head(unique(jagst$substance)) + ids <- get_etoxid(subs, match = "best") + etox_data <- etox_basic(ids$etoxid) + #values go to test-pubchem + etox_cas <- cas(etox_data) + eqs <- etox_targets(c("8397", "7240", "8836", "7442", "7571", "8756")) + macs <- suppressWarnings(sapply(eqs, function(y) { + if (length(y) == 1 && is.na(y)) { + return(NA) + } else { + res <- y$res + min(res[res$Country_or_Region == "EEC / EU" & + res$Designation == "MAC-EQS", "Value_Target_LR"]) + } + })) + + expect_is(ids, "data.frame") + expect_equal(names(ids), c("query", "match", "etoxid")) + expect_equal(ids$etoxid, + c("8668", "8494", NA, "8397", "7240", "7331")) + expect_equal( + ids$match, + c("2,4-Xylenol ( 8668 )", "4-Chlor-2-methylphenol ( 8494 )", NA, + "Atrazin ( 8397 )", "Benzol ( 7240 )", "Desethylatrazin ( 7331 )")) + expect_equal(ids$query, + c("2,4-Dimethylphenol", "4-Chlor-2-methylphenol", + "4-para-nonylphenol", "Atrazin", "Benzol", "Desethylatrazin")) + + expect_is(etox_cas, "character") + expect_equal(names(etox_cas), + c("8668", "8494", NA, "8397", "7240", "7331")) + expect_equal(unname(etox_cas),c("105-67-9", "1570-64-5", NA, "1912-24-9", + "71-43-2", "6190-65-4")) + expect_equal(unname(macs), c(2.000, 50.000, 
0.016, 1.000, 4.000, 0.034), + tolerance = 10^-4) +}) test_that("get_etoxid returns correct results", { skip_on_cran() + skip_if_not(up, "ETOX service is down") # test general comps <- c("Triclosan", "Glyphosate") @@ -14,23 +56,28 @@ test_that("get_etoxid returns correct results", { o7 <- get_etoxid("203-157-5", from = "ec") do2 <- get_etoxid("Thiamethoxam") - expect_is(o1, "data.frame") - expect_is(o2, "data.frame") - expect_is(o3, "data.frame") - expect_is(o4, "data.frame") - expect_is(o5, "data.frame") - expect_is(o6, "data.frame") - expect_is(o7, "data.frame") - expect_is(do2, "data.frame") + expect_s3_class(o1, "data.frame") + expect_s3_class(o2, "data.frame") + expect_s3_class(o3, "data.frame") + expect_s3_class(o4, "data.frame") + expect_s3_class(o5, "data.frame") + expect_s3_class(o6, "data.frame") + expect_s3_class(o7, "data.frame") + expect_s3_class(do2, "data.frame") expect_equal(o1$etoxid, c("20179", "9051")) expect_equal(o2$etoxid, c("89236", "20179", "9051")) +}) + +test_that("examples from webchem article run", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") # tests for the article data("jagst") ids <- get_etoxid(head(unique(jagst$substance),6), match = "best") - expect_is(ids, "data.frame") + expect_s3_class(ids, "data.frame") expect_equal(ids$etoxid, c("8668","8494",NA,"8397","7240","7331")) expect_equal(ids$match, c( "2,4-Xylenol ( 8668 )", @@ -51,88 +98,95 @@ test_that("get_etoxid returns correct results", { }) -# test_that("etox_basic returns correct results", { -# skip_on_cran() -# -# ids <- c("20179", "9051", "xxxxx", NA) -# o1 <- etox_basic(ids) -# -# expect_is(o1, 'list') -# expect_equal(length(o1), 4) -# expect_equal(o1[['20179']]$cas, "3380-34-5") -# expect_equal(length(o1[['20179']]), 5) -# expect_is(o1[['20179']]$synonyms, 'data.frame') -# expect_true(is.na(o1[[3]])) -# expect_true(is.na(o1[[4]])) -#}) -# -# -# test_that("etox_targets returns correct results", { -# skip_on_cran() -# -# ids <- c("20179", "9051", "xxxxx", NA) 
-# o1 <- etox_targets(ids) -# -# expect_is(o1, 'list') -# expect_equal(length(o1), 4) -# expect_equal(o1[['20179']]$res$Substance[1], "Triclosan") -# expect_equal(ncol(o1[['20179']]$res), 33) -# expect_is(o1[['20179']]$res, 'data.frame') -# expect_true(is.na(o1[[3]])) -# expect_true(is.na(o1[[4]])) -# }) - -# test_that("etox_tests returns correct results", { -# skip_on_cran() -# -# ids <- c("20179", "9051", "xxxxx", NA) -# o1 <- etox_tests(ids) -# -# expect_is(o1, 'list') -# expect_equal(length(o1), 4) -# expect_equal(o1[['20179']]$res$Substance[1], "Triclosan") -# expect_equal(ncol(o1[['20179']]$res), 41) -# expect_is(o1[['20179']]$res, 'data.frame') -# expect_true(is.na(o1[[3]])) -# expect_true(is.na(o1[[4]])) -# }) -# -# -# test_that("etox integration tests", { -# skip_on_cran() -# -# comps <- c('Triclosan', 'Glyphosate', 'xxxx') -# ids_b <- get_etoxid(comps, match = 'best') -# ids_a <- get_etoxid(comps, match = 'all') -# -# # etox_*() can handle only vector inputs (so using match = 'all' does not work) -# expect_error(etox_basic(ids_a)) -# expect_error(etox_targets(ids_a)) -# expect_error(etox_tests(ids_a)) -# -# -# int1 <- etox_basic(ids_b$etoxid) -# int2 <- etox_targets(ids_b$etoxid) -# int3 <- etox_tests(ids_b$etoxid) -# -# expect_is(int1, 'list') -# expect_equal(length(int1), 3) -# expect_equal(int1[['20179']]$cas, "3380-34-5") -# expect_equal(length(int1[['20179']]), 5) -# expect_is(int1[['20179']]$synonyms, 'data.frame') -# expect_true(is.na(int1[[3]])) -# -# expect_is(int2, 'list') -# expect_equal(length(int2), 3) -# expect_equal(int2[['20179']]$res$Substance[1], "Triclosan") -# expect_equal(ncol(int2[['20179']]$res), 33) -# expect_is(int2[['20179']]$res, 'data.frame') -# expect_true(is.na(int2[[3]])) -# -# expect_is(int3, 'list') -# expect_equal(length(int3), 3) -# expect_equal(int3[['20179']]$res$Substance[1], "Triclosan") -# expect_equal(ncol(int3[['20179']]$res), 41) -# expect_is(int3[['20179']]$res, 'data.frame') -# expect_true(is.na(int3[[3]])) -# 
}) \ No newline at end of file +test_that("etox_basic returns correct results", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") + + ids <- c("20179", "9051", "xxxxx", NA) + o1 <- etox_basic(ids) + + expect_s3_class(o1, 'list') + expect_equal(length(o1), 4) + expect_equal(o1[['20179']]$cas, "3380-34-5") + expect_equal(length(o1[['20179']]), 5) + expect_s3_class(o1[['20179']]$synonyms, 'data.frame') + expect_true(is.na(o1[[3]])) + expect_true(is.na(o1[[4]])) +}) + +test_that("etox_targets returns correct results", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") + + ids <- c("20179", "9051", "xxxxx", NA) + o1 <- etox_targets(ids) + + expect_type(o1, 'list') + expect_equal(length(o1), 4) + expect_equal(o1[['20179']]$res$Substance[1], "Triclosan") + expect_equal(ncol(o1[['20179']]$res), 33) + expect_s3_class(o1[['20179']]$res, 'data.frame') + expect_true(is.na(o1[[3]])) + expect_true(is.na(o1[[4]])) +}) + +test_that("etox_tests returns correct results", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") + + ids <- c("20179", "9051", "xxxxx", NA) + o1 <- etox_tests(ids) + + expect_type(o1, 'list') + expect_equal(length(o1), 4) + expect_equal(o1[['20179']]$res$Substance[1], "Triclosan") + expect_equal(ncol(o1[['20179']]$res), 41) + expect_s3_class(o1[['20179']]$res, 'data.frame') + expect_true(is.na(o1[[3]])) + expect_true(is.na(o1[[4]])) +}) + + +test_that("etox integration tests", { + skip_on_cran() + skip_if_not(up, "ETOX service is down") + + comps <- c('Triclosan', 'Glyphosate', 'xxxx') + ids_b <- get_etoxid(comps, match = 'best') + ids_a <- get_etoxid(comps, match = 'all') + + # etox_*() can handle only vector inputs + expect_error(etox_basic(ids_b)) + expect_error(etox_targets(ids_b)) + expect_error(etox_tests(ids_b)) + + int1 <- etox_basic(ids_b$etoxid) + int2 <- etox_targets(ids_b$etoxid) + int3 <- etox_tests(ids_b$etoxid) + + expect_type(int1, 'list') + expect_equal(length(int1), 3) + expect_equal(int1[['20179']]$cas, 
"3380-34-5") + expect_equal(length(int1[['20179']]), 5) + expect_s3_class(int1[['20179']]$synonyms, 'data.frame') + expect_true(is.na(int1[[3]])) + + expect_type(int2, 'list') + expect_equal(length(int2), 3) + expect_equal(int2[['20179']]$res$Substance[1], "Triclosan") + expect_equal(ncol(int2[['20179']]$res), 33) + expect_s3_class(int2[['20179']]$res, 'data.frame') + expect_true(is.na(int2[[3]])) + + expect_type(int3, 'list') + expect_equal(length(int3), 3) + expect_equal(int3[['20179']]$res$Substance[1], "Triclosan") + expect_equal(ncol(int3[['20179']]$res), 41) + expect_s3_class(int3[['20179']]$res, 'data.frame') + expect_true(is.na(int3[[3]])) +}) + + +test_that("etox functions handle NAs", { + expect_equal(is.na(get_etoxid(NA)$match), TRUE) +}) diff --git a/tests/testthat/test-extractors.R b/tests/testthat/test-extractors.R index 15c5dfcd..89703800 100644 --- a/tests/testthat/test-extractors.R +++ b/tests/testthat/test-extractors.R @@ -1,79 +1,83 @@ -context('extractors') -# ChemSpider -token <- '37bf5e57-9091-42f5-9274-650a64398aaf' -out_cs_compinfo <- cs_compinfo('5363', token = token) -out_cs_extcompinfo <- cs_extcompinfo('5363', token = token) - -# CTS -inchikeys <- c("XEFQLINVKFYRCS-UHFFFAOYSA-N","BSYNRYMUTXBXSQ-UHFFFAOYSA-N" ) -out_cts_compinfo <- cts_compinfo(inchikeys) +test_that("extractors work with etox", { + skip_on_cran() + skip_if_not(ping_service("etox"), "ETOX service is down") -# ETOX -out_etox_basic <- etox_basic(8252) + out_etox_basic <- etox_basic(8252) + expect_equivalent(cas(out_etox_basic), "50-00-0") + expect_error(inchikey(out_etox_basic)) + expect_error(smiles(out_etox_basic)) +}) -# ChemID -out_ci_query <- ci_query(c('Aspirin', 'Triclosan'), type = 'name') +test_that("extractors work with chemid", { + skip_on_cran() + skip_if_not(ping_service("ci"), "CHEMID service is down") -# OPSIN -out_opsin_query <- opsin_query(c('Cyclopropane', 'Octane')) + skip("ci_query isn't working right now") + out_ci_query <- ci_query(c('Aspirin', 
'Triclosan'), type = 'name') + expect_equivalent(cas(out_ci_query), c("50-78-2", "3380-34-5")) + expect_equivalent(inchikey(out_ci_query), + c("BSYNRYMUTXBXSQ-UHFFFAOYSA-N", "XEFQLINVKFYRCS-UHFFFAOYSA-N")) + expect_equivalent(smiles(out_ci_query), c("CC(=O)", "c1(Oc2c(cc(Cl)")) +}) -# Alan wood -out_aw_query <- aw_query(c('Fluazinam', 'Diclofop'), type = 'com') +test_that("extractors work with opsin", { + skip_on_cran() + skip_if_not(ping_service("opsin"), "OPSIN service is down") -# Wikidata -id <- c("Q408646", "Q18216") -out_wd_ident <- wd_ident(id) -# Pubchem -out_pc_prop <- pc_prop(c(5564, 2244)) -out_pc_prop2 <- pc_prop(5564, properties = c('MolecularFormula', 'MolecularWeight')) + out_opsin_query <- opsin_query(c('Cyclopropane', 'Octane')) + expect_error(cas(out_opsin_query), "CAS is not returned by this datasource!") + expect_equivalent(inchikey(out_opsin_query), + c("LVZWSLJZHVFIQJ-UHFFFAOYSA-N", "TVMXDCGIABBOFY-UHFFFAOYSA-N")) + expect_equivalent(smiles(out_opsin_query), c("C1CC1", "CCCCCCCC")) +}) -# pan -out_pan_query <- pan_query(c('2,4-dichlorophenol', 'Atrazin'), match = 'best') +test_that("extractors work with Alanwood", { + skip_on_cran() + skip_if_not(ping_service("aw"), "Alanwood database not reachable") + out_aw_query <- aw_query(c('Fluazinam', 'Diclofop'), type = 'com') + expect_equivalent(cas(out_aw_query), c("79622-59-6", "40843-25-2")) + expect_equivalent(inchikey(out_aw_query), + c("UZCGKGPEKUCDTF-UHFFFAOYSA-N", "OOLBCHYXZDXLDS-UHFFFAOYSA-N")) + expect_error(smiles(out_aw_query), "SMILES is not returned by this datasource!") +}) -test_that("cas is working", { +test_that("extractors work with Wikidata", { skip_on_cran() + skip_if_not(ping_service("wd"), "Wikidata service is down") - expect_error(cas(out_cs_extcompinfo)) - expect_equivalent(cas(out_etox_basic), "50-00-0") - expect_error(cas(out_opsin_query)) - expect_equivalent(cas(out_aw_query), c("79622-59-6", "40843-25-2")) + id <- c("Q408646", "Q18216") + out_wd_ident <- wd_ident(id) 
expect_equivalent(cas(out_wd_ident), c("3380-34-5", "50-78-2")) - expect_error(cas(out_pc_prop)) - expect_equivalent(cas(out_pan_query), c("120-83-2", "1912-24-9")) - expect_equivalent(cas(out_ci_query), c("50-78-2", "3380-34-5")) + expect_equivalent(inchikey(out_wd_ident), + c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N")) + expect_equivalent(smiles(out_wd_ident), + c("C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl","CC(=O)OC1=CC=CC=C1C(=O)O")) }) -test_that("inchikey is working", { +test_that("extractors work with pubchem", { skip_on_cran() + skip_if_not(ping_service("pc"), "Pubchem service is down") - expect_equivalent(inchikey(out_cs_compinfo), "XEFQLINVKFYRCS-UHFFFAOYSA-N") - expect_equivalent(inchikey(out_cs_extcompinfo), "XEFQLINVKFYRCS-UHFFFAOYAS") - expect_equivalent(inchikey(out_cts_compinfo), c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N" )) - expect_error(inchikey(out_etox_basic)) - expect_equivalent(inchikey(out_opsin_query), c("LVZWSLJZHVFIQJ-UHFFFAOYSA-N", "TVMXDCGIABBOFY-UHFFFAOYSA-N")) - expect_equivalent(inchikey(out_aw_query), c("UZCGKGPEKUCDTF-UHFFFAOYSA-N", "OOLBCHYXZDXLDS-UHFFFAOYSA-N")) - expect_equivalent(inchikey(out_wd_ident), c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N")) - expect_equivalent(inchikey(out_pc_prop), c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N")) + out_pc_prop <- pc_prop(c(5564, 2244)) + out_pc_prop2 <- pc_prop(5564, properties = c('MolecularFormula', 'MolecularWeight')) + expect_error(cas(out_pc_prop)) + expect_equivalent(inchikey(out_pc_prop), + c("XEFQLINVKFYRCS-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFFFAOYSA-N")) expect_error(inchikey(out_pc_prop2)) - expect_error(inchikey(out_pan_query)) - expect_equivalent(inchikey(out_ci_query), c("BSYNRYMUTXBXSQ-UHFFFAOYSA-N", "XEFQLINVKFYRCS-UHFFFAOYSA-N")) + expect_equivalent(smiles(out_pc_prop), + c("C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl", "CC(=O)OC1=CC=CC=C1C(=O)O")) + expect_error(smiles(out_pc_prop2)) }) -test_that("smiles 
is working", { +test_that("extractors work with PAN", { skip_on_cran() + skip_if_not(ping_service("pan"), "PAN service is down") - expect_equivalent(smiles(out_cs_compinfo), "c1cc(c(cc1Cl)O)Oc2ccc(cc2Cl)Cl") - expect_equivalent(smiles(out_cs_extcompinfo), "c1cc(c(cc1Cl)O)Oc2ccc(cc2Cl)Cl") - expect_error(smiles(out_cts_compinfo)) - expect_error(smiles(out_etox_basic)) - expect_equivalent(smiles(out_opsin_query), c("C1CC1", "CCCCCCCC")) - expect_error(smiles(out_aw_query)) - expect_equivalent(smiles(out_wd_ident), c("C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl","CC(=O)OC1=CC=CC=C1C(=O)O")) - expect_equivalent(smiles(out_pc_prop), c("C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl", "CC(=O)OC1=CC=CC=C1C(=O)O")) - expect_error(smiles(out_pc_prop2)) + out_pan_query <- pan_query(c('2,4-dichlorophenol', 'Atrazin'), match = 'best') + expect_equivalent(cas(out_pan_query), c("120-83-2", "1912-24-9")) + expect_error(inchikey(out_pan_query)) expect_error(smiles(out_pan_query)) - expect_equivalent(smiles(out_ci_query), c("CC(=O)", "c1(Oc2c(cc(Cl)")) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test_fn.R b/tests/testthat/test-flavornet.R similarity index 61% rename from tests/testthat/test_fn.R rename to tests/testthat/test-flavornet.R index e786bb32..8af1e81a 100644 --- a/tests/testthat/test_fn.R +++ b/tests/testthat/test-flavornet.R @@ -1,23 +1,24 @@ -context("flavornet") - +up <- ping_service("fn") test_that("fn_percept()", { skip_on_cran() + skip_if_not(up, "Flavornet is unreachable") a <- fn_percept("123-32-0") b <- fn_percept(c("75-07-0", "123-32-0")) - c <- fn_percept(c("75-07-0", "123-32-0", "50-00-0")) + c <- suppressWarnings(fn_percept(c("75-07-0", "123-32-0", "50-00-0"))) - expect_is(a, 'character') - expect_is(b, 'character') - expect_is(c, 'character') + expect_type(a, 'character') + expect_type(b, 'character') + expect_type(c, 'character') expect_equal(length(a), 1) expect_equal(length(b), 2) expect_equal(length(c), 3) - expect_equal(a, structure("cocoa, 
roasted nut, roast beef, medicine", .Names = "123-32-0")) + expect_equal(a, structure("cocoa, roasted nut, roast beef, medicine", + .Names = "123-32-0")) expect_equal(b, structure(c("pungent, ether", "cocoa, roasted nut, roast beef, medicine" ), .Names = c("75-07-0", "123-32-0"))) expect_true(is.na(c[[3]])) expect_warning(fn_percept('xxxx')) - }) \ No newline at end of file +}) diff --git a/tests/testthat/test-nist.R b/tests/testthat/test-nist.R index 644faf78..6b488811 100644 --- a/tests/testthat/test-nist.R +++ b/tests/testthat/test-nist.R @@ -1,6 +1,8 @@ -context("nist") library(robotstxt) +up <- ping_service("nist") test_that("NIST webbook is still OK with being scraped", { + skip_on_cran() + skip_if_not(up, "NIST Web Book is down") expect_true( paths_allowed("https://webbook.nist.gov/cgi/cbook.cgi", user_agent = 'webchem (https://github.com/ropensci/webchem)') @@ -9,6 +11,8 @@ test_that("NIST webbook is still OK with being scraped", { test_that("nist_ri() warns when no results", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + expect_warning(nist_ri( "78-70-6", from = "cas", @@ -20,6 +24,8 @@ test_that("nist_ri() warns when no results", { test_that("nist_ri() works when only one row of data", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + testdf <- nist_ri("78-70-6") expect_s3_class(testdf, "data.frame") expect_true(!anyNA(testdf$RI)) @@ -37,6 +43,8 @@ test_that("nist_ri() works when only one row of data", { test_that("nist_ri() returns results", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + out <- nist_ri("78-70-6", type = "linear", temp_prog = "custom") expect_s3_class(out, "data.frame") expect_true(!anyNA(out$RI)) @@ -44,6 +52,8 @@ test_that("nist_ri() returns results", { test_that("nist_ri() works with inchikey query", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + testdf <- nist_ri( "UHEPJGULSIKKTP-UHFFFAOYSA-N", from = "inchikey", @@ -57,6 +67,8 @@ test_that("nist_ri() works with inchikey query", 
{ test_that("nist_ri() works with inchi query", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + testdf <- nist_ri( "1S/C8H14O/c1-7(2)5-4-6-8(3)9/h5H,4,6H2,1-3H3", from = "inchi", @@ -70,6 +82,8 @@ test_that("nist_ri() works with inchi query", { test_that("nist_ri() works with name query", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + testdf <- nist_ri( "myrcene", from = "name", @@ -83,6 +97,8 @@ test_that("nist_ri() works with name query", { test_that("nist_ri() works with multiple queries", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + myRIs <- nist_ri( c("78-70-6", "13474-59-4"), @@ -96,6 +112,8 @@ test_that("nist_ri() works with multiple queries", { test_that("nist_ri() warns when multiple results", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + expect_warning( nist_ri("Longipinene", from = "name"), "More than one match for 'Longipinene'. Returning NA.") @@ -103,6 +121,8 @@ test_that("nist_ri() warns when multiple results", { test_that("nist_ri() warns when no chromatography data", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + expect_warning( nist_ri("methane", from = "name"), "There are no chromatography data for 'methane'. Returning NA." @@ -111,6 +131,8 @@ test_that("nist_ri() warns when no chromatography data", { test_that("cas = is deprecated gently", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + expect_warning( nist_ri(cas = "78-70-6"), "`cas` is deprecated. Using `query` instead with `from = 'cas'`." 
@@ -120,6 +142,8 @@ test_that("cas = is deprecated gently", { test_that("nist_ri works with NAs", { skip_on_cran() + skip_if_not(up, "NIST Web Book is down") + test <- nist_ri("107-86-8", from = "cas", type = "linear", @@ -134,6 +158,5 @@ test_that("nist_ri works with NAs", { colnames(natest), colnames(test) ) - expect_equivalent(unique(natest$query), c(NA, "107-86-8")) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-opsin.R b/tests/testthat/test-opsin.R index f659c286..02ef9b6b 100644 --- a/tests/testthat/test-opsin.R +++ b/tests/testthat/test-opsin.R @@ -1,23 +1,26 @@ -context("opsin") - - +up <- ping_service("opsin") test_that("opsin_query()", { skip_on_cran() + skip_if_not(up, "OPSIN service is down") o1 <- opsin_query(c('Cyclopropane', 'Octane')) - o2 <- opsin_query(c('xxxx')) + o2 <- suppressWarnings(opsin_query(c('xxxx'))) # issue #146 b1 <- opsin_query('Acetic acid') expect_equal(b1$query, 'Acetic acid') - expect_is(o1, 'data.frame') - expect_equal(ncol(o1), 6) - expect_equal(ncol(o2), 6) + expect_s3_class(o1, 'data.frame') + + expect_equal( + colnames(o1), + c("query", "inchi", "stdinchi", "stdinchikey", "smiles", "message", "status") + ) + expect_equal(nrow(o1), 2) expect_equal(nrow(o2), 1) expect_equal(o1$query, c('Cyclopropane', 'Octane')) expect_equal(o2$query, c('xxxx')) + expect_equal(is.na(opsin_query(NA)$smiles), TRUE) + expect_equal(ncol(o1), ncol(o2)) }) - - diff --git a/tests/testthat/test-pan.R b/tests/testthat/test-pan.R index a559b321..0da2d25e 100644 --- a/tests/testthat/test-pan.R +++ b/tests/testthat/test-pan.R @@ -1,25 +1,24 @@ -context("pan") -skip_if_not(ping_pan()) - +up <- ping_service("pan") test_that("pan_query()", { skip_on_cran() + skip_if_not(up, "PAN service is down, skipping tests") a <- pan_query(c('Triclosan','Chlorpyrifos', 'xxxx', NA), match = 'best', verbose = FALSE) b <- pan_query(c('Triclosan','Chlorpyrifos', 'xxxx', NA), match = 'all', verbose = FALSE) c <- pan_query(c('Triclosan','Chlorpyrifos', 
'xxxx', NA), match = 'first', verbose = FALSE) - expect_is(a, 'list') - expect_is(b, 'list') - expect_is(c, 'list') - expect_equal(length(a), 4) - expect_equal(length(b), 4) - expect_equal(length(c), 4) - expect_equal(length(a[[1]]), 75) - expect_equal(length(b[[2]]), 75) - expect_equal(length(c[[2]]), 75) - expect_equal(length(a[[1]]$`CAS Number`), 1) - expect_equal(length(b[[2]]$`CAS Number`), 9) - expect_equal(length(c[[2]]$`CAS Number`), 1) + expect_s3_class(a, 'list') + expect_s3_class(b, 'list') + expect_s3_class(c, 'list') + expect_length(a, 4) + expect_length(b, 4) + expect_length(c, 4) + expect_length(a[[1]], 75) + expect_length(b[[2]], 75) + expect_length(c[[2]], 75) + expect_length(a[[1]]$`CAS Number`, 1) + expect_length(b[[2]]$`CAS Number`, 9) + expect_length(c[[2]]$`CAS Number`, 1) expect_true(is.na(a[[3]])) expect_true(is.na(b[[3]])) expect_true(is.na(c[[3]])) @@ -28,4 +27,4 @@ test_that("pan_query()", { expect_true(is.na(c[[4]])) expect_equal(a[[1]]$`CAS Number`, "3380-34-5") expect_equal(a[[2]]$`Chemical name`, "Chlorpyrifos") -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-pubchem.R b/tests/testthat/test-pubchem.R index 254c98d1..9a5b39f2 100644 --- a/tests/testthat/test-pubchem.R +++ b/tests/testthat/test-pubchem.R @@ -1,40 +1,92 @@ -context("pubchem") +up <- ping_service("pc") +test_that("examples in the article are unchanged", { + skip_on_cran() + skip_if_not(up, "PubChem service is down") + #values come from test-etox + cas <- c("105-67-9", "1570-64-5", NA, "1912-24-9", "71-43-2", "6190-65-4") + cids <- get_cid(cas, from = "xref/rn", match = "first") + pc_data <- pc_prop(cids$cid, properties = "CanonicalSMILES") + #values go to test-chemspider + pc_smiles <- smiles(pc_data) + + expect_s3_class(pc_data, "data.frame") + + expect_equal(cids$cid, c("7771", "14855", NA, "2256", "241", "22563")) + expect_equal(pc_smiles, c("CC1=CC(=C(C=C1)O)C", "CC1=C(C=CC(=C1)Cl)O", NA, + "CCNC1=NC(=NC(=N1)Cl)NC(C)C", "C1=CC=CC=C1", + 
"CC(C)NC1=NC(=NC(=N1)N)Cl")) +}) test_that("get_cid()", { skip_on_cran() - - expect_equal(get_cid("Triclosan")$cid[1], "5564") + skip_if_not(up, "PubChem service is down") + + #from name + expect_true("5564" %in% get_cid("Triclosan")$cid) + expect_true("5564" %in% get_cid("Triclosan", domain = "substance")$cid) + #from smiles + expect_equal(get_cid("CCCC", from = "smiles")$cid, "7843") + #from inchi + expect_equal(get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi")$cid, + "6329") + #from inchikey + expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid, + "12345") + #from formula, issue 206, some queries first return a listkey. + expect_true("10864091" %in% get_cid("C26H52NO6P", from = "formula")$cid) + # from CAS RN + expect_true("750" %in% get_cid("56-40-6", from = "xref/rn")$cid) + expect_true("5257127" %in% + get_cid("56-40-6", from = "xref/rn", domain = "substance")$cid) + #from cid, similarity + expect_true("5564" %in% get_cid(5564, from = "similarity/cid")$cid) + #from smiles, similarity + expect_true("702" %in% get_cid("CCO", from = "similarity/smiles")$cid) + #from SID + expect_equal(get_cid("126534046", from = "sid", domain = "substance")$cid, + "24971898") + # sourceid + expect_true( + "19689584" %in% + get_cid("VCC957895", from = "sourceid/23706", domain = "substance")$cid) + #from aid + expect_equal(get_cid(170004, from = "aid", domain = "assay")$cid, "68352") + #from GeneID + expect_true("11580958" %in% + get_cid(25086, from = "target/geneid", domain = "assay")$cid) + #arg expect_true(nrow(get_cid("Triclosan", arg = "name_type=word")) > 1) + #match expect_true(nrow(get_cid("Triclosan", arg = "name_type=word", - match = "first")) == 1) + match = "first")) == 1) + #multiple compounds expect_true(nrow(get_cid(c("Triclosan", "Aspirin"))) == 2) - expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) - expect_warning( - get_cid("xxxx", verbose = FALSE), - "No CID found that matches the given name. 
Returning NA." - ) + #invalid input expect_true(is.na(get_cid(NA)$cid[1])) - expect_equal(get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")$cid[1], - "12345") + expect_true(is.na(suppressWarnings(get_cid("xxxx", verbose = FALSE))$cid[1])) + expect_equal(capture_messages(get_cid("balloon")), + c("Querying balloon. ", "Not Found (HTTP 404).", "\n")) + # sourceall + expect_equal(get_cid("Optopharma Ltd", from = "sourceall", + domain = "substance")$cid[1], "102361739") }) - test_that("pc_prop", { skip_on_cran() + skip_if_not(up, "PubChem service is down") - a <- pc_prop("5564", properties = "CanonicalSmiles", verbose = FALSE) - b <- suppressWarnings(pc_prop("xxx", properties = "CanonicalSmiles", verbose = FALSE)) + b <- suppressWarnings(pc_prop("xxx", properties = "CanonicalSmiles", + verbose = FALSE)) c <- pc_prop("5564", properties = c("CanonicalSmiles", "InChiKey"), verbose = FALSE) - expect_equal(a$CanonicalSMILES, "C1=CC(=C(C=C1Cl)O)OC2=C(C=C(C=C2)Cl)Cl") expect_true(is.na(b)) - expect_is(a, "data.frame") expect_equal(ncol(c), 3) }) test_that("pc_synonyms", { skip_on_cran() - + skip_if_not(up, "PubChem service is down") + expect_equivalent(pc_synonyms(NA), NA) expect_equal(pc_synonyms("Triclosan")[[1]][1], "5564") expect_equal(length(pc_synonyms(c("Triclosan", "Aspirin"))), 2) expect_equal(pc_synonyms("BPGDAMSIGCZZLK-UHFFFAOYSA-N", @@ -44,6 +96,7 @@ test_that("pc_synonyms", { test_that("cid integration tests", { skip_on_cran() + skip_if_not(up, "PubChem service is down") expect_equal(pc_prop(get_cid("Triclosan")$cid[1], properties = "CanonicalSmiles")$CanonicalSMILES, @@ -53,9 +106,12 @@ test_that("cid integration tests", { }) test_that("pc_page()", { + skip_on_cran() + skip_if_not(up, "PubChem service is down") + a <- pc_page(c(311, 176, 1118, "balloon", NA), "pKa") - expect_is(a, "list") + expect_type(a, "list") expect_length(a, 5) expect_is(a[[1]], c("Node", "R6")) expect_is(a[[2]], c("Node", "R6")) @@ -65,6 +121,9 @@ test_that("pc_page()",
{ }) test_that("pc_extract() chemical and physical properties", { + skip_on_cran() + skip_if_not(up, "PubChem service is down") + s <- pc_page(c(NA, 176, 311, "balloon"), "chemical and physical properties") mw <- pc_extract(s, "molecular weight") # example for a computed property pd <- pc_extract(s, "physical description") # textual description @@ -78,8 +137,11 @@ test_that("pc_extract() chemical and physical properties", { }) test_that("pc_sect()", { + skip_on_cran() + skip_if_not(up, "PubChem service is down") + a <- pc_sect(c(311, 176, 1118, "balloon", NA), "pKa") - expect_is(a, c("tbl_df", "tbl", "data.frame")) + expect_s3_class(a, c("tbl_df", "tbl", "data.frame")) expect_equal(names(a), c("CID", "Name", "Result", "SourceName", "SourceID")) expect_equal(a$CID, c("311", "176", "1118", "balloon", NA)) expect_equal(a$Name, c("Citric acid", "Acetic acid", NA, NA, NA)) @@ -88,24 +150,26 @@ test_that("pc_sect()", { expect_equal(a$SourceID, c("DB04272", "DB03166", NA, NA, NA)) b <- pc_sect(2231, "depositor-supplied synonyms", "substance") - expect_is(b, c("tbl_df", "tbl", "data.frame")) + expect_s3_class(b, c("tbl_df", "tbl", "data.frame")) expect_equal(names(b), c("SID", "Name", "Result", "SourceName", "SourceID")) - expect_equal(b$Result, c("cholesterol", "57-88-5", "5-cholestene-3beta-ol")) + expect_equivalent(b$Result, c("cholesterol", "57-88-5", + "5-cholestene-3beta-ol")) c <- pc_sect(780286, "modify date", "assay") - expect_is(c, c("tbl_df", "tbl", "data.frame")) + expect_s3_class(c, c("tbl_df", "tbl", "data.frame")) expect_equal(names(c), c("AID", "Name", "Result", "SourceName", "SourceID")) expect_equal(c$Result, c("2014-05-03", "2018-09-28")) d <- pc_sect("1ZHY_A", "Sequence", "protein") - expect_is(d, c("tbl_df", "tbl", "data.frame")) + expect_s3_class(d, c("tbl_df", "tbl", "data.frame")) expect_equal(names(d), c("pdbID", "Name", "Result", "SourceName", "SourceID")) - expect_equal(d$Result[1], ">pdb|1ZHY|A Chain A, 1 Kes1 Protein (Run BLAST)") + 
expect_equivalent(d$Result[1], + ">pdb|1ZHY|A Chain A, 1 Kes1 Protein (Run BLAST)") e <- pc_sect("US2013040379", "Patent Identifier Synonyms", "patent") - expect_is(e, c("tbl_df", "tbl", "data.frame")) + expect_s3_class(e, c("tbl_df", "tbl", "data.frame")) expect_equal(names(e), c("PatentID", "Name", "Result", "SourceName", "SourceID")) - expect_equal(e$Result, c("US20130040379", "US20130040379A1", + expect_equivalent(e$Result, c("US20130040379", "US20130040379A1", "US2013040379A1")) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index e37d7189..13e894c8 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,27 +1,33 @@ -context("utils") +library(rcdk) +up <- ping_service("cs_web") +test_that("examples in the article are unchanged", { + expect_false(is.inchikey("BQJCRHHNABKAKU-KBQPJGBKS-AN")) + expect_equal(capture_messages(is.inchikey("BQJCRHHNABKAKU-KBQPJGBKS-AN")), + "Hyphens not at position 15 and 26.\n") + expect_false(is.cas('64-17-6')) + expect_equal( + capture_messages(is.cas("64-17-6")), "Checksum is not correct! 5 vs. 
6\n") + skip_if_not(up, "ChemSpider service is down, skipping tests") + expect_false(is.inchikey("BQJCRHHNABKAKU-KBQPJGBKSA-5", type = "chemspider")) +}) test_that("is.cas() returns correct results", { - skip_on_cran() expect_true(is.cas('64-17-5')) expect_false(is.cas('64175')) expect_false(is.cas('4-17-5')) expect_false(is.cas('64-177-6')) expect_false(is.cas('64-17-55')) - expect_false(is.cas('64-17-6')) expect_error(is.cas(c('64-17-5', '64-17-5'))) }) test_that("as.cas() handles properly formatted CAS",{ - skip_on_cran() - expect_identical(as.cas("64-17-5"), "64-17-5") expect_silent(as.cas("64-17-5")) }) test_that("is.inchikey() returns correct results", { - skip_on_cran() expect_true(is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N')) expect_false(is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA')) @@ -33,6 +39,11 @@ test_that("is.inchikey() returns correct results", { expect_error(is.inchikey(c('BQJCRHHNABKAKU-KBQPJGBKSA-N', 'BQJCRHHNABKAKU-KBQPJGBKSA-N'))) + skip_on_cran() + skip_on_travis() + skip_on_appveyor() + skip_if_not(up, "ChemSpider service is down, skipping tests") + g <- is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA-N', type = 'chemspider') b <- is.inchikey('BQJCRHHNABKAKU-KBQPJGBKSA', type = 'chemspider') @@ -51,7 +62,6 @@ test_that("is.inchikey() returns correct results", { test_that("is.smiles() returns correct results", { - skip_on_cran() expect_true(is.smiles('Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl')) expect_false(is.smiles('Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)ClWWX')) @@ -60,15 +70,14 @@ test_that("is.smiles() returns correct results", { }) test_that("extr_num() returns correct results", { - skip_on_cran() expect_equal(extr_num("Melting Pt : -44.6 deg C"), -44.6) expect_equal(extr_num("Melting Pt : 44.6 deg C"), 44.6) expect_equal(extr_num("Melting Pt : 446 deg C"), 446) }) -test_that("as.cas() returns correct results", { - skip_on_cran() +test_that("as.cas() returns correct results", { + expect_equal(as.cas(58082), "58-08-2") expect_equal(as.cas(123456789), NA)
expect_identical(as.cas(c(761659, 123456789, "hexenol")), diff --git a/tests/testthat/test-wikidata.R b/tests/testthat/test-wikidata.R index b547502a..f7b0ff2d 100644 --- a/tests/testthat/test-wikidata.R +++ b/tests/testthat/test-wikidata.R @@ -1,9 +1,7 @@ -context("wikidata") - - +up <- ping_service("wd") test_that("get_wdid returns correct results", { skip_on_cran() - + skip_if_not(up, "Wikidata service is down") # test general comps <- c('DDT', 'Aspirin', 'xdewrwdcadsr4w', 'acetic acid') o1 <- get_wdid(comps, match = 'best') @@ -22,16 +20,20 @@ test_that("get_wdid returns correct results", { }) test_that("get_wdid() handles NAs", { + skip_on_cran() + skip_if_not(up, "Wikidata service is down") + expect_s3_class(get_wdid(NA), "data.frame") expect_s3_class(get_wdid(c("Triclosan", "Glyphosate", NA)), "data.frame") }) test_that("wd_ident returns correct results", { skip_on_cran() + skip_if_not(up, "Wikidata service is down") id <- c( "Q163648", "Q18216", "asndalsr", NA) o1 <- wd_ident(id) - expect_is(o1, 'data.frame') + expect_s3_class(o1, 'data.frame') expect_equal(nrow(o1), 4) expect_true(is.na(o1$smiles[3])) expect_true(is.na(o1$smiles[4])) @@ -43,6 +45,7 @@ test_that("wd_ident returns correct results", { test_that("wd integration test", { skip_on_cran() + skip_if_not(up, "Wikidata service is down") d <- wd_ident(get_wdid('hexane', language = 'en', match = 'best')$wdid) f <- wd_ident(get_wdid('xxxxxxxAX', language = 'en', match = 'best')$wdid) @@ -51,4 +54,4 @@ test_that("wd integration test", { expect_equal(ncol(d), 14) expect_s3_class(d, 'data.frame') expect_true(all(is.na(f[1, ]))) -}) \ No newline at end of file +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..aff8f7db --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,3 @@ +*.html +*.R +!precompile.R diff --git a/vignettes/precompile.R b/vignettes/precompile.R new file mode 100644 index 00000000..05c41d15 --- /dev/null +++ b/vignettes/precompile.R @@ -0,0 
+1,4 @@ +# Precompile vignettes locally +# More info here: https://ropensci.org/technotes/2019/12/08/precompute-vignettes/ +library(knitr) +knit("vignettes/webchem.Rmd.orig", "vignettes/webchem.Rmd") #Get Started diff --git a/vignettes/webchem.Rmd b/vignettes/webchem.Rmd new file mode 100644 index 00000000..0839deff --- /dev/null +++ b/vignettes/webchem.Rmd @@ -0,0 +1,147 @@ +--- +title: "Getting started with webchem" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting started with webchem} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + + + +```r +library(webchem) +library(dplyr) +``` + +The `lc50` dataset provided with `webchem` contains acute ecotoxicity of 124 insecticides. We'll work with a subset of these to obtain chemical names and octanol/water partitioning coefficients from PubChem, and gas chromatography retention indices from the NIST Web Book. + + +```r +head(lc50) +#> cas value +#> 4 50-29-3 12.415277 +#> 12 52-68-6 1.282980 +#> 15 55-38-9 12.168138 +#> 18 56-23-5 35000.000000 +#> 21 56-38-2 1.539119 +#> 36 57-74-9 98.400000 + +lc50_sub <- lc50[1:15, ] +``` + +## Getting Identifiers + +Usually a `webchem` workflow starts with translating and retrieving chemical identifiers since most chemical information databases use their own internal identifiers. + +First, we will convert CAS numbers to InChIKey identifiers using the Chemical Translation Service. Then, we'll use these InChIKeys to get PubChem Compound ID (CID) numbers, to use for retrieving chemical properties from PubChem. 
+ + +```r +lc50_sub$inchikey <- cts_convert(lc50_sub$cas, from = "CAS", to = "InChIKey", choices = 1, verbose = FALSE) +head(lc50_sub) +#> cas value inchikey +#> 4 50-29-3 12.415277 YVGGHNCTFXOJCH-UHFFFAOYSA-N +#> 12 52-68-6 1.282980 NFACJZMKEDPNKN-UHFFFAOYSA-N +#> 15 55-38-9 12.168138 PNVJTZOFSHSLTO-UHFFFAOYSA-N +#> 18 56-23-5 35000.000000 VZGDMQKNWNREIO-UHFFFAOYSA-N +#> 21 56-38-2 1.539119 LCCNCVORNKJIRZ-UHFFFAOYSA-N +#> 36 57-74-9 98.400000 BIWJNBZANLAXMG-YQELWRJZSA-N +any(is.na(lc50_sub$inchikey)) +#> [1] FALSE +``` + +Great, now we can retrieve PubChem CIDs. All `get_*()` functions return a data frame containing the query and the retrieved identifier. We can merge this with our dataset with `dplyr::full_join()`. + + +```r +x <- get_cid(lc50_sub$inchikey, from = "inchikey", match = "first", verbose = FALSE) +library(dplyr) +lc50_sub2 <- full_join(lc50_sub, x, by = c("inchikey" = "query")) +head(lc50_sub2) +#> cas value inchikey cid +#> 1 50-29-3 12.415277 YVGGHNCTFXOJCH-UHFFFAOYSA-N 3036 +#> 2 52-68-6 1.282980 NFACJZMKEDPNKN-UHFFFAOYSA-N 5853 +#> 3 55-38-9 12.168138 PNVJTZOFSHSLTO-UHFFFAOYSA-N 3346 +#> 4 56-23-5 35000.000000 VZGDMQKNWNREIO-UHFFFAOYSA-N 5943 +#> 5 56-38-2 1.539119 LCCNCVORNKJIRZ-UHFFFAOYSA-N 991 +#> 6 57-74-9 98.400000 BIWJNBZANLAXMG-YQELWRJZSA-N 11954021 +``` + +## Retrieving Chemical Properties + +Functions that query chemical information databases begin with a prefix that matches the database. For example, functions to query PubChem begin with `pc_` and functions to query ChemSpider begin with `cs_`. In this example, we'll get the names and log octanol/water partitioning coefficients for each compound using PubChem, and the WHO acute toxicity rating from the PAN Pesticide database. 
+ + +```r +y <- pc_prop(lc50_sub2$cid, properties = c("IUPACName", "XLogP")) +#> https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/property/IUPACName,XLogP/JSON +y$CID <- as.character(y$CID) +lc50_sub3 <- full_join(lc50_sub2, y, by = c("cid" = "CID")) +head(lc50_sub3) +#> cas value inchikey cid +#> 1 50-29-3 12.415277 YVGGHNCTFXOJCH-UHFFFAOYSA-N 3036 +#> 2 52-68-6 1.282980 NFACJZMKEDPNKN-UHFFFAOYSA-N 5853 +#> 3 55-38-9 12.168138 PNVJTZOFSHSLTO-UHFFFAOYSA-N 3346 +#> 4 56-23-5 35000.000000 VZGDMQKNWNREIO-UHFFFAOYSA-N 5943 +#> 5 56-38-2 1.539119 LCCNCVORNKJIRZ-UHFFFAOYSA-N 991 +#> 6 57-74-9 98.400000 BIWJNBZANLAXMG-YQELWRJZSA-N 11954021 +#> IUPACName XLogP +#> 1 1-chloro-4-[2,2,2-trichloro-1-(4-chlorophenyl)ethyl]benzene 6.9 +#> 2 2,2,2-trichloro-1-dimethoxyphosphorylethanol 0.5 +#> 3 dimethoxy-(3-methyl-4-methylsulfanylphenoxy)-sulfanylidene-lambda5-phosphane 4.1 +#> 4 tetrachloromethane 2.8 +#> 5 diethoxy-(4-nitrophenoxy)-sulfanylidene-lambda5-phosphane 3.8 +#> 6 (1R,7S)-1,3,4,7,8,9,10,10-octachlorotricyclo[5.2.1.02,6]dec-8-ene 4.9 +``` + +The IUPAC names are long and unwieldy, and one could use `pc_synonyms()` to choose better names. Several other functions return synonyms as well, even though they are not explicitly translator type functions. We'll see an example of that next. + +Many of the chemical databases `webchem` can query contain vast amounts of information in a variety of structures. Therefore, some `webchem` functions return nested lists rather than data frames. `pan_query()` is one such function. 
+ + +```r +out <- pan_query(lc50_sub3$cas, verbose = FALSE) +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion + +#> Warning in lapply(out[tonum], as.numeric): NAs introduced by coercion +``` + +`out` is a nested list which you can inspect with `View()`. It has an element for each query, and within each query, many elements corresponding to different properties in the database. To extract a single property from all queries, we need to use a mapping function such as `sapply()` or one of the `map_*()` functions from the `purrr` package. + + +```r +lc50_sub3$who_tox <- sapply(out, function(y) y$`WHO Acute Toxicity`) +lc50_sub3$common_name <- sapply(out, function(y) y$`Chemical name`) + +# #equivalent with purrr package: +# lc50_sub3$who_tox <- map_chr(out, pluck, "WHO Acute Toxicity") +# lc50_sub3$common_name <- map_chr(out, pluck, "Chemical name") +``` + + +```r +#tidy up columns +lc50_done <- dplyr::select(lc50_sub3, common_name, cas, inchikey, XLogP, who_tox) +head(lc50_done) +#> common_name cas inchikey XLogP who_tox +#> 1 DDT, p,p' 50-29-3 YVGGHNCTFXOJCH-UHFFFAOYSA-N 6.9 II, Moderately Hazardous +#> 2 Trichlorfon 52-68-6 NFACJZMKEDPNKN-UHFFFAOYSA-N 0.5 II, Moderately Hazardous +#> 3 Fenthion 55-38-9 PNVJTZOFSHSLTO-UHFFFAOYSA-N 4.1 II, Moderately Hazardous +#> 4 Carbon tetrachloride 56-23-5 VZGDMQKNWNREIO-UHFFFAOYSA-N 2.8 Not Listed +#> 5 Parathion 56-38-2 LCCNCVORNKJIRZ-UHFFFAOYSA-N 3.8 Ia, Extremely Hazardous +#> 6 Chlordane 57-74-9 BIWJNBZANLAXMG-YQELWRJZSA-N 4.9 II, Moderately Hazardous +``` + diff --git a/vignettes/webchem.Rmd.orig b/vignettes/webchem.Rmd.orig new 
file mode 100644 index 00000000..cd0a3859 --- /dev/null +++ b/vignettes/webchem.Rmd.orig @@ -0,0 +1,87 @@ +--- +title: "Getting started with webchem" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Getting started with webchem} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(webchem) +library(dplyr) +``` + +The `lc50` dataset provided with `webchem` contains acute ecotoxicity of 124 insecticides. We'll work with a subset of these to obtain chemical names and octanol/water partitioning coefficients from PubChem, and gas chromatography retention indices from the NIST Web Book. + +```{r} +head(lc50) + +lc50_sub <- lc50[1:15, ] +``` + +## Getting Identifiers + +Usually a `webchem` workflow starts with translating and retrieving chemical identifiers since most chemical information databases use their own internal identifiers. + +First, we will convert CAS numbers to InChIKey identifiers using the Chemical Translation Service. Then, we'll use these InChIKeys to get PubChem Compound ID (CID) numbers, to use for retrieving chemical properties from PubChem. + +```{r} +lc50_sub$inchikey <- cts_convert(lc50_sub$cas, from = "CAS", to = "InChIKey", choices = 1, verbose = FALSE) +head(lc50_sub) +any(is.na(lc50_sub$inchikey)) +``` + +Great, now we can retrieve PubChem CIDs. All `get_*()` functions return a data frame containing the query and the retrieved identifier. We can merge this with our dataset with `dplyr::full_join()`. + +```{r} +x <- get_cid(lc50_sub$inchikey, from = "inchikey", match = "first", verbose = FALSE) +library(dplyr) +lc50_sub2 <- full_join(lc50_sub, x, by = c("inchikey" = "query")) +head(lc50_sub2) +``` + +## Retrieving Chemical Properties + +Functions that query chemical information databases begin with a prefix that matches the database. 
For example, functions to query PubChem begin with `pc_` and functions to query ChemSpider begin with `cs_`. In this example, we'll get the names and log octanol/water partitioning coefficients for each compound using PubChem, and the WHO acute toxicity rating from the PAN Pesticide database. + +```{r} +y <- pc_prop(lc50_sub2$cid, properties = c("IUPACName", "XLogP")) +y$CID <- as.character(y$CID) +lc50_sub3 <- full_join(lc50_sub2, y, by = c("cid" = "CID")) +head(lc50_sub3) +``` + +The IUPAC names are long and unwieldy, and one could use `pc_synonyms()` to choose better names. Several other functions return synonyms as well, even though they are not explicitly translator type functions. We'll see an example of that next. + +Many of the chemical databases `webchem` can query contain vast amounts of information in a variety of structures. Therefore, some `webchem` functions return nested lists rather than data frames. `pan_query()` is one such function. + +```{r message=FALSE} +out <- pan_query(lc50_sub3$cas, verbose = FALSE) +``` + +`out` is a nested list which you can inspect with `View()`. It has an element for each query, and within each query, many elements corresponding to different properties in the database. To extract a single property from all queries, we need to use a mapping function such as `sapply()` or one of the `map_*()` functions from the `purrr` package. 
+ +```{r} +lc50_sub3$who_tox <- sapply(out, function(y) y$`WHO Acute Toxicity`) +lc50_sub3$common_name <- sapply(out, function(y) y$`Chemical name`) + +# #equivalent with purrr package: +# lc50_sub3$who_tox <- map_chr(out, pluck, "WHO Acute Toxicity") +# lc50_sub3$common_name <- map_chr(out, pluck, "Chemical name") +``` + +```{r} +#tidy up columns +lc50_done <- dplyr::select(lc50_sub3, common_name, cas, inchikey, XLogP, who_tox) +head(lc50_done) +``` + diff --git a/webchem.Rproj b/webchem.Rproj index 30f36b5b..3080eb6c 100644 --- a/webchem.Rproj +++ b/webchem.Rproj @@ -17,5 +17,5 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageCheckArgs: --as-cran --timings +PackageCheckArgs: --as-cran --timings --no-examples --no-build-vignettes --no-manual PackageRoxygenize: rd,collate,namespace,vignette