ropensci · stitam · Sep 17, 2020 · Jul 10, 2020 · Jul 10, 2020 · Jul 10, 2020
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -64,6 +64,8 @@ Some consistency guidelines:
 
 13. SMILES strings may use special characters like "#". `URLencode()` does not encode this as "%23" by default, so use `URLencode(query, reserved = TRUE)` instead. It's important to note that it's the query that has to be encoded like this, not the full url.
 
+14. Print verbose messages. Use `htts::message_for_status()` and `webchem_message()` functions to generate standard messages when possible.
+
 ### Data Sources
 
 You might think all webscraping is perfectly legal but it is unfortunately not that simple.

diff --git a/NAMESPACE b/NAMESPACE
@@ -31,7 +31,6 @@ S3method(smiles,pc_prop)
 S3method(smiles,wd_ident)
 export(as.cas)
 export(aw_query)
-export(build_aw_idx)
 export(cas)
 export(chebi_comp_entity)
 export(ci_query)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,4 @@
-# webchem 1.0.0.9005
+# webchem 1.0.0.9010
 
 ## NEW FEATURES
 
@@ -8,6 +8,8 @@
 
 ## MINOR IMPROVEMENTS
 
+* Most functions now use httr::RETRY() to access webservices.
+* Verbose messages are now harmonized.
 * The `"type"` argument in `ci_query()` and `aw_query()` has been changed to `"from"` for consistency with other functions
 * `fn_percept()` and `cts_compinfo()` now have `"query"` and `"from"` arguments for consistency with other functions
 * Possible values for `"from"` have been made more consistent across functions

diff --git a/R/alanwood.R b/R/alanwood.R
@@ -8,9 +8,7 @@
 #' @param  query character; search string
 #' @param from character; type of input ('cas' or 'name')
 #' @param verbose logical; print message during processing to console?
-#' @param force_build logical; force building a new index? See
-#' \code{\link{build_aw_idx}} for more details.
-#' @param ... currently unused.
+#' @param ... additional arguments to internal utility functions
 #' @param type deprecated
 #' @return A list of eight entries: common-name, status, preferred IUPAC Name,
 #' IUPAC Name, cas, formula, activity, subactivity, inchikey, inchi and source
@@ -35,25 +33,26 @@
 #' # use CAS-numbers
 #' aw_query("79622-59-6", from = 'cas')
 #' }
-#' @seealso \code{\link{build_aw_idx}}
-
 aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
-                     force_build = FALSE, type, ...) {
+                     type, ...) {
   if (!missing(type)) {
     message('"type" is deprecated. Please use "from" instead. ')
     from <- type
   }
 
   if ("commonname" %in% from) {
-    warning('To search by compound name use "name" instead of "commonname"')
+    message('To search by compound name use "name" instead of "commonname"')
     from <- "name"
   }
   from <- match.arg(from)
-  aw_idx <- build_aw_idx(verbose, force_build)
+  aw_idx <- build_aw_idx(verbose, ...)
 
   foo <- function(query, from, verbose) {
-    on.exit(suppressWarnings(closeAllConnections()))
-
+    if (is.na(query)) {
+      if (verbose) webchem_message("na")
+      return(NA)
+    }
+    if (verbose) webchem_message("query", query, appendLF = FALSE)
   # search links in indexes
     if (from == "name") {
       links <- aw_idx$links[aw_idx$source == "cn"]
@@ -70,80 +69,92 @@ aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
     }
 
     takelink <- links[tolower(names) == tolower(query)]
-    if (is.na(query)) takelink <- vector()
     if (length(takelink) == 0) {
-      message("Not found! Returning NA.\n")
+      if (verbose) message("Not found.")
       return(NA)
     }
     if (length(takelink) > 1) {
       takelink <- unique(takelink)
       if (length(takelink) > 1) {
-        message("More then one link found! Returning first.\n")
+        message("More then one link found. Returning first.")
         takelink <- takelink[1]
       }
     }
-    if (verbose)
-      message("Querying ", takelink)
 
+    qurl <- paste0("http://www.alanwood.net/pesticides/", takelink)
     Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10))
-    ttt <- read_html(paste0("http://www.alanwood.net/pesticides/", takelink))
-
-    status <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td"))
-    pref_iupac_name <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td"))
-    iupac_name <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td"))
-    cas <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td"))
-    formula <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td"))
-    activity <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
-    subactivity <- trimws(
-      strsplit(gsub("^.*\\((.*)\\)", "\\1", activity), ";")[[1]])
-    activity <- gsub("^(.*) \\(.*\\)", "\\1", activity)
-    inchikey_r <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td"))
-    if (length(inchikey_r) == 0) {
-      inchikey <- NA
-    } else {
-      if (grepl("isomer", inchikey_r)) {
-        inchikey <- c(
-          s_isomer = gsub(
-            ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r),
-          r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r))
-      }
-      if (grepl("identifier", inchikey_r)) {
-        inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r),
-                      gsub(".*identifier.*:(.*)", "\\1", inchikey_r))
-        names(inchikey) <- c("inchikey",
-                             gsub(".*(identifier.*:).*", "\\1", inchikey_r)
-                             )
-      }
-      if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r))
-        inchikey <- inchikey_r
+    res <- try(httr::RETRY("GET",
+                           qurl,
+                           httr::user_agent(webchem_url()),
+                           terminate_on = 404,
+                           quiet = TRUE), silent = TRUE)
+    if (inherits(res, "try-error")) {
+      if (verbose) webchem_message("service_down")
+      return(NA)
     }
+    if (verbose) message(httr::message_for_status(res))
+    if (res$status_code == 200){
+      ttt <- read_html(res)
+      status <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td"))
+      pref_iupac_name <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td"))
+      iupac_name <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td"))
+      cas <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td"))
+      formula <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td"))
+      activity <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
+      subactivity <- trimws(
+        strsplit(gsub("^.*\\((.*)\\)", "\\1", activity), ";")[[1]])
+      activity <- gsub("^(.*) \\(.*\\)", "\\1", activity)
+      inchikey_r <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td"))
+      if (length(inchikey_r) == 0) {
+        inchikey <- NA
+      } else {
+        if (grepl("isomer", inchikey_r)) {
+          inchikey <- c(
+            s_isomer = gsub(
+              ".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r),
+            r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r))
+        }
+        if (grepl("identifier", inchikey_r)) {
+          inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r),
+                        gsub(".*identifier.*:(.*)", "\\1", inchikey_r))
+          names(inchikey) <- c("inchikey",
+                               gsub(".*(identifier.*:).*", "\\1", inchikey_r)
+          )
+        }
+        if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r))
+          inchikey <- inchikey_r
+      }
 
-    inchi <- xml_text(
-      xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td"))
-    if (length(inchi) == 0) {
-      inchi <- NA
-    } else {
-      if (grepl("isomer", inchi)) {
-        inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)",
-                                   "\\1", inchi),
-                   r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi))
+      inchi <- xml_text(
+        xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td"))
+      if (length(inchi) == 0) {
+        inchi <- NA
+      } else {
+        if (grepl("isomer", inchi)) {
+          inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)",
+                                     "\\1", inchi),
+                     r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi))
+        }
       }
+      # add source url
+      source_url <- paste0("http://www.alanwood.net/pesticides/", takelink)
+      out <- list(cname = cname, status = status,
+                  pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
+                  cas = cas, formula = formula, activity = activity,
+                  subactivity = subactivity, inchikey = inchikey, inchi = inchi,
+                  source_url = source_url)
+      return(out)
+    }
+    else {
+      return(NA)
     }
-    # add source url
-    source_url <- paste0("http://www.alanwood.net/pesticides/", takelink)
-    out <- list(cname = cname, status = status,
-                pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
-                cas = cas, formula = formula, activity = activity,
-                subactivity = subactivity, inchikey = inchikey, inchi = inchi,
-                source_url = source_url)
-    return(out)
   }
   out <- lapply(query, function(x) foo(x, from = from, verbose = verbose))
   out <- setNames(out, query)
@@ -155,19 +166,17 @@ aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
 #'
 #' This function builds an index of Alan Woods Compendium of Pesticides
 #' \url{http://www.alanwood.net/pesticides} and saves it to
-#' \code{\link{tempdir}}. This is a utility function for
-#' \code{\link{aw_query}} and will not be exported in future releases.
+#' \code{\link{tempdir}}.
 #' @import xml2
+#'
 #' @param verbose logical; print message during processing to console?
 #' @param force_build logical; force building a new index?
 #' @return a data.frame
 #' @seealso \code{\link{aw_query}}, \code{\link{tempdir}}
 #' @author Eduard Szöcs, \email{eduardszoecs@@gmail.com}
 #' @source \url{http://www.alanwood.net/pesticides}
-#' @export
+#' @noRd
 build_aw_idx <- function(verbose = TRUE, force_build = FALSE) {
-  on.exit(suppressWarnings(closeAllConnections()))
-  message(msg = "build_aw_idx() will not be exported in future releases.")
   suppressWarnings(try(load(paste0(tempdir(), "/data/aw_idx.rda")),
                        silent = TRUE))
   if (!file.exists(paste0(tempdir(), "/data/aw_idx.rda")) |
@@ -176,40 +185,48 @@ build_aw_idx <- function(verbose = TRUE, force_build = FALSE) {
     if (!dir.exists(paste0(tempdir(), "/data"))) {
       dir.create(paste0(tempdir(), "/data"))
     }
-    if (verbose == TRUE) {
-      message("Building index.", appendLF = FALSE)
-    }
-    idx1 <- read_html("http://www.alanwood.net/pesticides/index_rn.html")
-    prep_idx <- function(y) {
-      names <- xml_text(xml_find_all(y, "//dl/dt"))
-      links <- xml_attr(
-        xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href")
-      linknames <- xml_text(xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"))
-      return(data.frame(names, links, linknames, stringsAsFactors = FALSE))
+    if (verbose) message("Building index. ", appendLF = FALSE)
+    res <- try(httr::RETRY("GET",
+                           "http://www.alanwood.net/pesticides/index_rn.html",
+                           httr::user_agent(webchem_url()),
+                           terminate_on = 404,
+                           quiet = TRUE), silent= TRUE)
+    if (inherits(res, "try-error")) {
+      if (verbose) webchem_message("service_down")
+      return(NA)
     }
-    aw_idx <- rbind(prep_idx(idx1))
-    aw_idx[["source"]] <- "rn"
-    idx4 <- read_html("http://www.alanwood.net/pesticides/index_cn.html")
-    n <- xml_find_all(idx4, "//a")
-    names <- xml_text(n)
-    rm <- names == ""
-    names <- names[!rm]
-    links <- xml_attr(n, "href")
-    links <- links[!rm]
-    idx4 <- data.frame(names = NA, links = links, linknames = names,
-                       source = "cn", stringsAsFactors = FALSE)
-    aw_idx <- rbind(aw_idx, idx4)
+    if (verbose) message(httr::message_for_status(res))
+    if (res$status_code == 200){
+      idx1 <- read_html("http://www.alanwood.net/pesticides/index_rn.html")
+      prep_idx <- function(y) {
+        names <- xml_text(xml_find_all(y, "//dl/dt"))
+        links <- xml_attr(
+          xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href")
+        linknames <- xml_text(
+          xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"))
+        return(data.frame(names, links, linknames, stringsAsFactors = FALSE))
+      }
+      aw_idx <- rbind(prep_idx(idx1))
+      aw_idx[["source"]] <- "rn"
+      idx4 <- read_html("http://www.alanwood.net/pesticides/index_cn.html")
+      n <- xml_find_all(idx4, "//a")
+      names <- xml_text(n)
+      rm <- names == ""
+      names <- names[!rm]
+      links <- xml_attr(n, "href")
+      links <- links[!rm]
+      idx4 <- data.frame(names = NA, links = links, linknames = names,
+                         source = "cn", stringsAsFactors = FALSE)
+      aw_idx <- rbind(aw_idx, idx4)
 
-    # fix encoding
-    ln <- aw_idx$linknames
-    Encoding(ln) <- "latin1"
-    ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
-    aw_idx$linknames <- ln
-    attr(aw_idx, "date") <- Sys.Date()
-    if (verbose == TRUE) {
-      message(" Done.")
+      # fix encoding
+      ln <- aw_idx$linknames
+      Encoding(ln) <- "latin1"
+      ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
+      aw_idx$linknames <- ln
+      attr(aw_idx, "date") <- Sys.Date()
+      save(aw_idx, file = paste0(tempdir(), "/data/aw_idx.rda"))
     }
-    save(aw_idx, file = paste0(tempdir(), "/data/aw_idx.rda"))
   }
   return(aw_idx)
 }