Skip to content

Commit

Permalink
Merge pull request #271 from stitam/retry
Browse files Browse the repository at this point in the history
Update requests to use httr::RETRY and harmonise verbose messages
  • Loading branch information
stitam authored Sep 17, 2020
2 parents 69197f3 + b0d9c0e commit 0cbdb29
Show file tree
Hide file tree
Showing 31 changed files with 1,302 additions and 937 deletions.
2 changes: 2 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ Some consistency guidelines:

13. SMILES strings may use special characters like "#". `URLencode()` does not encode this as "%23" by default, so use `URLencode(query, reserved = TRUE)` instead. It's important to note that it's the query that has to be encoded like this, not the full url.

14. Print verbose messages. Use `httr::message_for_status()` and `webchem_message()` functions to generate standard messages when possible.

### Data Sources

You might think all webscraping is perfectly legal but it is unfortunately not that simple.
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ S3method(smiles,pc_prop)
S3method(smiles,wd_ident)
export(as.cas)
export(aw_query)
export(build_aw_idx)
export(cas)
export(chebi_comp_entity)
export(ci_query)
Expand Down
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# webchem 1.0.0.9005
# webchem 1.0.0.9010

## NEW FEATURES

Expand All @@ -8,6 +8,8 @@

## MINOR IMPROVEMENTS

* Most functions now use httr::RETRY() to access webservices.
* Verbose messages are now harmonized.
* The `"type"` argument in `ci_query()` and `aw_query()` has been changed to `"from"` for consistency with other functions
* `fn_percept()` and `cts_compinfo()` now have `"query"` and `"from"` arguments for consistency with other functions
* Possible values for `"from"` have been made more consistent across functions
Expand Down
231 changes: 126 additions & 105 deletions R/alanwood.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@
#' @param query character; search string
#' @param from character; type of input ('cas' or 'name')
#' @param verbose logical; print message during processing to console?
#' @param force_build logical; force building a new index? See
#' \code{\link{build_aw_idx}} for more details.
#' @param ... currently unused.
#' @param ... additional arguments to internal utility functions
#' @param type deprecated
#' @return A list of eight entries: common-name, status, preferred IUPAC Name,
#' IUPAC Name, cas, formula, activity, subactivity, inchikey, inchi and source
Expand All @@ -35,25 +33,29 @@
#' # use CAS-numbers
#' aw_query("79622-59-6", from = 'cas')
#' }
#' @seealso \code{\link{build_aw_idx}}

aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
force_build = FALSE, type, ...) {
type, ...) {

if (ping_service("aw") == FALSE) stop(webchem_message("service_down"))

if (!missing(type)) {
message('"type" is deprecated. Please use "from" instead. ')
from <- type
}

if ("commonname" %in% from) {
warning('To search by compound name use "name" instead of "commonname"')
message('To search by compound name use "name" instead of "commonname"')
from <- "name"
}
from <- match.arg(from)
aw_idx <- build_aw_idx(verbose, force_build)
aw_idx <- build_aw_idx(verbose, ...)

foo <- function(query, from, verbose) {
on.exit(suppressWarnings(closeAllConnections()))

if (is.na(query)) {
if (verbose) webchem_message("na")
return(NA)
}
if (verbose) webchem_message("query", query, appendLF = FALSE)
# search links in indexes
if (from == "name") {
links <- aw_idx$links[aw_idx$source == "cn"]
Expand All @@ -70,80 +72,92 @@ aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
}

takelink <- links[tolower(names) == tolower(query)]
if (is.na(query)) takelink <- vector()
if (length(takelink) == 0) {
message("Not found! Returning NA.\n")
if (verbose) message("Not found.")
return(NA)
}
if (length(takelink) > 1) {
takelink <- unique(takelink)
if (length(takelink) > 1) {
message("More then one link found! Returning first.\n")
message("More then one link found. Returning first.")
takelink <- takelink[1]
}
}
if (verbose)
message("Querying ", takelink)

qurl <- paste0("http://www.alanwood.net/pesticides/", takelink)
Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10))
ttt <- read_html(paste0("http://www.alanwood.net/pesticides/", takelink))

status <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td"))
pref_iupac_name <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td"))
iupac_name <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td"))
cas <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td"))
formula <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td"))
activity <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
subactivity <- trimws(
strsplit(gsub("^.*\\((.*)\\)", "\\1", activity), ";")[[1]])
activity <- gsub("^(.*) \\(.*\\)", "\\1", activity)
inchikey_r <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td"))
if (length(inchikey_r) == 0) {
inchikey <- NA
} else {
if (grepl("isomer", inchikey_r)) {
inchikey <- c(
s_isomer = gsub(
".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r),
r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r))
}
if (grepl("identifier", inchikey_r)) {
inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r),
gsub(".*identifier.*:(.*)", "\\1", inchikey_r))
names(inchikey) <- c("inchikey",
gsub(".*(identifier.*:).*", "\\1", inchikey_r)
)
}
if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r))
inchikey <- inchikey_r
res <- try(httr::RETRY("GET",
qurl,
httr::user_agent(webchem_url()),
terminate_on = 404,
quiet = TRUE), silent = TRUE)
if (inherits(res, "try-error")) {
if (verbose) webchem_message("service_down")
return(NA)
}
if (verbose) message(httr::message_for_status(res))
if (res$status_code == 200){
ttt <- read_html(res)
status <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r1']/following-sibling::td"))
pref_iupac_name <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r2']/following-sibling::td"))
iupac_name <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r3']/following-sibling::td"))
cas <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r5']/following-sibling::td"))
formula <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r6']/following-sibling::td"))
activity <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r7']/following-sibling::td"))
subactivity <- trimws(
strsplit(gsub("^.*\\((.*)\\)", "\\1", activity), ";")[[1]])
activity <- gsub("^(.*) \\(.*\\)", "\\1", activity)
inchikey_r <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r11']/following-sibling::td"))
if (length(inchikey_r) == 0) {
inchikey <- NA
} else {
if (grepl("isomer", inchikey_r)) {
inchikey <- c(
s_isomer = gsub(
".*\\(S\\)-isomer:(.*)(minor component.*)", "\\1", inchikey_r),
r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchikey_r))
}
if (grepl("identifier", inchikey_r)) {
inchikey <- c(gsub("(.*)identifier.*", "\\1", inchikey_r),
gsub(".*identifier.*:(.*)", "\\1", inchikey_r))
names(inchikey) <- c("inchikey",
gsub(".*(identifier.*:).*", "\\1", inchikey_r)
)
}
if (!grepl("isomer", inchikey_r) & !grepl("identifier", inchikey_r))
inchikey <- inchikey_r
}

inchi <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td"))
if (length(inchi) == 0) {
inchi <- NA
} else {
if (grepl("isomer", inchi)) {
inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)",
"\\1", inchi),
r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi))
inchi <- xml_text(
xml_find_all(ttt, "//tr/th[@id='r12']/following-sibling::td"))
if (length(inchi) == 0) {
inchi <- NA
} else {
if (grepl("isomer", inchi)) {
inchi <- c(s_isomer = gsub(".*\\(S\\)-isomer:(.*)(minor component.*)",
"\\1", inchi),
r_isomer = gsub(".*\\(R\\)-isomer:(.*)", "\\1", inchi))
}
}
# add source url
source_url <- paste0("http://www.alanwood.net/pesticides/", takelink)
out <- list(cname = cname, status = status,
pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
cas = cas, formula = formula, activity = activity,
subactivity = subactivity, inchikey = inchikey, inchi = inchi,
source_url = source_url)
return(out)
}
else {
return(NA)
}
# add source url
source_url <- paste0("http://www.alanwood.net/pesticides/", takelink)
out <- list(cname = cname, status = status,
pref_iupac_name = pref_iupac_name, iupac_name = iupac_name,
cas = cas, formula = formula, activity = activity,
subactivity = subactivity, inchikey = inchikey, inchi = inchi,
source_url = source_url)
return(out)
}
out <- lapply(query, function(x) foo(x, from = from, verbose = verbose))
out <- setNames(out, query)
Expand All @@ -155,19 +169,18 @@ aw_query <- function(query, from = c("name", "cas"), verbose = TRUE,
#'
#' This function builds an index of Alan Woods Compendium of Pesticides
#' \url{http://www.alanwood.net/pesticides} and saves it to
#' \code{\link{tempdir}}. This is a utility function for
#' \code{\link{aw_query}} and will not be exported in future releases.
#' \code{\link{tempdir}}.
#' @import xml2
#'
#' @param verbose logical; print message during processing to console?
#' @param force_build logical; force building a new index?
#' @return a data.frame
#' @seealso \code{\link{aw_query}}, \code{\link{tempdir}}
#' @author Eduard Szöcs, \email{eduardszoecs@@gmail.com}
#' @source \url{http://www.alanwood.net/pesticides}
#' @export
#' @noRd
build_aw_idx <- function(verbose = TRUE, force_build = FALSE) {
on.exit(suppressWarnings(closeAllConnections()))
message(msg = "build_aw_idx() will not be exported in future releases.")
if (ping_service("aw") == FALSE) stop(webchem_message("service_down"))
suppressWarnings(try(load(paste0(tempdir(), "/data/aw_idx.rda")),
silent = TRUE))
if (!file.exists(paste0(tempdir(), "/data/aw_idx.rda")) |
Expand All @@ -176,40 +189,48 @@ build_aw_idx <- function(verbose = TRUE, force_build = FALSE) {
if (!dir.exists(paste0(tempdir(), "/data"))) {
dir.create(paste0(tempdir(), "/data"))
}
if (verbose == TRUE) {
message("Building index.", appendLF = FALSE)
}
idx1 <- read_html("http://www.alanwood.net/pesticides/index_rn.html")
prep_idx <- function(y) {
names <- xml_text(xml_find_all(y, "//dl/dt"))
links <- xml_attr(
xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href")
linknames <- xml_text(xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"))
return(data.frame(names, links, linknames, stringsAsFactors = FALSE))
if (verbose) message("Building index. ", appendLF = FALSE)
res <- try(httr::RETRY("GET",
"http://www.alanwood.net/pesticides/index_rn.html",
httr::user_agent(webchem_url()),
terminate_on = 404,
quiet = TRUE), silent= TRUE)
if (inherits(res, "try-error")) {
if (verbose) webchem_message("service_down")
return(NA)
}
aw_idx <- rbind(prep_idx(idx1))
aw_idx[["source"]] <- "rn"
idx4 <- read_html("http://www.alanwood.net/pesticides/index_cn.html")
n <- xml_find_all(idx4, "//a")
names <- xml_text(n)
rm <- names == ""
names <- names[!rm]
links <- xml_attr(n, "href")
links <- links[!rm]
idx4 <- data.frame(names = NA, links = links, linknames = names,
source = "cn", stringsAsFactors = FALSE)
aw_idx <- rbind(aw_idx, idx4)
if (verbose) message(httr::message_for_status(res))
if (res$status_code == 200){
idx1 <- read_html("http://www.alanwood.net/pesticides/index_rn.html")
prep_idx <- function(y) {
names <- xml_text(xml_find_all(y, "//dl/dt"))
links <- xml_attr(
xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"), "href")
linknames <- xml_text(
xml_find_all(y, "//dt/following-sibling::dd[1]/a[1]"))
return(data.frame(names, links, linknames, stringsAsFactors = FALSE))
}
aw_idx <- rbind(prep_idx(idx1))
aw_idx[["source"]] <- "rn"
idx4 <- read_html("http://www.alanwood.net/pesticides/index_cn.html")
n <- xml_find_all(idx4, "//a")
names <- xml_text(n)
rm <- names == ""
names <- names[!rm]
links <- xml_attr(n, "href")
links <- links[!rm]
idx4 <- data.frame(names = NA, links = links, linknames = names,
source = "cn", stringsAsFactors = FALSE)
aw_idx <- rbind(aw_idx, idx4)

# fix encoding
ln <- aw_idx$linknames
Encoding(ln) <- "latin1"
ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
aw_idx$linknames <- ln
attr(aw_idx, "date") <- Sys.Date()
if (verbose == TRUE) {
message(" Done.")
# fix encoding
ln <- aw_idx$linknames
Encoding(ln) <- "latin1"
ln <- iconv(ln, from = "latin1", to = "ASCII", sub = "")
aw_idx$linknames <- ln
attr(aw_idx, "date") <- Sys.Date()
save(aw_idx, file = paste0(tempdir(), "/data/aw_idx.rda"))
}
save(aw_idx, file = paste0(tempdir(), "/data/aw_idx.rda"))
}
return(aw_idx)
}
Loading

0 comments on commit 0cbdb29

Please sign in to comment.