
Commit

Updating all taxonomic backbones (bug with accepted names) and documentation
LimaRAF committed May 31, 2024
1 parent 5aea91e commit ea527f6
Showing 23 changed files with 148 additions and 68 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -25,7 +25,6 @@ inst/doc
*.prj
*.shp
*.shx
data-raw
data-raw/latam
data-raw/wfo
data-raw/wcvp
17 changes: 7 additions & 10 deletions DESCRIPTION
@@ -1,20 +1,16 @@
Package: plantRdata
Type: Package
Title: Accessory Datasets for Package plantR
Version: 0.0.0.9000
Version: 0.0.1
Authors@R: c(
person(given = "Renato A.",
family = "Ferreira de Lima",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-1048-0138")),
person(given = "Andrea",
family = "Sánchez-Tapia",
role = c("aut"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-3521-4338")))
comment = c(ORCID = "0000-0002-1048-0138")))
Maintainer: Renato A. Ferreira de Lima <[email protected]>
Description: Creates reproducible datasets and maps for use by package plantR, a package for managing records from biological collections.
Description: Creates reproducible datasets and maps for use by package plantR,
a package for managing records from biological collections.
URL: https://github.com/LimaRAF/plantRdata
BugReports: https://github.com/LimaRAF/plantRdata/issues
License: GPL (>= 3)
@@ -24,13 +20,14 @@ RoxygenNote: 7.3.1
Config/testthat/edition: 3
VignetteBuilder: knitr
Depends:
R (>= 2.10)
R (>= 3.5.0)
Imports:
here,
httr,
rvest
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
LazyData: true
LazyDataCompression: xz
LazyDataCompression: xz
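A note on the DESCRIPTION changes above: `usethis::use_data()` writes package data with serialization format 3, which can only be read by R >= 3.5.0; together with `LazyDataCompression: xz` this is presumably what motivates the bump from `R (>= 2.10)` to `R (>= 3.5.0)`. A minimal sketch of the saving step, mirroring the call used in data-raw/get_wcvp.R later in this commit (the `version = 3` default of `use_data()` is an assumption of this note, not stated in the diff):

# Save a compiled backbone as xz-compressed lazy data; use_data() serializes
# with format version 3, which is why the package now depends on R >= 3.5.0.
usethis::use_data(wcvpNames, compress = "xz", overwrite = TRUE)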
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(loadData)
importFrom(here,here)
importFrom(httr,GET)
importFrom(httr,content)
importFrom(rvest,html_element)
6 changes: 3 additions & 3 deletions NEWS.md
@@ -1,10 +1,10 @@
# plantRdata 0.0.0.9000
# plantRdata 0.0.1

* Creation of the package and adding the `NEWS.md` file to track changes.

* I wrote the workflow for latamMaps. Rocc works perfectly; I formatted lightly just to check the flow and the object sizes. That's going to be very tricky. We are losing set_crs somewhere along the way; maybe it's PROJ, maybe it's something else. The package weighs 10 MB. I am avoiding the complicated formatting on purpose.
* Addition of the first taxonomic backbones: WFO, WCVP and GBIF

* Created a preliminary `world` object to check the workflow. The package is heavier than it should be, but installation is OK. By setting plantRmaps::world, shares_border() passes all the tests, even for the French Guiana problem.
* Creation of the internal functions to help compile and save the datasets



6 changes: 3 additions & 3 deletions R/gbifNames.R
@@ -19,7 +19,7 @@
#' @source \url{https://hosted-datasets.gbif.org/datasets/backbone/current/}
#' @evalRd .readScript("data-raw/gbif/last_update.txt",
#' "Last update/change of the downloaded backbone (year-month-day):")
#' @format An object of class \code{data.frame} with 12 columns and over 1.9 million rows.
#' @format An object of class \code{data.frame} with 13 columns and over 1.9 million rows.
#' @evalRd .readScript("data-raw/gbif/citation.txt", "", "references")
#'
"gbifNamesPlantae"
@@ -45,7 +45,7 @@
#' @source \url{https://hosted-datasets.gbif.org/datasets/backbone/current/}
#' @evalRd .readScript("data-raw/gbif/last_update.txt",
#' "Last update/change of the downloaded backbone (year-month-day):")
#' @format An object of class \code{data.frame} with 12 columns and almost 400 thousand rows.
#' @format An object of class \code{data.frame} with 13 columns and almost 400 thousand rows.
#' @evalRd .readScript("data-raw/gbif/citation.txt", "", "references")
#'
"gbifNamesFungi"
@@ -71,7 +71,7 @@
#' @source \url{https://hosted-datasets.gbif.org/datasets/backbone/current/}
#' @evalRd .readScript("data-raw/gbif/last_update.txt",
#' "Last update/change of the downloaded backbone (year-month-day):")
#' @format An object of class \code{data.frame} with 12 columns and over 3.7 million rows.
#' @format An object of class \code{data.frame} with 13 columns and over 3.7 million rows.
#' @evalRd .readScript("data-raw/gbif/citation.txt", "", "references")
#'
"gbifNamesAnimalia"
24 changes: 23 additions & 1 deletion R/internals.R
@@ -58,8 +58,30 @@
}
}


#'
#' @title Store External Data
#'
#' @param data an R object. The data to be saved in `inst/extdata`.
#' @param source a character. The name of the data source (e.g.
#' "wfo"), which corresponds to the new subfolder in `inst/extdata`.
#' @param name a character. The name of the file that will contain
#' the data (e.g. "wfoNames").
#'
#' @keywords internal
#'
#' @importFrom here here
#'
#' @noRd
#'
.storeData <- function(data, source = "wcvp", name = "wcvpNames") {
dir <- here::here("inst", "extdata", source)
if (!dir.exists(dir))
dir.create(dir)

save(data, file = file.path(dir, paste0(name, ".rda")),
compress = "xz")
}
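# Editor's sketch (not part of the diff): .storeData() is intended to be called
# at the end of a data-raw script to write a compiled backbone to
# inst/extdata/<source>/<name>.rda instead of data/, as in the commented-out
# call in data-raw/get_wcvp.R further down in this commit:
# .storeData(data, source = backbone, name = paste0(backbone, "Names"))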

#' @title Read File in Help
#'
#' @param file a path to the file with the script to be read
5 changes: 3 additions & 2 deletions R/wcvpNames.R
@@ -3,7 +3,7 @@
#' @description A dataset containing the most relevant taxonomic
#' information of plant names stored in the [World Checklist of Vascular Plants](https://powo.science.kew.org/),
#' taxonomic backbone including vascular plants (i.e. lycophytes,
#' ferns, gimnosperms and angiosperms) and all taxonomic levels
#' ferns, gymnosperms and angiosperms) and all taxonomic levels
#' (i.e. infra-species, species, genus, family and so on).
#' Bryophytes are not included. \cr\cr The original backbone was
#' slightly edited aiming to standardize the notation across
@@ -20,7 +20,8 @@
#' @source \url{https://sftp.kew.org/pub/data-repositories/WCVP/}
#' @evalRd .readScript("data-raw/wcvp/last_update.txt",
#' "Last update/change of the downloaded backbone (year-month-day):")
#' @format An object of class \code{data.frame} with 10 columns and over 1.4 million rows.
#' @format An object of class \code{data.frame} with 11 columns and
#' over 1.4 million rows.
#' @references
#' Govaerts, R., Nic Lughadha, E., Black, N. et al. (2021). The
#' World Checklist of Vascular Plants, a continuously updated
5 changes: 3 additions & 2 deletions R/wfoNames.R
@@ -3,7 +3,7 @@
#' @description A dataset containing the most relevant taxonomic
#' information of plant names stored in the [World Flora Online](https://www.worldfloraonline.org/),
#' taxonomic backbone including bryophytes and vascular plants (i.e.
#' licophytes, ferns, gimnosperms and angiosperms) and all taxonomic
#' lycophytes, ferns, gymnosperms and angiosperms) and all taxonomic
#' levels (i.e. infra-species, species, genus, family and so
#' on).\cr\cr The original backbone was slightly edited aiming to
#' standardize the notation across backbones provided in
@@ -16,11 +16,12 @@
#'
#' @keywords datasets
#' @name wfoNames
#' @aliases wfoNames
#' @usage data(wfoNames)
#' @source \url{https://files.worldfloraonline.org/files/WFO_Backbone/_WFOCompleteBackbone/}
#' @evalRd .readScript("data-raw/wfo/last_update.txt",
#' "Last update of the downloaded backbone (month/day/year):")
#' @format An object of class \code{data.frame} with 11 columns and over 1.5 million rows.
#' @format An object of class \code{data.frame} with 12 columns and over 1.5 million rows.
#' @references
#' Borsch, T., Berendsohn, W., Dalcin, E., et al. (2020). World
#' Flora Online: Placing taxonomists at the heart of a definitive
27 changes: 19 additions & 8 deletions data-raw/get_gbif.R
@@ -28,7 +28,7 @@ last_download <- readLines(file.path(here::here(), "data-raw", backbone,
if (last_updated != last_download) {
## download the latest taxonomic backbone (in browser or using the code below)
url <- paste0(url0, zip)
options(timeout = max(300, getOption("timeout")))
options(timeout = max(600, getOption("timeout")))
utils::download.file(url = url, destfile = path, mode = "wb")

## unzipping the data
@@ -67,16 +67,20 @@ if (last_updated != last_download) {
data$name <- .squish(data$name)

## obtaining the accepted.name column
data1 <- data[, c("id", "name", "authorship", "taxon.rank")]
rep_these <- is.na(data$accepted.id)
data1 <- data[rep_these, c("id", "name", "authorship",
"taxon.rank", "name.status")]
names(data1)[1] <- "accepted.id"
tmp <- dplyr::left_join(data, data1, by = "accepted.id")
identical(tmp$id, data$id) # should be TRUE
rep_these <- !data$accepted.id %in% c("", " ", NA, "NA")
data$accepted.name <- NA_character_
data$accepted.authorship <- NA_character_
data$accepted.taxon.rank <- NA_character_
data$accepted.name[rep_these] <- paste(tmp$name.x[rep_these],
tmp$authorship.x[rep_these])
data$accepted.taxon.rank[rep_these] <- tmp$taxon.rank.x[rep_these]
data$accepted.name.status <- NA_character_
data$accepted.name[!rep_these] <- tmp$name.y[!rep_these]
data$accepted.authorship[!rep_these] <- tmp$authorship.y[!rep_these]
data$accepted.taxon.rank[!rep_these] <- tmp$taxon.rank.y[!rep_these]
data$accepted.name.status[!rep_these] <- tmp$name.status.y[!rep_these]
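## Editor's sketch (not part of the commit): a toy version of the corrected
## lookup above. Accepted entries (accepted.id is NA in the GBIF backbone) act
## as a lookup table keyed by their own id, so synonyms inherit the accepted
## name rather than their own name (the "bug with accepted names" being fixed).
toy <- data.frame(id = c("1", "2"),
                  name = c("Aus bus", "Aus cus"),
                  authorship = c("L.", "Mill."),
                  accepted.id = c(NA, "1"))
acc <- toy[is.na(toy$accepted.id), c("id", "name", "authorship")]
names(acc)[1] <- "accepted.id"
tmp2 <- dplyr::left_join(toy, acc, by = "accepted.id") # adds name.y, authorship.y
syn <- !is.na(toy$accepted.id)
toy$accepted.name <- NA_character_
toy$accepted.name[syn] <- tmp2$name.y[syn]
toy$accepted.name # NA for the accepted row, "Aus bus" for the synonym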

## Organizing fields
cols1 <- c("id",
@@ -89,15 +93,19 @@ if (last_updated != last_download) {
"taxon.rank", # species, genus, family, order, etc.
"taxon.status", # accepted or synonym
"name.status", # correct, ilegitimate, legitimate, but incorrect, orthographical variant, missapplied, not validly published, rejected
"accepted.name", #accepted binomial + authors
"accepted.taxon.rank")
"accepted.name", #accepted canonical
"accepted.authorship", #accepted authors
"accepted.taxon.rank",
"accepted.name.status")
data <- data[, cols1]

## Basic standardization of notation
data$taxon.rank <- tolower(data$taxon.rank)
data$taxon.status <- tolower(data$taxon.status)
data$name.status <- tolower(data$name.status)
data$accepted.taxon.rank <- tolower(data$accepted.taxon.rank)
data$accepted.name.status <- tolower(data$accepted.name.status)


# Saving ------------------------------------------------------------
reinos <- c("Plantae", "Fungi", "Animalia")
@@ -111,6 +119,9 @@ if (last_updated != last_download) {
data <- data[!duplicated(paste0(data$kingdom, data$scientific.name)), ]
data <- data[order(data$id), ]

## Removing the combined name + authorship column
data <- data[, -which(names(data) %in% "scientific.name")]

## Adding source acronym to the backbone ID
data$id <- paste0(backbone, "-", data$id)

52 changes: 32 additions & 20 deletions data-raw/get_wcvp.R
@@ -53,26 +53,33 @@ if (last_updated != last_download) {
## filtering and standardizing important column names
cols <- c("plant_name_id", "family", "taxon_name", "taxon_authors",
"taxon_rank", "nomenclatural_remarks", "taxon_status",
"accepted_plant_name_id")
"accepted_plant_name_id", "powo_id")

data <- as.data.frame(data)[, cols]
names(data) <- c("id", "family", "name", "authorship", "taxon.rank",
"name.status", "taxon.status", "accepted.id")
"name.status", "taxon.status", "accepted.id", "id.powo")

## obtaining the scientific.name (taxon names + authors)
data$scientific.name <-
.buildName(data, col.names = c("name", "authorship"))

## obtaining the accepted.name column
data1 <- data[, c("id", "name", "authorship", "taxon.rank")]
names(data1)[1] <- "accepted.id"
rep_these <- data$id == data$accepted.id
rep_these[is.na(rep_these)] <- FALSE
data1 <- data[rep_these,
c("id", "name", "authorship",
"taxon.rank", "name.status")]
names(data1)[1] <- "accepted.id"
tmp <- dplyr::left_join(data, data1, by = "accepted.id")
identical(tmp$id, data$id) # should be TRUE
rep_these <- !data$accepted.id %in% c("", " ", NA, "NA")
data$accepted.name <- NA_character_
data$accepted.authorship <- NA_character_
data$accepted.taxon.rank <- NA_character_
data$accepted.name[rep_these] <- paste(tmp$name.x[rep_these],
tmp$authorship.x[rep_these])
data$accepted.taxon.rank[rep_these] <- tmp$taxon.rank.x[rep_these]
data$accepted.name.status <- NA_character_
data$accepted.name[!rep_these] <- tmp$name.y[!rep_these]
data$accepted.authorship[!rep_these] <- tmp$authorship.y[!rep_these]
data$accepted.taxon.rank[!rep_these] <- tmp$taxon.rank.y[!rep_these]
data$accepted.name.status[!rep_these] <- tmp$name.status.y[!rep_these]
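## Editor's note (sketch, not in the commit): the identical() check above only
## prints TRUE or FALSE; a stricter guard would stop the script if the left
## join duplicated or reordered rows, which would happen if 'accepted.id' were
## not unique in 'data1':
stopifnot(!anyDuplicated(data1$accepted.id), # lookup keys must be unique
          identical(tmp$id, data$id))        # join kept row order and length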

## Organizing fields
cols1 <- c("id",
@@ -83,15 +90,18 @@ if (last_updated != last_download) {
"taxon.rank", # species, genus, family, order, etc.
"taxon.status", # accepted or synonym
"name.status", # correct, ilegitimate, legitimate, but incorrect, orthographical variant, missapplied, not validly published, rejected
"accepted.name", #accepted binomial + authors
"accepted.taxon.rank")
"accepted.name", #accepted canonical
"accepted.authorship", #accepted authors
"accepted.taxon.rank",
"accepted.name.status")
data <- data[, cols1]

## Basic standardization of notation
data$taxon.rank <- tolower(data$taxon.rank)
data$taxon.status <- tolower(data$taxon.status)
data$name.status <- tolower(data$name.status)
data$accepted.taxon.rank <- tolower(data$accepted.taxon.rank)
data$accepted.name.status <- tolower(data$accepted.name.status)

# further editing of name status
status <- .squish(data$name.status)
@@ -134,25 +144,27 @@
data <- data[!duplicated(data$scientific.name), ]
data <- data[order(data$id), ]

## Removing the combined name + authorship column
data <- data[, -which(names(data) %in% "scientific.name")]

## Adding source acronym to the backbone ID
data$id <- paste0(backbone, "-", data$id)

## How many columns and lines (in April 2024: 1,421,040)
## How many columns and lines (in April 2024: 1,421,040; May 2024: 1,429,871)
dimensions <- paste0(dim(data)[1], " rows and ", dim(data)[2], " columns")

## Saving
# .storeData(data, source= backbone, name= paste0(backbone, "Names"))
wcvpNames <- data
usethis::use_data(wcvpNames, compress = "xz", overwrite=TRUE)

path_to_save <- file.path(here::here(), "data-raw", backbone,
"last_update.txt")
write(last_update, path_to_save)
path_to_save <- file.path(here::here(), "data-raw", backbone,
"version.txt")
write(version, path_to_save)
path_to_save <- file.path(here::here(), "data-raw", backbone,
"df_dim.txt")
write(dimensions, path_to_save)
data_folder <- "data-raw" # c("inst", "extdata")
path_folder <- file.path(here::here(),
paste0(data_folder, collapse = .Platform$file.sep),
backbone)
write(last_update, file.path(path_folder, "last_update.txt"))
write(version, file.path(path_folder, "version.txt"))
write(dimensions, file.path(path_folder, "df_dim.txt"))
unlink(path)
}
rm(list = ls())
