Adding indirect valid names in BFO, WFO, and WCVP

LimaRAF · Oct 15, 2024 · 2d31021 · 2d31021
1 parent 0547a34
commit 2d31021
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 3 deletions.
diff --git a/data-raw/get_bfo.R b/data-raw/get_bfo.R
@@ -107,12 +107,51 @@ if (last_updated != last_download) {
 
 
   # Editing data --------------------------------------------------
+  ## adding missing accepted names (IDs cited as accepted but absent from data$id)
+  miss_ids <- unique(
+    data$acceptedNameUsageID[!data$acceptedNameUsageID %in% unique(data$id)])
+  miss_ids <- miss_ids[!miss_ids %in% c("", " ", NA)] # drop empty, blank and NA IDs
+  if (length(miss_ids) > 0) {
+    miss_data <- data[data$acceptedNameUsageID %in% miss_ids, ] # rows citing a missing ID
+    miss_data$taxonID <- miss_data$acceptedNameUsageID # cited ID becomes the record's own ID
+    miss_data$acceptedNameUsageID <- NA
+
+    miss_data$scientificName <- miss_data$acceptedNameUsage # cited name becomes the record's name
+    miss_data$acceptedNameUsage <- NA
+    miss_data$parentNameUsage <- NA
+    miss_data$higherClassification <- NA
+    miss_data$nomenclaturalStatus <- "valid" # new records are flagged valid ...
+    miss_data$taxonomicStatus <- "accepted" # ... and accepted
+
+    miss_data$id <- miss_data$taxonID # keep id in sync with the new taxonID
+    miss_data$parentNameUsageID <- NA # fields below were copied from the citing rows; blanked
+    miss_data$namePublishedInYear <- NA
+    miss_data$namePublishedIn <- NA
+    miss_data$originalNameUsageID <- NA
+    miss_data$genus <- NA
+    miss_data$specificEpithet <- NA
+    miss_data$infraspecificEpithet <- NA
+    miss_data$modified <- NA
+    miss_data$bibliographicCitation <- NA
+    miss_data$references <- NA
+
+    x <- data.frame(scientificName = miss_data$scientificName)
+    tmp <- plantR::fixSpecies(x) # NOTE(review): presumably splits name from authorship - confirm
+
+    miss_data$taxon_name <- tmp$scientificName.new
+    miss_data$scientificNameAuthorship <- 
+      tmp$scientificNameAuthorship.new
+
+    miss_data1 <- unique(miss_data) # drops only fully identical rows
+
+    data <- rbind.data.frame(data, miss_data1) # append the new records to data
+  }
+
   ## filtering and standardizing important column names
   cols <- c("taxonID", "higherClassification" ,"phylum", "family", 
             "taxon_name", "scientificNameAuthorship",
             "taxonRank", "nomenclaturalStatus", "taxonomicStatus", 
             "acceptedNameUsageID", "kingdom", "scientificName") 
-
   data <- as.data.frame(data)[, cols]
   names(data) <- c("id", "higherClassification", "phylum", "family", 
                    "name", "authorship", 
@@ -134,13 +173,14 @@ if (last_updated != last_download) {
     Encoding(data$name[rep_these]) <- "UTF-8"
     data$name[rep_these] <- iconv(data$name[rep_these], "UTF-8", "UTF-8")
   }
-  
+
   ## obtaining the accepted.name column (lookup table built from rows with no accepted.id)
   rep_these <- is.na(data$accepted.id)
   data1 <- data[rep_these, 
                 c("id", "name", "authorship", 
                   "taxon.rank", "taxon.status", "name.status")]
   names(data1)[1] <- "accepted.id" # the record's own id becomes the join key
+  data1 <- data1[!duplicated(data1$accepted.id), ] # keep the first row per key
   tmp <- dplyr::left_join(data, data1, by = "accepted.id")
   stopifnot(identical(tmp$id, data$id)) # should be TRUE; stops if the join changed rows or order
 
@@ -156,6 +196,44 @@ if (last_updated != last_download) {
   data$accepted.taxon.status[!rep_these] <- tmp$taxon.status.y[!rep_these]
   data$accepted.name.status[!rep_these] <- tmp$name.status.y[!rep_these]
 
+  ## Any missing accepted names? (accepted.id is set but accepted.name is still NA)
+  rep_these <- !is.na(data$accepted.id) & is.na(data$accepted.name)
+  if (any(rep_these)) {
+    tmp <- data[rep_these, "accepted.id", drop = FALSE]
+    names(tmp)[1] <- "id" # so the join finds the record each row points to
+    col2rep <- c("accepted.name", "accepted.authorship", 
+                 "accepted.taxon.rank", "accepted.taxon.status", 
+                 "accepted.name.status")
+    data1 <- data[, c("id", "accepted.id", col2rep)]
+    tmp1 <- dplyr::left_join(tmp, data1, by = "id") # accepted.* of the pointed-to record
+
+    check_these <- is.na(tmp1$accepted.name) # still unresolved after one step
+    if (any(check_these)) {
+
+      check_ids <- tmp1$accepted.id[check_these]
+      data1 <- data[match(check_ids, data$accepted.id),] # first row sharing that accepted.id
+      data1 <- unique(data1[data1$taxon.status %in% "accepted", ])
+
+      check_ids <- tmp1$id[check_these]
+      data2 <- data[match(check_ids, data$accepted.id),] # same lookup, using the pointed-to id
+      data2 <- unique(data2[data2$taxon.status %in% "accepted", ])
+      data3 <- unique(rbind.data.frame(data1, data2)) # candidate "accepted" records
+
+      # names(tmp) <- "accepted.id"
+      col2rep1 <- c("name", "authorship", "taxon.rank", 
+                    "taxon.status", "name.status")
+      tmp2 <- dplyr::left_join(tmp1[check_these,], 
+                               data3[, c("accepted.id", col2rep1)], 
+                               by = "accepted.id") # NOTE(review): assumes one data3 row per accepted.id - confirm
+
+      tmp1[check_these, c("accepted.id", col2rep)] <- 
+        tmp2[, c("id", col2rep1)] # accepted.id <- pointed-to id; accepted.* <- candidate's own fields
+    }
+
+    data[rep_these, c("accepted.id", col2rep)] <- 
+      tmp1[, c("accepted.id", col2rep)] # NOTE(review): needs nrow(tmp1) == sum(rep_these), i.e. unique data$id - confirm
+  }
+
   ## Organizing fields
   cols1 <- c("id",
              "higherClassification", 

diff --git a/data-raw/get_wcvp.R b/data-raw/get_wcvp.R
@@ -214,7 +214,7 @@ if (last_updated != last_download) {
   table(data1$name.status, data1$taxon.status)
   names(data1)[1] <- "accepted.id"
   tmp <- dplyr::left_join(data, data1, by = "accepted.id")
-  identical(tmp$id, data$id) # should be TRUE
+  stopifnot(identical(tmp$id, data$id)) # should be TRUE
   data$accepted.name <- NA_character_
   data$accepted.authorship <- NA_character_
   data$accepted.taxon.rank <- NA_character_
@@ -227,6 +227,22 @@ if (last_updated != last_download) {
   data$accepted.taxon.status[!rep_these] <- tmp$taxon.status.y[!rep_these]
   data$accepted.name.status[!rep_these] <- tmp$name.status.y[!rep_these]
 
+  ## Any missing accepted names? (accepted.id set, accepted.* still empty)
+  rep_these <- !data$accepted.id %in% c("", " ", NA) & 
+                  data$accepted.name %in% c("", " ", NA) &
+                    data$id != data$accepted.id
+  if (any(rep_these)) {
+    tmp <- data[rep_these, "accepted.id", drop = FALSE]
+    names(tmp)[1] <- "id" # look up the referenced record by its own id
+    ## fields read from the referenced record and stored in the accepted.* columns
+    col2rep <- c("name", "authorship", "taxon.rank", 
+                  "taxon.status", "name.status")
+    data1 <- data[, c("id", "accepted.id", col2rep)]
+    tmp1 <- dplyr::left_join(tmp, data1, by = "id")
+    stopifnot(nrow(tmp1) == sum(rep_these)) # duplicated ids would add rows in the join
+    data[rep_these, paste0("accepted.", col2rep)] <-  tmp1[, col2rep] # fill accepted.*, keep own name
+  }
+
   ## Organizing fields
   cols1 <- c("id",
              "family", # "genus", "specific.epiteth", "infra.epiteth",

diff --git a/data-raw/get_wfo.R b/data-raw/get_wfo.R
@@ -93,6 +93,44 @@ if (last_updated != last_download) {
   data$accepted.taxon.status[!rep_these] <- tmp$taxon.status.y[!rep_these]
   data$accepted.name.status[!rep_these] <- tmp$name.status.y[!rep_these]
 
+  ## Any missing accepted names? (accepted.id is set but accepted.name is empty or NA)
+  rep_these <- !data$accepted.id %in% c("", " ", NA) & 
+                data$accepted.name %in% c("", " ", NA)
+  if (any(rep_these)) {
+    tmp <- data[rep_these, "accepted.id", drop = FALSE]
+    names(tmp)[1] <- "id" # so the join finds the record each row points to
+    col2rep <- c("accepted.name", "accepted.authorship", 
+                 "accepted.taxon.rank", "accepted.taxon.status", 
+                 "accepted.name.status")
+    data1 <- data[, c("id", "accepted.id", col2rep)]
+    tmp1 <- dplyr::left_join(tmp, data1, by = "id") # accepted.* of the pointed-to record
+
+    check_these <- is.na(tmp1$accepted.name) # still unresolved after one step
+    if (any(check_these)) {
+
+      check_ids <- tmp1$accepted.id[check_these]
+      data1 <- data[match(check_ids, data$accepted.id),] # first row sharing that accepted.id
+      data1 <- unique(data1[data1$taxon.status %in% "accepted", ])
+
+      check_ids <- tmp1$id[check_these]
+      data2 <- data[match(check_ids, data$accepted.id),] # same lookup, using the pointed-to id
+      data2 <- unique(data2[data2$taxon.status %in% "accepted", ])
+      data3 <- unique(rbind.data.frame(data1, data2)) # candidate "accepted" records
+
+      col2rep1 <- c("name", "authorship", "taxon.rank", 
+                    "taxon.status", "name.status")
+      tmp2 <- dplyr::left_join(tmp1[check_these,], 
+                               data3[, c("accepted.id", col2rep1)], 
+                               by = "accepted.id") # NOTE(review): assumes one data3 row per accepted.id - confirm
+
+      tmp1[check_these, c("accepted.id", col2rep)] <- 
+        tmp2[, c("id", col2rep1)] # accepted.id <- pointed-to id; accepted.* <- candidate's own fields
+    }
+
+    data[rep_these, c("accepted.id", col2rep)] <- 
+      tmp1[, c("accepted.id", col2rep)] # NOTE(review): needs nrow(tmp1) == sum(rep_these), i.e. unique data$id - confirm
+  }
+
   ## Organizing fields
   cols1 <- c("id",
              "phylum",

diff --git a/data/bfoNamesAlgae.rda b/data/bfoNamesAlgae.rda
diff --git a/data/bfoNamesBryophyta.rda b/data/bfoNamesBryophyta.rda
diff --git a/data/bfoNamesFungi.rda b/data/bfoNamesFungi.rda
diff --git a/data/bfoNamesTracheophyta.rda b/data/bfoNamesTracheophyta.rda
diff --git a/data/wcvpNames.rda b/data/wcvpNames.rda