new function for combining site and event data and cleanup of summari…

…zing sample data given some database changes since I last worked on this
ecohealthalliance · Apr 29, 2024 · fd2e583 · fd2e583
1 parent 1ae2ff4
commit fd2e583
Show file tree

Hide file tree

Showing 5 changed files with 165 additions and 128 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(process_summarized_specimen_data_for_plotting)
 export(read.gb)
 export(save_tables)
 export(split_tibble)
+export(summarize_event_data)
 export(summarize_virus_data)
 importFrom(dplyr,"%>%")
 importFrom(magrittr,"%<>%")

diff --git a/R/summarize_event_data.R b/R/summarize_event_data.R
@@ -0,0 +1,74 @@
+#' Summarize Event Data
+#'
+#' @param site_data Data frame. Site info
+#' @param event_data Data frame. Sampling occasions (events)
+#' @param site_characterization Data frame. Site characterization (as named on airtable)
+#' @param base base id
+#'
+#' @return List of two data frames (specimen data and sample data)
+#' @export
+summarize_event_data <- function(site_data, event_data, site_characterization, base) {
+
+  ## 1) SABRENet and BEEZ have a bit different info in "Site Data" and "Event Data"
+  ## 2) In SABRENet -- "Sample Data" column 'Site' links appropriately to "Site Data"
+  ##    In BEEZ     -- "Sample Data" column 'Site' is actually a link to "Event Data"
+
+  ## X - Site Data (Site) <<- Site Characterization (id)
+  ## Y - Event Data (Site Data) <<- X (id)
+
+  event_data %<>% dplyr::select(
+    base_id, id, `Date arrived`, `Date depart`
+  )
+
+  site_data %<>% dplyr::select(
+    base_id, id, Site_ID, Site, Event, `Sample Data`
+  ) %>% dplyr::mutate(
+      `Site`  = lapply(`Site`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+    , `Event` = lapply(`Event`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+  )
+
+  site_characterization %<>% dplyr::select(
+    base_id, id, `Site name`
+  )
+
+  site_data.s  <- site_data %>% dplyr::filter(base_id == base[[1]])
+  site_data.b  <- site_data %>% dplyr::filter(base_id == base[[2]])
+
+  event_data.s <- event_data %>% dplyr::filter(base_id == base[[1]])
+  event_data.b <- event_data %>% dplyr::filter(base_id == base[[2]])
+
+  site_characterization.s <- site_characterization %>% dplyr::filter(base_id == base[[1]])
+  site_characterization.b <- site_characterization %>% dplyr::filter(base_id == base[[2]])
+
+  ## X - Site Data (Site) <<- Site Characterization (id)
+  ## Y - Event Data (Site Data) <<- X (id)
+
+  joined.s <- dplyr::left_join(
+      site_data.s
+    , site_characterization.s %>% dplyr::rename(Site = id)
+    , by = c("base_id", "Site")
+  ) %>% dplyr::left_join(
+    .
+    , event_data.s %>% dplyr::rename(Event = id)
+    , by = c("base_id", "Event")
+  ) %>% dplyr::select(
+    base_id, id, Site_ID, `Site name`, `Date arrived`, `Date depart`
+  )
+
+  joined.b <- dplyr::left_join(
+      site_data.b
+    , site_characterization.b %>% dplyr::rename(Site = id)
+    , by = c("base_id", "Site")
+  ) %>% dplyr::left_join(
+    .
+    , event_data.b %>% dplyr::rename(Event = id)
+    , by = c("base_id", "Event")
+  ) %>% dplyr::select(
+    base_id, Event, Site_ID, `Site name`, `Date arrived`, `Date depart`
+  ) %>% dplyr::rename(id = Event)
+
+  return(
+    rbind(joined.s, joined.b) %>% dplyr::rename(Site = id)
+  )
+
+}
diff --git a/R/summarize_virus_data.R b/R/summarize_virus_data.R
@@ -2,137 +2,70 @@
 #'
 #' @param bat_data Data frame. Sample data
 #' @param site_data Data frame. Site info
+#' @param species_taxonomy Data frame. Species taxonomy (as named on airtable)
 #' @param specimen_data Data frame. Specimen data
 #' @param sequence_data Data frame. Sequences (all the already cleaned ones in the Airtable table "Sequences")
 #' @param base_ids list of two Airtable base IDs
 #'
 #' @return List of two data frames (specimen data and sample data)
 #' @export
-summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_data, base_ids) {
+summarize_virus_data <- function(bat_data, site_data, species_taxonomy, specimen_data, sequence_data, base_ids) {
 
   `%notin%` <- Negate(`%in%`)
 
   ## unlist, respecting NULL entries for left_joining further down
   specimen_data %<>%
     dplyr::mutate(
-      `Sample Type`   = lapply(`Sample Type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Laboratory ID` = lapply(`Laboratory ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    ## NOTE: Need to get at the root of this problem, I do not know why just Corona Year Tested is a list and other
-     ## "year tested" columns are not
-    , `Corona Year tested` = lapply(`Corona Year tested`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-  )
+        `Sample Type`   = lapply(`Sample Type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+      , `Laboratory ID` = lapply(`Laboratory ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+      ## NOTE: Need to get at the root of this problem, I do not know why just Corona Year Tested is a list and other
+      ## "year tested" columns are not
+      , `Corona Year tested` = lapply(`Corona Year tested`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+    )
 
-  ## Due to somewhat different database structures regarding how visit information is spread between Site Data and Event Data,
-   ## have to, at least for now, split up and do something a bit different to parse each and stick back together
-    ## !! Would be really nice to change the database structure to match, but it seems that this either difficult or low priority
-  site_data.s  <- site_data %>% dplyr::filter(base_id == base_ids[[1]])
-  site_data.b  <- site_data %>% dplyr::filter(base_id == base_ids[[2]])
-  event_data.s <- event_data %>% dplyr::filter(base_id == base_ids[[1]])
-  event_data.b <- event_data %>% dplyr::filter(base_id == base_ids[[2]])
-
-  bat_data.s <- bat_data %>%
-    dplyr::filter(base_id == base_ids[[1]]) %>%
-    ## drop the lists for species (while retaining nulls) and site data
-    dplyr::mutate(
-      `Common name`                     = lapply(`Common name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Scientific name`                 = lapply(`Scientific name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Animal type`                     = lapply(`Animal type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Species in Environment`          = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Common name (Environmental)`     = lapply(`Common name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Scientific name (Environmental)` = lapply(`Scientific name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Animal type (Environmental)`     = lapply(`Animal type (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , site_data                         = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
-    dplyr::left_join(., site_data.s %>% dplyr::select(-createdTime) %>% dplyr::rename(site_data = id), by = c("site_data", "base_id"))  %>%
+  bat_data %<>%
+    dplyr::select(-createdTime) %>%
     dplyr::mutate(
-      Site_Name = apply(Site_ID %>% matrix(), 1, FUN = function(x) strsplit(x, "_")[[1]][1])
-    , Year     = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
-    , Month    = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
-    , Day      = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
-    , Date     = as.Date(Date)
-    , DOY      = lubridate::yday(Date)
+        `Working Species ID`     = lapply(`Working Species ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+      , `Species in Environment` = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+      , site_data                = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
+      , spec_ID = ifelse(!is.na(`Working Species ID`), `Working Species ID`, `Species in Environment`)
     ) %>%
-    dplyr::select(
-    ## Critical species-level stuff
-      id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name, `Specimen type`
-    , `Animal type`,  `Common name`, `Scientific name`
-    , `Animal type (Environmental)`,  `Common name (Environmental)`, `Scientific name (Environmental)`
-    , `Species in Environment`, Recapture, `Unique Individual`
-    ## Individual-level characteristics
-    , Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
-    ## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
-    ## Specimen Data for joining
+    ## add species info
+    dplyr::left_join(
+      .
+      , species_taxonomy %>% rename(spec_ID = id) %>% dplyr::select(-createdTime)
+      , by = c("base_id", "spec_ID")
     ) %>%
-    dplyr::rename(`Laboratory ID` = id) %>%
+    dplyr::select(-`Working Species ID`, -`Species in Environment`, -spec_ID) %>%
+    dplyr::left_join(., site_data %>% dplyr::rename(site_data = Site), by = c("site_data", "base_id")) %>%
+    dplyr::select(-c(`Site Data`, site_data)) %>%
+    dplyr::mutate(visit_duration = (as.Date(`Date depart`) - as.Date(`Date arrived`)) %>% as.numeric()) %>%
+    dplyr::rename(Date = `Date arrived`, Site_Name = `Site name`, `Scientific name` = `Current classification`) %>%
     dplyr::mutate(
-      ## May have caught all of these by now, but doesnt hurt to double check
-      Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
-    , Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
-    )
-
-  event_data.b %<>%
-    dplyr::mutate(`Site Data` = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
-    dplyr::left_join(site_data.b %>% dplyr::select(-createdTime) %>% dplyr::rename(`Site Data` = id)) %>%
-    dplyr::select(-createdTime, -`Site Data`) %>%
-    dplyr::rename(site_data = id)
-
-  bat_data.b <- bat_data %>%
-    dplyr::filter(base_id == base_ids[[2]]) %>%
-    ## drop the lists for species (while retaining nulls) and site data
-    dplyr::mutate(
-      `Common name`                     = lapply(`Common name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Scientific name`                 = lapply(`Scientific name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Animal type`                     = lapply(`Animal type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Species in Environment`          = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Common name (Environmental)`     = lapply(`Common name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Scientific name (Environmental)` = lapply(`Scientific name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , `Animal type (Environmental)`     = lapply(`Animal type (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
-    , site_data                         = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
-    dplyr::left_join(., event_data.b, by = c("site_data", "base_id")) %>%
-    dplyr::mutate(
-      Year  = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
-    , Month = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
-    , Day   = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
-    , Date  = as.Date(Date)
-    , DOY   = lubridate::yday(Date)
+      , Year     = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
+      , Month    = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
+      , Day      = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
+      , Date     = as.Date(Date)
+      , DOY      = lubridate::yday(Date)
     ) %>%
-    dplyr::rename(Site_Name = Site_ID) %>%
     dplyr::select(
-    ## Critical species-level stuff
-      id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name, `Specimen type`
-    , `Animal type`,  `Common name`, `Scientific name`
-    , `Animal type (Environmental)`,  `Common name (Environmental)`, `Scientific name (Environmental)`
-    , `Species in Environment`, Recapture, `Unique Individual`
-    ## Individual-level characteristics
-    , Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
-    ## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
-     ## Specimen Data for joining
+      ## Critical species-level stuff
+        id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name
+      , `Animal Type`,  `Common name`, `Scientific name`, Family
+      , Recapture, `Unique Individual`
+      ## Individual-level characteristics
+      , Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
+      ## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
+      ## Specimen Data for joining
     ) %>%
     dplyr::rename(`Laboratory ID` = id) %>%
     dplyr::mutate(
-      Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
-    , Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
-    )
-
-  bat_data <- rbind(bat_data.s, bat_data.b)
-
-  ## Combine individual and environmental samples in long form
-  bat_data %<>% dplyr::mutate(
-      `Scientific name` = ifelse(is.na(`Scientific name`) & !is.na(`Scientific name (Environmental)`)
-                                 , `Scientific name (Environmental)`
-                                 , `Scientific name`)
-    , `Common name`     = ifelse(is.na(`Common name`) & !is.na(`Common name (Environmental)`)
-                                 , `Common name (Environmental)`
-                                 , `Common name`)
-    , `Animal type`     = ifelse(is.na(`Animal type`) & !is.na(`Animal type (Environmental)`)
-                                 , `Animal type (Environmental)`
-                                 , `Animal type`)
-    , `Specimen type`   = ifelse(is.na(`Specimen type`) & !is.na(`Animal type`)
-                                 , `Animal type`
-                                 , `Specimen type`)
-    )
-
-  ## Identify environmental samples
-  bat_data %<>%
+      ## May have caught all of these by now, but doesnt hurt to double check
+        Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
+      , Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
+    ) %>%
+    ## Identify environmental samples and a bit other processing
     dplyr::mutate(Recapture = ifelse(is.na(Recapture), FALSE, Recapture)) %>%
     dplyr::mutate(index = seq(dplyr::n())) %>% dplyr::group_by(index) %>%
     dplyr::mutate(env = ifelse(grep("E", Laboratory_ID) %>% length() > 0, 1, 0)) %>%
@@ -142,11 +75,11 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
   ## pull just some columns for visuals
   specimen_data %<>%
     dplyr::select(
-     `Specimen ID`, `Sample Type`, `Laboratory ID`
-   , tidyselect::all_of(tidyselect::ends_with("Outcome"))
-   , tidyselect::all_of(tidyselect::ends_with("Year tested"))
-   ) %>%
-   dplyr::left_join(., bat_data)
+      `Specimen ID`, `Sample Type`, `Laboratory ID`
+      , tidyselect::all_of(tidyselect::ends_with("Outcome"))
+      , tidyselect::all_of(tidyselect::ends_with("Year tested"))
+    ) %>%
+    dplyr::left_join(., bat_data)
 
   ## ! To resolve when the issue with multiple wells is resolved
   specimen_data <- specimen_data[!duplicated(specimen_data), ]
@@ -155,22 +88,25 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
   outcomes <- specimen_data %>%
     tidyr::pivot_longer(
       .
-    , c(tidyselect::all_of(tidyselect::ends_with("Outcome")))
-    , names_to = "Virus", values_to = "Result") %>%
+      , c(tidyselect::all_of(tidyselect::ends_with("Outcome")))
+      , names_to = "Virus", values_to = "Result") %>%
     dplyr::select(-tidyselect::all_of(tidyselect::ends_with("Year tested"))) %>%
     dplyr::mutate(
       Virus = apply(Virus %>% matrix(), 1, FUN = function(x) strsplit(x, " Outcome")[[1]][1])
-    , Virus = plyr::mapvalues(Virus, from = "Filo Real-time", to = "Filo")
+      , Virus = plyr::mapvalues(Virus, from = "Filo Real-time", to = "Filo")
     ) %>%
     dplyr::filter(!is.na(Result)) %>%
-    dplyr:: mutate(Result_n = ifelse(((Result %notin% c("Negative", "Inconclusive", "Excluded from testing")) & !is.na(Result)), 1, 0)) %>%
+    dplyr:: mutate(Result_n = ifelse(((Result %notin% c(
+      "Negative", "Inconclusive", "Excluded from testing"
+      , "Requires cloning", "Unknown"
+    )) & !is.na(Result)), 1, 0)) %>%
     dplyr::filter(Result != "Excluded from testing")
 
   yrs_tested <- specimen_data %>%
     tidyr::pivot_longer(
       .
-    , c(tidyselect::all_of(tidyselect::ends_with("Year tested")))
-    , names_to = "Virus", values_to = "Year Tested") %>%
+      , c(tidyselect::all_of(tidyselect::ends_with("Year tested")))
+      , names_to = "Virus", values_to = "Year Tested") %>%
     dplyr::select(-tidyselect::all_of(tidyselect::ends_with("Outcome"))) %>%
     dplyr::mutate(Virus = apply(Virus %>% matrix(), 1, FUN = function(x) strsplit(x, " Year tested")[[1]][1])) %>%
     dplyr::filter(!is.na(`Year Tested`))
@@ -182,8 +118,8 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
     ## If any exist, if not, ignored
     Virus = plyr::mapvalues(
       Virus
-    , from = c("Filo Real-time", "PAR", "Influ")
-    , to = c("Filo", "Paramyxo", "Flu")
+      , from = c("Filo Real-time", "PAR", "Influ")
+      , to = c("Filo", "Paramyxo", "Flu")
     )
   ) %>% dplyr::rename(Species = `Scientific name`) %>%
     dplyr::mutate(index = seq(dplyr::n())) %>% dplyr::group_by(index) %>%
@@ -193,17 +129,17 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
   ## Add sequence data (cleaned from tables "Sequences" from both bases)
   specimen_data %<>%
     dplyr::left_join(
-     .
-    , sequence_data %>%
-       dplyr::select(Virus, Specimen_ID, Sequence) %>%
-       dplyr::mutate(Specimen_ID = lapply(Specimen_ID, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
-       dplyr::rename(`Specimen ID` = Specimen_ID)
+      .
+      , sequence_data %>%
+        dplyr::select(Virus, Specimen_ID, Sequence) %>%
+        dplyr::mutate(Specimen_ID = lapply(Specimen_ID, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
+        dplyr::rename(`Specimen ID` = Specimen_ID)
     )
 
   return(
     list(
       specimen_data = specimen_data
-    , sample_data   = bat_data
+      , sample_data   = bat_data
     )
   )
 

diff --git a/man/summarize_event_data.Rd b/man/summarize_event_data.Rd
diff --git a/man/summarize_virus_data.Rd b/man/summarize_virus_data.Rd