Skip to content

Commit

Permalink
new function for combining site and event data and cleanup of summari…
Browse files Browse the repository at this point in the history
…zing sample data given some database changes since I last worked on this
  • Loading branch information
Morgan Kain authored and Morgan Kain committed Apr 29, 2024
1 parent 1ae2ff4 commit fd2e583
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 128 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export(process_summarized_specimen_data_for_plotting)
export(read.gb)
export(save_tables)
export(split_tibble)
export(summarize_event_data)
export(summarize_virus_data)
importFrom(dplyr,"%>%")
importFrom(magrittr,"%<>%")
Expand Down
74 changes: 74 additions & 0 deletions R/summarize_event_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' Summarize Event Data
#'
#' @param site_data Data frame. Site info
#' @param event_data Data frame. Sampling occasions (events)
#' @param site_characterization Data frame. Site characterization (as named on airtable)
#' @param base base id
#'
#' @return List of two data frames (specimen data and sample data)
#' @export
summarize_event_data <- function(site_data, event_data, site_characterization, base) {

## 1) SABRENet and BEEZ have a bit different info in "Site Data" and "Event Data"
## 2) In SABRENet -- "Sample Data" column 'Site' links appropriately to "Site Data"
## In BEEZ -- "Sample Data" column 'Site' is actually a link to "Event Data"

## X - Site Data (Site) <<- Site Characterization (id)
## Y - Event Data (Site Data) <<- X (id)

event_data %<>% dplyr::select(
base_id, id, `Date arrived`, `Date depart`
)

site_data %<>% dplyr::select(
base_id, id, Site_ID, Site, Event, `Sample Data`
) %>% dplyr::mutate(
`Site` = lapply(`Site`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Event` = lapply(`Event`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
)

site_characterization %<>% dplyr::select(
base_id, id, `Site name`
)

site_data.s <- site_data %>% dplyr::filter(base_id == base[[1]])
site_data.b <- site_data %>% dplyr::filter(base_id == base[[2]])

event_data.s <- event_data %>% dplyr::filter(base_id == base[[1]])
event_data.b <- event_data %>% dplyr::filter(base_id == base[[2]])

site_characterization.s <- site_characterization %>% dplyr::filter(base_id == base[[1]])
site_characterization.b <- site_characterization %>% dplyr::filter(base_id == base[[2]])

## X - Site Data (Site) <<- Site Characterization (id)
## Y - Event Data (Site Data) <<- X (id)

joined.s <- dplyr::left_join(
site_data.s
, site_characterization.s %>% dplyr::rename(Site = id)
, by = c("base_id", "Site")
) %>% dplyr::left_join(
.
, event_data.s %>% dplyr::rename(Event = id)
, by = c("base_id", "Event")
) %>% dplyr::select(
base_id, id, Site_ID, `Site name`, `Date arrived`, `Date depart`
)

joined.b <- dplyr::left_join(
site_data.b
, site_characterization.b %>% dplyr::rename(Site = id)
, by = c("base_id", "Site")
) %>% dplyr::left_join(
.
, event_data.b %>% dplyr::rename(Event = id)
, by = c("base_id", "Event")
) %>% dplyr::select(
base_id, Event, Site_ID, `Site name`, `Date arrived`, `Date depart`
) %>% dplyr::rename(id = Event)

return(
rbind(joined.s, joined.b) %>% dplyr::rename(Site = id)
)

}
192 changes: 64 additions & 128 deletions R/summarize_virus_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,137 +2,70 @@
#'
#' @param bat_data Data frame. Sample data
#' @param site_data Data frame. Site info
#' @param species_taxonomy Data frame. Species taxonomy (as named on airtable)
#' @param specimen_data Data frame. Specimen data
#' @param sequence_data Data frame. Sequences (all the already cleaned ones in the Airtable table "Sequences")
#' @param base_ids list of two Airtable base IDs
#'
#' @return List of two data frames (specimen data and sample data)
#' @export
summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_data, base_ids) {
summarize_virus_data <- function(bat_data, site_data, species_taxonomy, specimen_data, sequence_data, base_ids) {

`%notin%` <- Negate(`%in%`)

## unlist, respecting NULL entries for left_joining further down
specimen_data %<>%
dplyr::mutate(
`Sample Type` = lapply(`Sample Type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Laboratory ID` = lapply(`Laboratory ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
## NOTE: Need to get at the root of this problem, I do not know why just Corona Year Tested is a list and other
## "year tested" columns are not
, `Corona Year tested` = lapply(`Corona Year tested`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
)
`Sample Type` = lapply(`Sample Type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Laboratory ID` = lapply(`Laboratory ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
## NOTE: Need to get at the root of this problem, I do not know why just Corona Year Tested is a list and other
## "year tested" columns are not
, `Corona Year tested` = lapply(`Corona Year tested`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
)

## Due to somewhat different database structures regarding how visit information is spread between Site Data and Event Data,
## have to, at least for now, split up and do something a bit different to parse each and stick back together
## !! Would be really nice to change the database structure to match, but it seems that this either difficult or low priority
site_data.s <- site_data %>% dplyr::filter(base_id == base_ids[[1]])
site_data.b <- site_data %>% dplyr::filter(base_id == base_ids[[2]])
event_data.s <- event_data %>% dplyr::filter(base_id == base_ids[[1]])
event_data.b <- event_data %>% dplyr::filter(base_id == base_ids[[2]])

bat_data.s <- bat_data %>%
dplyr::filter(base_id == base_ids[[1]]) %>%
## drop the lists for species (while retaining nulls) and site data
dplyr::mutate(
`Common name` = lapply(`Common name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Scientific name` = lapply(`Scientific name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Animal type` = lapply(`Animal type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Species in Environment` = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Common name (Environmental)` = lapply(`Common name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Scientific name (Environmental)` = lapply(`Scientific name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Animal type (Environmental)` = lapply(`Animal type (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, site_data = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
dplyr::left_join(., site_data.s %>% dplyr::select(-createdTime) %>% dplyr::rename(site_data = id), by = c("site_data", "base_id")) %>%
bat_data %<>%
dplyr::select(-createdTime) %>%
dplyr::mutate(
Site_Name = apply(Site_ID %>% matrix(), 1, FUN = function(x) strsplit(x, "_")[[1]][1])
, Year = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
, Month = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
, Day = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
, Date = as.Date(Date)
, DOY = lubridate::yday(Date)
`Working Species ID` = lapply(`Working Species ID`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Species in Environment` = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, site_data = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, spec_ID = ifelse(!is.na(`Working Species ID`), `Working Species ID`, `Species in Environment`)
) %>%
dplyr::select(
## Critical species-level stuff
id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name, `Specimen type`
, `Animal type`, `Common name`, `Scientific name`
, `Animal type (Environmental)`, `Common name (Environmental)`, `Scientific name (Environmental)`
, `Species in Environment`, Recapture, `Unique Individual`
## Individual-level characteristics
, Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
## Specimen Data for joining
## add species info
dplyr::left_join(
.
, species_taxonomy %>% rename(spec_ID = id) %>% dplyr::select(-createdTime)
, by = c("base_id", "spec_ID")
) %>%
dplyr::rename(`Laboratory ID` = id) %>%
dplyr::select(-`Working Species ID`, -`Species in Environment`, -spec_ID) %>%
dplyr::left_join(., site_data %>% dplyr::rename(site_data = Site), by = c("site_data", "base_id")) %>%
dplyr::select(-c(`Site Data`, site_data)) %>%
dplyr::mutate(visit_duration = (as.Date(`Date depart`) - as.Date(`Date arrived`)) %>% as.numeric()) %>%
dplyr::rename(Date = `Date arrived`, Site_Name = `Site name`, `Scientific name` = `Current classification`) %>%
dplyr::mutate(
## May have caught all of these by now, but doesnt hurt to double check
Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
, Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
)

event_data.b %<>%
dplyr::mutate(`Site Data` = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
dplyr::left_join(site_data.b %>% dplyr::select(-createdTime) %>% dplyr::rename(`Site Data` = id)) %>%
dplyr::select(-createdTime, -`Site Data`) %>%
dplyr::rename(site_data = id)

bat_data.b <- bat_data %>%
dplyr::filter(base_id == base_ids[[2]]) %>%
## drop the lists for species (while retaining nulls) and site data
dplyr::mutate(
`Common name` = lapply(`Common name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Scientific name` = lapply(`Scientific name`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Animal type` = lapply(`Animal type`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Species in Environment` = lapply(`Species in Environment`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Common name (Environmental)` = lapply(`Common name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Scientific name (Environmental)` = lapply(`Scientific name (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, `Animal type (Environmental)` = lapply(`Animal type (Environmental)`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()
, site_data = lapply(`Site Data`, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
dplyr::left_join(., event_data.b, by = c("site_data", "base_id")) %>%
dplyr::mutate(
Year = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
, Month = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
, Day = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
, Date = as.Date(Date)
, DOY = lubridate::yday(Date)
, Year = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][1]) %>% as.numeric()
, Month = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][2]) %>% as.numeric()
, Day = sapply(Date, FUN = function(x) strsplit(x, "-")[[1]][3]) %>% as.numeric()
, Date = as.Date(Date)
, DOY = lubridate::yday(Date)
) %>%
dplyr::rename(Site_Name = Site_ID) %>%
dplyr::select(
## Critical species-level stuff
id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name, `Specimen type`
, `Animal type`, `Common name`, `Scientific name`
, `Animal type (Environmental)`, `Common name (Environmental)`, `Scientific name (Environmental)`
, `Species in Environment`, Recapture, `Unique Individual`
## Individual-level characteristics
, Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
## Specimen Data for joining
## Critical species-level stuff
id, Laboratory_ID, Year, Month, DOY, Day, Date, Site_Name
, `Animal Type`, `Common name`, `Scientific name`, Family
, Recapture, `Unique Individual`
## Individual-level characteristics
, Sex, Age, `Reproductive status`, `Nipple condition`, `Forearm length (mm)`, `Final mass (g)`
## Keep Laboratory ID around to identify environmental samples, but also need Laboratory ID to match with
## Specimen Data for joining
) %>%
dplyr::rename(`Laboratory ID` = id) %>%
dplyr::mutate(
Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
, Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
)

bat_data <- rbind(bat_data.s, bat_data.b)

## Combine individual and environmental samples in long form
bat_data %<>% dplyr::mutate(
`Scientific name` = ifelse(is.na(`Scientific name`) & !is.na(`Scientific name (Environmental)`)
, `Scientific name (Environmental)`
, `Scientific name`)
, `Common name` = ifelse(is.na(`Common name`) & !is.na(`Common name (Environmental)`)
, `Common name (Environmental)`
, `Common name`)
, `Animal type` = ifelse(is.na(`Animal type`) & !is.na(`Animal type (Environmental)`)
, `Animal type (Environmental)`
, `Animal type`)
, `Specimen type` = ifelse(is.na(`Specimen type`) & !is.na(`Animal type`)
, `Animal type`
, `Specimen type`)
)

## Identify environmental samples
bat_data %<>%
## May have caught all of these by now, but doesnt hurt to double check
Age = plyr::mapvalues(Age, from = "Adult ", to = "Adult")
, Sex = plyr::mapvalues(Sex, from = c("Male ", "Female "), to = c("Male", "Female"))
) %>%
## Identify environmental samples and a bit other processing
dplyr::mutate(Recapture = ifelse(is.na(Recapture), FALSE, Recapture)) %>%
dplyr::mutate(index = seq(dplyr::n())) %>% dplyr::group_by(index) %>%
dplyr::mutate(env = ifelse(grep("E", Laboratory_ID) %>% length() > 0, 1, 0)) %>%
Expand All @@ -142,11 +75,11 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
## pull just some columns for visuals
specimen_data %<>%
dplyr::select(
`Specimen ID`, `Sample Type`, `Laboratory ID`
, tidyselect::all_of(tidyselect::ends_with("Outcome"))
, tidyselect::all_of(tidyselect::ends_with("Year tested"))
) %>%
dplyr::left_join(., bat_data)
`Specimen ID`, `Sample Type`, `Laboratory ID`
, tidyselect::all_of(tidyselect::ends_with("Outcome"))
, tidyselect::all_of(tidyselect::ends_with("Year tested"))
) %>%
dplyr::left_join(., bat_data)

## ! To resolve when the issue with multiple wells is resolved
specimen_data <- specimen_data[!duplicated(specimen_data), ]
Expand All @@ -155,22 +88,25 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
outcomes <- specimen_data %>%
tidyr::pivot_longer(
.
, c(tidyselect::all_of(tidyselect::ends_with("Outcome")))
, names_to = "Virus", values_to = "Result") %>%
, c(tidyselect::all_of(tidyselect::ends_with("Outcome")))
, names_to = "Virus", values_to = "Result") %>%
dplyr::select(-tidyselect::all_of(tidyselect::ends_with("Year tested"))) %>%
dplyr::mutate(
Virus = apply(Virus %>% matrix(), 1, FUN = function(x) strsplit(x, " Outcome")[[1]][1])
, Virus = plyr::mapvalues(Virus, from = "Filo Real-time", to = "Filo")
, Virus = plyr::mapvalues(Virus, from = "Filo Real-time", to = "Filo")
) %>%
dplyr::filter(!is.na(Result)) %>%
dplyr:: mutate(Result_n = ifelse(((Result %notin% c("Negative", "Inconclusive", "Excluded from testing")) & !is.na(Result)), 1, 0)) %>%
dplyr:: mutate(Result_n = ifelse(((Result %notin% c(
"Negative", "Inconclusive", "Excluded from testing"
, "Requires cloning", "Unknown"
)) & !is.na(Result)), 1, 0)) %>%
dplyr::filter(Result != "Excluded from testing")

yrs_tested <- specimen_data %>%
tidyr::pivot_longer(
.
, c(tidyselect::all_of(tidyselect::ends_with("Year tested")))
, names_to = "Virus", values_to = "Year Tested") %>%
, c(tidyselect::all_of(tidyselect::ends_with("Year tested")))
, names_to = "Virus", values_to = "Year Tested") %>%
dplyr::select(-tidyselect::all_of(tidyselect::ends_with("Outcome"))) %>%
dplyr::mutate(Virus = apply(Virus %>% matrix(), 1, FUN = function(x) strsplit(x, " Year tested")[[1]][1])) %>%
dplyr::filter(!is.na(`Year Tested`))
Expand All @@ -182,8 +118,8 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
## If any exist, if not, ignored
Virus = plyr::mapvalues(
Virus
, from = c("Filo Real-time", "PAR", "Influ")
, to = c("Filo", "Paramyxo", "Flu")
, from = c("Filo Real-time", "PAR", "Influ")
, to = c("Filo", "Paramyxo", "Flu")
)
) %>% dplyr::rename(Species = `Scientific name`) %>%
dplyr::mutate(index = seq(dplyr::n())) %>% dplyr::group_by(index) %>%
Expand All @@ -193,17 +129,17 @@ summarize_virus_data <- function(bat_data, site_data, specimen_data, sequence_da
## Add sequence data (cleaned from tables "Sequences" from both bases)
specimen_data %<>%
dplyr::left_join(
.
, sequence_data %>%
dplyr::select(Virus, Specimen_ID, Sequence) %>%
dplyr::mutate(Specimen_ID = lapply(Specimen_ID, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
dplyr::rename(`Specimen ID` = Specimen_ID)
.
, sequence_data %>%
dplyr::select(Virus, Specimen_ID, Sequence) %>%
dplyr::mutate(Specimen_ID = lapply(Specimen_ID, FUN = function(x) ifelse(is.null(x), NA, x)) %>% unlist()) %>%
dplyr::rename(`Specimen ID` = Specimen_ID)
)

return(
list(
specimen_data = specimen_data
, sample_data = bat_data
, sample_data = bat_data
)
)

Expand Down
23 changes: 23 additions & 0 deletions man/summarize_event_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/summarize_virus_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit fd2e583

Please sign in to comment.