Merge pull request #127 from weecology/by-route

Import BBS data by route
weecology · May 28, 2019 · 9c89304 · 9c89304
2 parents 41a2f11 + 4ff431e
commit 9c89304
Show file tree

Hide file tree

Showing 19 changed files with 618 additions and 250 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,7 @@
 .Rhistory
 .RData
 .Ruserdata
-/.drake/
+.drake/
 /output/drake-cache.sqlite
 /analysis/*.html
 /data/

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,11 +3,15 @@
 export("%>%")
 export(analysis_wrapper)
 export(build_analyses_plan)
+export(build_bbs_datasets_plan)
 export(build_datasets_plan)
 export(check_data_format)
 export(check_default_data_path)
 export(collect_analyses)
-export(get_bbs_data)
+export(combine_bbs_subspecies)
+export(filter_bbs_species)
+export(filter_bbs_ts)
+export(get_bbs_route_region_data)
 export(get_cowley_lizards)
 export(get_cowley_snakes)
 export(get_default_data_path)
@@ -22,6 +26,8 @@ export(get_sgs_data)
 export(import_retriever_data)
 export(install_retriever_data)
 export(interpolate_obs)
+export(prepare_bbs_ts_data)
+export(process_bbs_route_region_data)
 export(richness)
 export(summarise_effort)
 export(summarise_obs)
@@ -34,21 +40,5 @@ export(ts_summary)
 export(ts_summary_drake)
 export(uni_ts_summary)
 export(use_default_data_path)
-importFrom(graphics,axis)
-importFrom(graphics,mtext)
-importFrom(graphics,par)
-importFrom(graphics,plot)
-importFrom(graphics,points)
-importFrom(graphics,rect)
-importFrom(graphics,text)
 importFrom(magrittr,"%>%")
-importFrom(stats,D)
-importFrom(stats,acf)
-importFrom(stats,cor)
-importFrom(stats,median)
-importFrom(stats,na.omit)
-importFrom(stats,sd)
-importFrom(stats,setNames)
-importFrom(utils,data)
-importFrom(utils,read.csv)
 importFrom(utils,read.delim)
diff --git a/R/bbs_cleaning_functions.R b/R/bbs_cleaning_functions.R
@@ -0,0 +1,210 @@
+#' @title Prepare BBS population time-series data
+#' @description Modified from https://github.com/weecology/bbs-forecasting
+#'   and https://github.com/weecology/MATSS-community-change 
+#'   Selects sites with data spanning start_yr through end_yr containing at 
+#'   least min_num_yrs of data samples during that period. Cleans data tables 
+#'   and stores each individual route as a .Rds file. Saves a data table of the 
+#'   route + region pairs. 
+#' @param start_yr num first year of time-series
+#' @param end_yr num last year of time-series
+#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
+#' @param bbs_subset optional, a subset of the BBS communities to use 
+#'   (to speed up development). As c(1:X)
+#' @inheritParams get_mtquad_data
+#' @return NULL
+#' @export
+
+prepare_bbs_ts_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10,
+                                path = get_default_data_path(), bbs_subset = NULL) 
+{
+    bbs_data_tables <- import_retriever_data("breed-bird-survey", path = path)
+
+    bbs_data <- bbs_data_tables$breed_bird_survey_weather %>%
+        dplyr::filter(runtype == 1, rpid == 101) %>%
+        dplyr::left_join(bbs_data_tables$breed_bird_survey_counts,
+                         by = c('statenum', 'route', 'rpid', 'year', 'routedataid', 'countrynum')) %>%
+        dplyr::left_join(bbs_data_tables$breed_bird_survey_routes, 
+                         by = c('statenum', 'route', 'countrynum')) %>%
+        dplyr::mutate(site_id = statenum*1000 + route, 
+                      starttemp = dplyr::case_when(tempscale=='F' ~ c((starttemp - 32)*5/9),
+                                                   tempscale=='C' ~ as.double(starttemp)),
+                      endtemp = dplyr::case_when(tempscale=='F' ~ c((endtemp - 32)*5/9),
+                                                 tempscale=='C' ~ as.double(endtemp))) %>%
+        dplyr::rename(lat = latitude,
+                      long = longitude,
+                      species_id = aou,
+                      abundance = speciestotal) %>%
+        filter_bbs_ts(start_yr, end_yr, min_num_yrs)
+
+    # prepare and write out route and region metadata
+    bbs_routes_regions <- bbs_data %>%
+        dplyr::select(bcr, route) %>%
+        dplyr::distinct() %>%
+        dplyr::mutate(name = paste0("bbs_bcr", bcr, "_route", route))
+
+    storage_path <- file.path(path, 'breed-bird-survey-prepped')
+    if (!dir.exists(storage_path)) {
+        dir.create(storage_path)
+    }
+
+    write.csv(bbs_routes_regions, file.path(storage_path, "routes_and_regions_table.csv"), row.names = F)
+
+    # filter and process selected route and region combinations
+    if (!is.null(bbs_subset)) {
+        bbs_routes_regions <- bbs_routes_regions[bbs_subset, ]
+    }
+
+    bbs_routes_regions %>%
+        dplyr::select(bcr, route) %>%
+        purrr::pmap(function(bcr, route) {
+            bbs_data %>%
+                dplyr::filter(bcr == !!bcr,
+                              route == !!route) %>%
+                process_bbs_route_region_data(species_table = bbs_data_tables$breed_bird_survey_species) %>%
+                saveRDS(file = file.path(storage_path, paste0("route", route, "region", bcr, ".Rds")) )
+        })
+}
+
+#' @title Process the BBS data for an individual route and region
+#' @description Correct and otherwise filter BBS species data (see 
+#'   \code{\link{combine_subspecies}} and \code{\link{filter_bbs_species}} for 
+#'   more info). Generate the abundance, covariate, and metadata tables and 
+#'   return the combined object.
+#' @param bbs_data_table main bbs data table
+#' @param species_table table of species for BBS
+#' @return the processed BBS data
+#' @export
+process_bbs_route_region_data <- function(bbs_data_table, species_table)
+{
+    # check that exactly one route and one region are represented in the data
+    route <- unique(bbs_data_table$route)
+    region <- unique(bbs_data_table$bcr)
+    stopifnot(length(route) == 1 && 
+                  length(region == 1))
+
+    # process species IDs
+    this_bbs_data <- bbs_data_table %>%
+        combine_bbs_subspecies(species_table = species_table) %>%
+        filter_bbs_species(species_table = species_table) %>%
+        dplyr::mutate(species_id = paste('sp', species_id, sep=''),
+                      date = as.Date(paste(year, month, day, sep = "-"))) %>%
+        dplyr::ungroup() 
+
+    abundance <- this_bbs_data %>%
+        dplyr::group_by(year, species_id) %>%
+        dplyr::summarise(abundance = sum(abundance)) %>%
+        dplyr::ungroup() %>%
+        tidyr::spread(key = species_id, value = abundance, fill = 0) %>%
+        dplyr::arrange(year) %>%
+        dplyr::select(-year)
+
+    covariates <- this_bbs_data %>%
+        dplyr::group_by(year) %>%
+        dplyr::summarise(effort = dplyr::n_distinct(site_id),
+                         starttemp = mean(starttemp), endtemp = mean(endtemp),
+                         startwind = mean(startwind), endwind = mean(endwind),
+                         startsky = mean(startsky), endsky = mean(endsky),
+                         lat = mean(lat), long = mean(long), mean_date = mean(date)) %>%
+        dplyr::arrange(year)
+
+    metadata <- list(timename = 'year', effort = 'effort', route = route, region = region)
+
+    return(list('abundance' = abundance, 'covariates' = covariates, 'metadata' = metadata))
+}
+
+
+#' @title Filter BBS to specified time series period and number of samples
+#'
+#' @description Modified from https://github.com/weecology/bbs-forecasting 
+#' and https://github.com/weecology/MATSS-community-change
+#'
+#' @param bbs_data dataframe that contains BBS site_id and year columns
+#' @param start_yr num first year of time-series
+#' @param end_yr num last year of time-series
+#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
+#'
+#' @return dataframe with original data and associated environmental data
+#' @export
+filter_bbs_ts <- function(bbs_data, start_yr, end_yr, min_num_yrs) {
+    sites_to_keep = bbs_data %>%
+        dplyr::filter(year >= start_yr, year <= end_yr) %>%
+        dplyr::group_by(site_id) %>%
+        dplyr::summarise(num_years = length(unique(year))) %>%
+        dplyr::ungroup() %>%
+        dplyr::filter(num_years >= min_num_yrs)
+
+    filtered_data <- bbs_data %>%
+        dplyr::filter(year >= start_yr, year <= end_yr) %>%
+        dplyr::filter(site_id %in% sites_to_keep$site_id)
+}
+
+
+#' Filter poorly sampled BBS species
+#' 
+#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
+#'
+#' Removes waterbirds, shorebirds, owls, kingfishers, knightjars,
+#' dippers. These species are poorly sampled due to their aquatic or
+#' noctural nature. Also removes taxa that were either partially unidentified
+#' (e.g. "sp.") or were considered hybrids (e.g. "A x B") or were listed as more
+#' than one species (e.g. "A / B")
+#'
+#' @param df dataframe containing an species_id column
+#'
+#' @return dataframe, filtered version of initial dataframe
+#' @export
+
+filter_bbs_species <- function(df, species_table){
+
+    is_unidentified = function(names) {
+        #Before filtering, account for this one hybrid of 2 subspecies so it's kept
+        names[names == 'auratus auratus x auratus cafer'] = 'auratus auratus'
+        grepl('sp\\.| x |\\/', names)
+    }
+
+    valid_taxa = species_table %>%
+        dplyr:: filter(!is_unidentified(species)) %>%
+        dplyr::filter(aou > 2880) %>%
+        dplyr::filter(aou < 3650 | aou > 3810) %>%
+        dplyr::filter(aou < 3900 | aou > 3910) %>%
+        dplyr::filter(aou < 4160 | aou > 4210) %>%
+        dplyr::filter(aou != 7010)
+
+    dplyr::filter(df, species_id %in% valid_taxa$aou)
+}
+
+#' Combine subspecies into their common species
+#' 
+#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R 
+#'
+#' @export
+combine_bbs_subspecies = function(df, species_table){
+
+    # Subspecies have two spaces separated by non-spaces
+    subspecies_names = species_table %>%
+        dplyr::filter(aou %in% unique(df$species_id)) %>%
+        dplyr::pull(spanish_common_name) %>%
+        grep(" [^ ]+ ", ., value = TRUE)
+
+    subspecies_ids = species_table %>%
+        dplyr::filter(spanish_common_name %in% subspecies_names) %>%
+        dplyr::pull(aou)
+
+    # Drop all but the first two words to get the root species name,
+    # then find the AOU code
+    new_subspecies_ids = species_table %>%
+        dplyr::slice(match(stringr::word(subspecies_names, 1,2),
+                           species_table$spanish_common_name)) %>%
+        dplyr::pull(aou)
+
+    # replace the full subspecies names with species-level names
+    for (i in seq_along(subspecies_ids)) {
+        df$species_id[df$species_id == subspecies_ids[i]] = new_subspecies_ids[i]
+    }
+
+    df_grouped <- df %>%
+        dplyr::group_by_at(dplyr::vars(-abundance)) %>%
+        dplyr::summarise(abundance = sum(abundance)) %>%
+        dplyr::ungroup() %>%
+        dplyr::distinct()
+}
diff --git a/R/build_plans.R b/R/build_plans.R
@@ -63,15 +63,19 @@ build_analyses_plan <- function(methods, datasets, ...)
 #' @title Generate a Drake Plan for Datasets
 #' 
 #' @param data_path where to get the downloaded retriever datasets
-#' @param include_downloaded_data whether to also include downloadable datasets
+#' @param include_retriever_data whether to include retriever-downloaded data
+#' @param include_bbs_data whether to include BBS data
+#' @inheritParams build_bbs_datasets_plan
 #' 
 #' @return a drake plan (i.e. a tibble) specifying the targets and commands 
 #'   for gathering datasets
 #' 
 #' @export
 #' 
 build_datasets_plan <- function(data_path = get_default_data_path(), 
-                                include_downloaded_data = FALSE)
+                                include_retriever_data = FALSE,
+                                include_bbs_data = FALSE,
+                                bbs_subset = NULL)
 {
     datasets <- drake::drake_plan(
         maizuru_data = get_maizuru_data(),
@@ -82,17 +86,56 @@ build_datasets_plan <- function(data_path = get_default_data_path(),
         karoo_data = get_karoo_data(), 
         kruger_data = get_kruger_data()
     )
-    if (include_downloaded_data)
+
+    if (include_retriever_data)
     {
         datasets <- datasets %>%
             dplyr::bind_rows(
                 drake::drake_plan(
                     portal_data = get_portal_rodents(),
-                    bbs_data = get_bbs_data(region = 7, path = !!data_path),
                     sdl_data = get_sdl_data(path = !!data_path),
                     mtquad_data = get_mtquad_data(path = !!data_path)
                 )
             )
     }
+
+    if (include_bbs_data) {
+        bbs_datasets = build_bbs_datasets_plan(data_path = data_path, 
+                                               bbs_subset = bbs_subset)
+
+        datasets <- datasets %>%
+            dplyr::bind_rows(bbs_datasets)
+    }
+
     return(datasets)
 }
+
+#' @title Generate a Drake Plan for BBS Datasets
+#' 
+#' @inheritParams build_datasets_plan
+#' @inheritParams prepare_bbs_ts_data
+#' 
+#' @return a drake plan (i.e. a tibble) specifying the targets and commands 
+#'   for gathering BBS datasets
+#' 
+#' @export
+#' 
+build_bbs_datasets_plan <- function(data_path = get_default_data_path(), bbs_subset = NULL)
+{
+    routes_and_regions_file <- file.path(data_path, "breed-bird-survey-prepped", "routes_and_regions_table.csv")
+
+    if (!file.exists(routes_and_regions_file)) {
+        prepare_bbs_ts_data(path = data_path, bbs_subset = bbs_subset)
+    }
+
+    routes_and_regions <- read.csv(routes_and_regions_file, colClasses = "character")
+
+    bbs_datasets <- drake::drake_plan(
+        bbs_data_rtrg = target(get_bbs_route_region_data(route, region, path = !!data_path),
+                               transform = map(route = !!rlang::syms(routes_and_regions$route),
+                                               region = !!rlang::syms(routes_and_regions$bcr)
+                               )
+        )
+    )
+    return(bbs_datasets)
+}