weecology · diazrenata · May 10, 2019 · May 10, 2019 · May 10, 2019 · May 12, 2019
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,11 +3,15 @@
 export("%>%")
 export(analysis_wrapper)
 export(build_analyses_plan)
+export(build_bbs_datasets_plan)
 export(build_datasets_plan)
 export(check_data_format)
 export(check_default_data_path)
 export(collect_analyses)
-export(get_bbs_data)
+export(combine_subspecies)
+export(filter_bbs_species)
+export(filter_ts)
+export(get_bbs_route_region_data)
 export(get_cowley_lizards)
 export(get_cowley_snakes)
 export(get_default_data_path)
@@ -22,6 +26,7 @@ export(get_sgs_data)
 export(import_retriever_data)
 export(install_retriever_data)
 export(interpolate_obs)
+export(prepare_bbs_ts_data)
 export(richness)
 export(summarise_effort)
 export(summarise_obs)

diff --git a/R/bbs_cleaning_functions.R b/R/bbs_cleaning_functions.R
@@ -0,0 +1,69 @@
+#' Filter poorly sampled BBS species
+#' 
+#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
+#'
+#' Removes waterbirds, shorebirds, owls, kingfishers, knightjars,
+#' dippers. These species are poorly sampled due to their aquatic or
+#' noctural nature. Also removes taxa that were either partially unidentified
+#' (e.g. "sp.") or were considered hybrids (e.g. "A x B") or were listed as more
+#' than one species (e.g. "A / B")
+#'
+#' @param df dataframe containing an species_id column
+#'
+#' @return dataframe, filtered version of initial dataframe
+#' @export
+
+filter_bbs_species <- function(df, species_table){
+
+    is_unidentified = function(names) {
+        #Before filtering, account for this one hybrid of 2 subspecies so it's kept
+        names[names=='auratus auratus x auratus cafer']='auratus auratus'
+        grepl('sp\\.| x |\\/', names)
+    }
+
+    valid_taxa = species_table %>%
+        dplyr:: filter(!is_unidentified(species)) %>%
+        dplyr::filter(aou > 2880) %>%
+        dplyr::filter(aou < 3650 | aou > 3810) %>%
+        dplyr::filter(aou < 3900 | aou > 3910) %>%
+        dplyr::filter(aou < 4160 | aou > 4210) %>%
+        dplyr::filter(aou != 7010)
+
+    dplyr::filter(df, species_id %in% valid_taxa$aou)
+}
+
+#' Combine subspecies into their common species
+#' 
+#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R 
+#'
+#' @export
+combine_subspecies = function(df, species_table){
+
+    # Subspecies have two spaces separated by non-spaces
+    subspecies_names = species_table %>%
+        dplyr::filter(aou %in% unique(df$species_id)) %>%
+        dplyr::pull(spanish_common_name) %>%
+        grep(" [^ ]+ ", ., value = TRUE)
+
+    subspecies_ids = species_table %>%
+        dplyr::filter(spanish_common_name %in% subspecies_names) %>%
+        dplyr::pull(aou)
+
+    # Drop all but the first two words to get the root species name,
+    # then find the AOU code
+    new_subspecies_ids = species_table %>%
+        dplyr::slice(match(stringr::word(subspecies_names, 1,2),
+                           species_table$spanish_common_name)) %>%
+        dplyr::pull(aou)
+
+    # replace the full subspecies names with species-level names
+    for (i in seq_along(subspecies_ids)) {
+        df$species_id[df$species_id == subspecies_ids[i]] = new_subspecies_ids[i]
+    }
+
+    df_grouped <- df %>%
+        dplyr::group_by_at(dplyr::vars(-abundance)) %>%
+        dplyr::summarise(abundance = sum(abundance)) %>%
+        dplyr::ungroup() %>%
+        dplyr::distinct()
+}
diff --git a/R/build_plans.R b/R/build_plans.R
@@ -64,14 +64,16 @@ build_analyses_plan <- function(methods, datasets, ...)
 #' 
 #' @param data_path where to get the downloaded retriever datasets
 #' @param include_downloaded_data whether to also include downloadable datasets
+#' @param include_bbs_data whether to include BBS data
 #' 
 #' @return a drake plan (i.e. a tibble) specifying the targets and commands 
 #'   for gathering datasets
 #' 
 #' @export
 #' 
 build_datasets_plan <- function(data_path = get_default_data_path(), 
-                                include_downloaded_data = FALSE)
+                                include_downloaded_data = FALSE,
+                                include_bbs_data = FALSE)
 {
     datasets <- drake::drake_plan(
         maizuru_data = get_maizuru_data(),
@@ -88,11 +90,47 @@ build_datasets_plan <- function(data_path = get_default_data_path(),
             dplyr::bind_rows(
                 drake::drake_plan(
                     portal_data = get_portal_rodents(),
-                    bbs_data = get_bbs_data(region = 7, path = !!data_path),
                     sdl_data = get_sdl_data(path = !!data_path),
                     mtquad_data = get_mtquad_data(path = !!data_path)
                 )
             )
     }
+
+    if (include_bbs_data) {
+        bbs_ts_data = get_bbs_ts_data()
+        bbs_datasets = build_bbs_datasets_plan(bbs_ts_data)
+        datasets <- datasets %>%
+            dplyr::bind_rows(bbs_datasets)
+    }
+
     return(datasets)
 }
+
+#' @title Generate a Drake Plan for BBS Datasets
+#' 
+#' @param data_path path
+#' @param from_raw whether to re-prep BBS data
+#' 
+#' @return a drake plan (i.e. a tibble) specifying the targets and commands 
+#'   for gathering BBS datasets
+#' 
+#' @export
+#' 
+build_bbs_datasets_plan <- function(path = get_default_data_path(), from_raw = FALSE)
+{
+    # if(from_raw) {
+    #     prepare_bbs_ts_data()
+    # } 
+    # load(paste0(path, '/breed-bird-survey-prepped/bbs_ts_data.Rds'))
+    # 
+    bbs_ts_data = prepare_bbs_ts_data()
+
+    bbs_datasets <- drake::drake_plan(
+        bbs_data_rtrg = target(get_bbs_route_region_data(route, region, bbs_ts_data),
+                               transform = map(route = !!rlang::syms(bbs_ts_data$routes_and_regions$route),
+                                               region = !!rlang::syms(bbs_ts_data$routes_and_regions$bcr)
+                               )
+        )
+    )
+    return(bbs_datasets)
+}
diff --git a/R/get_retriever_data.R b/R/get_retriever_data.R
@@ -1,27 +1,19 @@
-#' @title Create BBS population time-series data
+#' @title Prepare BBS population time-series data
 #'
 #' @description Modified from https://github.com/weecology/bbs-forecasting
 #' and https://github.com/weecology/MATSS-community-change 
 #' Selects sites with data spanning start_yr through end_yr containing at least min_num_yrs of data
 #' samples during that period.
-#'
 #' @param start_yr num first year of time-series
 #' @param end_yr num last year of time-series
 #' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
-#' @param region region code of data to return (currently uses state codes)
 #' @inheritParams get_mtquad_data
 #'
-#' @return list of two dataframes (one with abundance data, the other with covariate data) 
-#'   and one list of metadata.
-#' 
-#' @examples
-#' \dontrun{
-#'   get_bbs_data(region=7)
-#' }
+#' @return list of three dataframes (one with bbs data filtered to time series that meet the criteria, one with the BBS species table, and one with the routes and regions represented in the first dataframe).
 #' @export
 
-get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, region, 
-                         path = get_default_data_path())
+prepare_bbs_ts_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10,
+                            path = get_default_data_path())
 {
     bbs_data_tables <- import_retriever_data("breed-bird-survey", path = path)
 
@@ -31,7 +23,6 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
                          by = c('statenum', 'route', 'rpid', 'year', 'routedataid', 'countrynum')) %>%
         dplyr::left_join(bbs_data_tables$breed_bird_survey_routes, 
                          by = c('statenum', 'route', 'countrynum')) %>%
-        dplyr::filter(bcr == region) %>%
         dplyr::mutate(site_id = statenum*1000 + route, 
                       starttemp = dplyr::case_when(tempscale=='F' ~ c((starttemp - 32)*5/9),
                                                    tempscale=='C' ~ as.double(starttemp)),
@@ -41,20 +32,57 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
                       long = longitude,
                       species_id = aou,
                       abundance = speciestotal) %>%
+        MATSS::filter_ts(start_yr, end_yr, min_num_yrs)
+
+
+    bbs_routes_regions <- bbs_data %>%
+        dplyr::select(bcr, route) %>%
+        dplyr::distinct() %>%
+        dplyr::mutate(bcr = as.character(bcr), route = as.character(route)) %>%
+        dplyr::mutate(name = paste0("bbs_bcr", bcr, "_route", route))
+
+    bbs_ts_data = list(bbs_data = bbs_data, species_table = bbs_data_tables$breed_bird_survey_species, routes_and_regions = bbs_routes_regions)
+
+    # if(!dir.exists(paste0(path, '/breed-bird-survey-prepped'))) {
+    #     dir.create(paste0(path, '/breed-bird-survey-prepped'))
+    # }
+
+   # save(bbs_ts_data, file = paste0(path, '/breed-bird-survey-prepped/bbs_ts_data.Rds'))
+
+    return(bbs_ts_data)
+}
+
+
+#' Get BBS data by route and reigon
+#'
+#' @param route route
+#' @param region region
+#' @param bbs_ts_data list of three dataframes (output of get_bbs_ts_data)
+#' @return list of two dataframes (one with abundance data, the other with covariate data) 
+#'   and one list of metadata.
+#' @export
+get_bbs_route_region_data <- function(route, region, bbs_ts_data) {
+
+    route = as.numeric(route)
+    region = as.numeric(region)
+
+    this_bbs_data <- bbs_ts_data$bbs_data %>%
+        dplyr::filter(bcr == region, route == route) %>%
+        combine_subspecies(species_table = bbs_ts_data$species_table) %>%
+        filter_bbs_species(species_table = bbs_ts_data$species_table) %>%
         dplyr::mutate(species_id = paste('sp', species_id, sep=''),
                       date = as.Date(paste(year, month, day, sep = "-"))) %>%
-        filter_ts(start_yr, end_yr, min_num_yrs) %>%
         dplyr::ungroup() 
 
-    abundance <- bbs_data %>%
+    abundance <- this_bbs_data %>%
         dplyr::group_by(year, species_id) %>%
         dplyr::summarise(abundance = sum(abundance)) %>%
         dplyr::ungroup() %>%
         tidyr::spread(key = species_id, value = abundance, fill = 0) %>%
         dplyr::arrange(year) %>%
         dplyr::select(-year)
 
-    covariates <- bbs_data %>%
+    covariates <- this_bbs_data %>%
         dplyr::group_by(year) %>%
         dplyr::summarise(effort = dplyr::n_distinct(site_id),
                          starttemp = mean(starttemp), endtemp = mean(endtemp),
@@ -63,11 +91,13 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
                          lat = mean(lat), long = mean(long), mean_date = mean(date)) %>%
         dplyr::arrange(year)
 
-    metadata <- list(timename = 'year', effort = 'effort')
+    metadata <- list(timename = 'year', effort = 'effort', route = this_route, region = this_bcr)
 
     return(list('abundance' = abundance, 'covariates' = covariates, 'metadata' = metadata))
+
 }
 
+
 #' @title Filter BBS to specified time series period and number of samples
 #'
 #' @description Modified from https://github.com/weecology/bbs-forecasting 
@@ -79,6 +109,7 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
 #' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
 #'
 #' @return dataframe with original data and associated environmental data
+#' @export
 filter_ts <- function(bbs_data, start_yr, end_yr, min_num_yrs) {
     sites_to_keep = bbs_data %>%
         dplyr::filter(year >= start_yr, year <= end_yr) %>%

diff --git a/man/build_bbs_datasets_plan.Rd b/man/build_bbs_datasets_plan.Rd
diff --git a/man/build_datasets_plan.Rd b/man/build_datasets_plan.Rd
diff --git a/man/combine_subspecies.Rd b/man/combine_subspecies.Rd
diff --git a/man/filter_bbs_species.Rd b/man/filter_bbs_species.Rd
diff --git a/man/get_bbs_route_region_data.Rd b/man/get_bbs_route_region_data.Rd