Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import BBS data by route #127

Merged
merged 37 commits into from
May 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
512846c
Summarize bbs regions by year
gmyenni Apr 26, 2019
b69f9eb
Filter by route and add region, route to covariates table
diazrenata May 10, 2019
87172de
Clean bbs data and subset at route level
diazrenata May 10, 2019
7cb0db8
Clean up imports in NAMESPACE
diazrenata May 10, 2019
070fee7
Method for importing all BBS routes and including in drake plans
diazrenata May 12, 2019
2b82156
Adding option to prepare the BBS data once and store it in the path/d…
diazrenata May 12, 2019
1799adf
Correct typo
diazrenata May 13, 2019
f80ff6c
export bbs cleaning functions for use in MATSS-LDATS
diazrenata May 13, 2019
6a558a3
fix saving bug
diazrenata May 13, 2019
026dfe0
finish bug fix
diazrenata May 13, 2019
6c74edb
try not saving
diazrenata May 13, 2019
7fb1652
update travis
diazrenata May 13, 2019
8add2a6
fix typo
diazrenata May 13, 2019
ff9b2a3
typo
diazrenata May 13, 2019
7fd796a
Add prepare_bbs_ts_data to drake plans
diazrenata May 13, 2019
55bbce7
Remove bbs_ts_data from datasets chunk of drake plans, and stop evalu…
diazrenata May 13, 2019
65f6b2a
git status
diazrenata May 13, 2019
497e3c6
Saving bbs tables in data directory and accessing them as needed, ins…
diazrenata May 13, 2019
a39e54c
fix typo
diazrenata May 13, 2019
0f49c86
Adjust & add tests for BBS plans
diazrenata May 13, 2019
6cc5db8
skip bbs plan tests for now
diazrenata May 13, 2019
bdc0c8b
resolve travis.yml conflict
diazrenata May 13, 2019
635275a
Remove test for get_bbs_data (removed)
diazrenata May 13, 2019
e02c118
Rearrange BBS functions
diazrenata May 27, 2019
3c1be4a
Add option to work with a subset of BBS data and fix passing the path…
diazrenata May 27, 2019
ac3f45e
Speed up prepare_bbs_ts_data'
diazrenata May 27, 2019
58b9b7a
update subset_bbs_route_region_data function
ha0ye May 28, 2019
2601532
simplify looping paradigm for processing bbs route and regions
ha0ye May 28, 2019
93782a9
return data from process_bbs function
ha0ye May 28, 2019
9f996c8
make function arguments consistent
ha0ye May 28, 2019
e773af7
fix bbs-data-test
ha0ye May 28, 2019
b962eb6
fix plan test
ha0ye May 28, 2019
025927a
include test for bbs dataset plan
ha0ye May 28, 2019
319e934
clean drake cache from vignettes folder and update .gitignore
ha0ye May 28, 2019
710b09c
oops, fix typo
ha0ye May 28, 2019
0bd95f5
correct documentation on process_bbs_route_region_data
ha0ye May 28, 2019
4ff431e
Merge pull request #130 from weecology/bbs-by-route-patch
diazrenata May 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
.Rhistory
.RData
.Ruserdata
/.drake/
.drake/
/output/drake-cache.sqlite
/analysis/*.html
/data/
Expand Down
24 changes: 7 additions & 17 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
export("%>%")
export(analysis_wrapper)
export(build_analyses_plan)
export(build_bbs_datasets_plan)
export(build_datasets_plan)
export(check_data_format)
export(check_default_data_path)
export(collect_analyses)
export(get_bbs_data)
export(combine_bbs_subspecies)
export(filter_bbs_species)
export(filter_bbs_ts)
export(get_bbs_route_region_data)
export(get_cowley_lizards)
export(get_cowley_snakes)
export(get_default_data_path)
Expand All @@ -22,6 +26,8 @@ export(get_sgs_data)
export(import_retriever_data)
export(install_retriever_data)
export(interpolate_obs)
export(prepare_bbs_ts_data)
export(process_bbs_route_region_data)
export(richness)
export(summarise_effort)
export(summarise_obs)
Expand All @@ -34,21 +40,5 @@ export(ts_summary)
export(ts_summary_drake)
export(uni_ts_summary)
export(use_default_data_path)
importFrom(graphics,axis)
importFrom(graphics,mtext)
importFrom(graphics,par)
importFrom(graphics,plot)
importFrom(graphics,points)
importFrom(graphics,rect)
importFrom(graphics,text)
importFrom(magrittr,"%>%")
importFrom(stats,D)
importFrom(stats,acf)
importFrom(stats,cor)
importFrom(stats,median)
importFrom(stats,na.omit)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(utils,data)
importFrom(utils,read.csv)
importFrom(utils,read.delim)
210 changes: 210 additions & 0 deletions R/bbs_cleaning_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#' @title Prepare BBS population time-series data
#' @description Modified from https://github.com/weecology/bbs-forecasting
#' and https://github.com/weecology/MATSS-community-change
#' Selects sites with data spanning start_yr through end_yr containing at
#' least min_num_yrs of data samples during that period. Cleans data tables
#' and stores each individual route as a .Rds file. Saves a data table of the
#' route + region pairs.
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
#' @param bbs_subset optional, a subset of the BBS communities to use
#' (to speed up development). As c(1:X)
#' @inheritParams get_mtquad_data
#' @return NULL
#' @export

prepare_bbs_ts_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10,
path = get_default_data_path(), bbs_subset = NULL)
{
bbs_data_tables <- import_retriever_data("breed-bird-survey", path = path)

bbs_data <- bbs_data_tables$breed_bird_survey_weather %>%
dplyr::filter(runtype == 1, rpid == 101) %>%
dplyr::left_join(bbs_data_tables$breed_bird_survey_counts,
by = c('statenum', 'route', 'rpid', 'year', 'routedataid', 'countrynum')) %>%
dplyr::left_join(bbs_data_tables$breed_bird_survey_routes,
by = c('statenum', 'route', 'countrynum')) %>%
dplyr::mutate(site_id = statenum*1000 + route,
starttemp = dplyr::case_when(tempscale=='F' ~ c((starttemp - 32)*5/9),
tempscale=='C' ~ as.double(starttemp)),
endtemp = dplyr::case_when(tempscale=='F' ~ c((endtemp - 32)*5/9),
tempscale=='C' ~ as.double(endtemp))) %>%
dplyr::rename(lat = latitude,
long = longitude,
species_id = aou,
abundance = speciestotal) %>%
filter_bbs_ts(start_yr, end_yr, min_num_yrs)

# prepare and write out route and region metadata
bbs_routes_regions <- bbs_data %>%
dplyr::select(bcr, route) %>%
dplyr::distinct() %>%
dplyr::mutate(name = paste0("bbs_bcr", bcr, "_route", route))

storage_path <- file.path(path, 'breed-bird-survey-prepped')
if (!dir.exists(storage_path)) {
dir.create(storage_path)
}

write.csv(bbs_routes_regions, file.path(storage_path, "routes_and_regions_table.csv"), row.names = F)

# filter and process selected route and region combinations
if (!is.null(bbs_subset)) {
bbs_routes_regions <- bbs_routes_regions[bbs_subset, ]
}

bbs_routes_regions %>%
dplyr::select(bcr, route) %>%
purrr::pmap(function(bcr, route) {
bbs_data %>%
dplyr::filter(bcr == !!bcr,
route == !!route) %>%
process_bbs_route_region_data(species_table = bbs_data_tables$breed_bird_survey_species) %>%
saveRDS(file = file.path(storage_path, paste0("route", route, "region", bcr, ".Rds")) )
})
}

#' @title Process the BBS data for an individual route and region
#' @description Correct and otherwise filter BBS species data (see
#' \code{\link{combine_subspecies}} and \code{\link{filter_bbs_species}} for
#' more info). Generate the abundance, covariate, and metadata tables and
#' return the combined object.
#' @param bbs_data_table main bbs data table
#' @param species_table table of species for BBS
#' @return the processed BBS data
#' @export
process_bbs_route_region_data <- function(bbs_data_table, species_table)
{
# check that exactly one route and one region are represented in the data
route <- unique(bbs_data_table$route)
region <- unique(bbs_data_table$bcr)
stopifnot(length(route) == 1 &&
length(region == 1))

# process species IDs
this_bbs_data <- bbs_data_table %>%
combine_bbs_subspecies(species_table = species_table) %>%
filter_bbs_species(species_table = species_table) %>%
dplyr::mutate(species_id = paste('sp', species_id, sep=''),
date = as.Date(paste(year, month, day, sep = "-"))) %>%
dplyr::ungroup()

abundance <- this_bbs_data %>%
dplyr::group_by(year, species_id) %>%
dplyr::summarise(abundance = sum(abundance)) %>%
dplyr::ungroup() %>%
tidyr::spread(key = species_id, value = abundance, fill = 0) %>%
dplyr::arrange(year) %>%
dplyr::select(-year)

covariates <- this_bbs_data %>%
dplyr::group_by(year) %>%
dplyr::summarise(effort = dplyr::n_distinct(site_id),
starttemp = mean(starttemp), endtemp = mean(endtemp),
startwind = mean(startwind), endwind = mean(endwind),
startsky = mean(startsky), endsky = mean(endsky),
lat = mean(lat), long = mean(long), mean_date = mean(date)) %>%
dplyr::arrange(year)

metadata <- list(timename = 'year', effort = 'effort', route = route, region = region)

return(list('abundance' = abundance, 'covariates' = covariates, 'metadata' = metadata))
}


#' @title Filter BBS to specified time series period and number of samples
#'
#' @description Modified from https://github.com/weecology/bbs-forecasting
#' and https://github.com/weecology/MATSS-community-change
#'
#' @param bbs_data dataframe that contains BBS site_id and year columns
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
#'
#' @return dataframe with original data and associated environmental data
#' @export
filter_bbs_ts <- function(bbs_data, start_yr, end_yr, min_num_yrs) {
sites_to_keep = bbs_data %>%
dplyr::filter(year >= start_yr, year <= end_yr) %>%
dplyr::group_by(site_id) %>%
dplyr::summarise(num_years = length(unique(year))) %>%
dplyr::ungroup() %>%
dplyr::filter(num_years >= min_num_yrs)

filtered_data <- bbs_data %>%
dplyr::filter(year >= start_yr, year <= end_yr) %>%
dplyr::filter(site_id %in% sites_to_keep$site_id)
}


#' Filter poorly sampled BBS species
#'
#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
#'
#' Removes waterbirds, shorebirds, owls, kingfishers, knightjars,
#' dippers. These species are poorly sampled due to their aquatic or
#' noctural nature. Also removes taxa that were either partially unidentified
#' (e.g. "sp.") or were considered hybrids (e.g. "A x B") or were listed as more
#' than one species (e.g. "A / B")
#'
#' @param df dataframe containing an species_id column
#'
#' @return dataframe, filtered version of initial dataframe
#' @export

filter_bbs_species <- function(df, species_table){

is_unidentified = function(names) {
#Before filtering, account for this one hybrid of 2 subspecies so it's kept
names[names == 'auratus auratus x auratus cafer'] = 'auratus auratus'
grepl('sp\\.| x |\\/', names)
}

valid_taxa = species_table %>%
dplyr:: filter(!is_unidentified(species)) %>%
dplyr::filter(aou > 2880) %>%
dplyr::filter(aou < 3650 | aou > 3810) %>%
dplyr::filter(aou < 3900 | aou > 3910) %>%
dplyr::filter(aou < 4160 | aou > 4210) %>%
dplyr::filter(aou != 7010)

dplyr::filter(df, species_id %in% valid_taxa$aou)
}

#' Combine subspecies into their common species
#'
#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
#'
#' @export
combine_bbs_subspecies = function(df, species_table){

# Subspecies have two spaces separated by non-spaces
subspecies_names = species_table %>%
dplyr::filter(aou %in% unique(df$species_id)) %>%
dplyr::pull(spanish_common_name) %>%
grep(" [^ ]+ ", ., value = TRUE)

subspecies_ids = species_table %>%
dplyr::filter(spanish_common_name %in% subspecies_names) %>%
dplyr::pull(aou)

# Drop all but the first two words to get the root species name,
# then find the AOU code
new_subspecies_ids = species_table %>%
dplyr::slice(match(stringr::word(subspecies_names, 1,2),
species_table$spanish_common_name)) %>%
dplyr::pull(aou)

# replace the full subspecies names with species-level names
for (i in seq_along(subspecies_ids)) {
df$species_id[df$species_id == subspecies_ids[i]] = new_subspecies_ids[i]
}

df_grouped <- df %>%
dplyr::group_by_at(dplyr::vars(-abundance)) %>%
dplyr::summarise(abundance = sum(abundance)) %>%
dplyr::ungroup() %>%
dplyr::distinct()
}
51 changes: 47 additions & 4 deletions R/build_plans.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,19 @@ build_analyses_plan <- function(methods, datasets, ...)
#' @title Generate a Drake Plan for Datasets
#'
#' @param data_path where to get the downloaded retriever datasets
#' @param include_downloaded_data whether to also include downloadable datasets
#' @param include_retriever_data whether to include retriever-downloaded data
#' @param include_bbs_data whether to include BBS data
#' @inheritParams build_bbs_datasets_plan
#'
#' @return a drake plan (i.e. a tibble) specifying the targets and commands
#' for gathering datasets
#'
#' @export
#'
build_datasets_plan <- function(data_path = get_default_data_path(),
include_downloaded_data = FALSE)
include_retriever_data = FALSE,
include_bbs_data = FALSE,
bbs_subset = NULL)
{
datasets <- drake::drake_plan(
maizuru_data = get_maizuru_data(),
Expand All @@ -82,17 +86,56 @@ build_datasets_plan <- function(data_path = get_default_data_path(),
karoo_data = get_karoo_data(),
kruger_data = get_kruger_data()
)
if (include_downloaded_data)

if (include_retriever_data)
{
datasets <- datasets %>%
dplyr::bind_rows(
drake::drake_plan(
portal_data = get_portal_rodents(),
bbs_data = get_bbs_data(region = 7, path = !!data_path),
sdl_data = get_sdl_data(path = !!data_path),
mtquad_data = get_mtquad_data(path = !!data_path)
)
)
}

if (include_bbs_data) {
bbs_datasets = build_bbs_datasets_plan(data_path = data_path,
bbs_subset = bbs_subset)

datasets <- datasets %>%
dplyr::bind_rows(bbs_datasets)
}

return(datasets)
}

#' @title Generate a Drake Plan for BBS Datasets
#'
#' @inheritParams build_datasets_plan
#' @inheritParams prepare_bbs_ts_data
#'
#' @return a drake plan (i.e. a tibble) specifying the targets and commands
#' for gathering BBS datasets
#'
#' @export
#'
build_bbs_datasets_plan <- function(data_path = get_default_data_path(), bbs_subset = NULL)
{
routes_and_regions_file <- file.path(data_path, "breed-bird-survey-prepped", "routes_and_regions_table.csv")

if (!file.exists(routes_and_regions_file)) {
prepare_bbs_ts_data(path = data_path, bbs_subset = bbs_subset)
}

routes_and_regions <- read.csv(routes_and_regions_file, colClasses = "character")

bbs_datasets <- drake::drake_plan(
bbs_data_rtrg = target(get_bbs_route_region_data(route, region, path = !!data_path),
transform = map(route = !!rlang::syms(routes_and_regions$route),
region = !!rlang::syms(routes_and_regions$bcr)
)
)
)
return(bbs_datasets)
}
Loading