Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

By route and cleaning #124

Closed
wants to merge 10 commits into from
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
export("%>%")
export(analysis_wrapper)
export(build_analyses_plan)
export(build_bbs_datasets_plan)
export(build_datasets_plan)
export(check_data_format)
export(check_default_data_path)
export(collect_analyses)
export(get_bbs_data)
export(combine_subspecies)
export(filter_bbs_species)
export(filter_ts)
export(get_bbs_route_region_data)
export(get_cowley_lizards)
export(get_cowley_snakes)
export(get_default_data_path)
Expand All @@ -22,6 +26,7 @@ export(get_sgs_data)
export(import_retriever_data)
export(install_retriever_data)
export(interpolate_obs)
export(prepare_bbs_ts_data)
export(richness)
export(summarise_effort)
export(summarise_obs)
Expand Down
69 changes: 69 additions & 0 deletions R/bbs_cleaning_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#' Filter poorly sampled BBS species
#'
#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
#'
#' Removes waterbirds, shorebirds, owls, kingfishers, knightjars,
#' dippers. These species are poorly sampled due to their aquatic or
#' noctural nature. Also removes taxa that were either partially unidentified
#' (e.g. "sp.") or were considered hybrids (e.g. "A x B") or were listed as more
#' than one species (e.g. "A / B")
#'
#' @param df dataframe containing an species_id column
#'
#' @return dataframe, filtered version of initial dataframe
#' @export

filter_bbs_species <- function(df, species_table){

is_unidentified = function(names) {
#Before filtering, account for this one hybrid of 2 subspecies so it's kept
names[names=='auratus auratus x auratus cafer']='auratus auratus'
grepl('sp\\.| x |\\/', names)
}

valid_taxa = species_table %>%
dplyr:: filter(!is_unidentified(species)) %>%
dplyr::filter(aou > 2880) %>%
dplyr::filter(aou < 3650 | aou > 3810) %>%
dplyr::filter(aou < 3900 | aou > 3910) %>%
dplyr::filter(aou < 4160 | aou > 4210) %>%
dplyr::filter(aou != 7010)

dplyr::filter(df, species_id %in% valid_taxa$aou)
}

#' Combine subspecies into their common species
#'
#' Modified from https://github.com/weecology/bbs-forecasting/blob/master/R/forecast-bbs-core.R
#'
#' @export
combine_subspecies = function(df, species_table){

# Subspecies have two spaces separated by non-spaces
subspecies_names = species_table %>%
dplyr::filter(aou %in% unique(df$species_id)) %>%
dplyr::pull(spanish_common_name) %>%
grep(" [^ ]+ ", ., value = TRUE)

subspecies_ids = species_table %>%
dplyr::filter(spanish_common_name %in% subspecies_names) %>%
dplyr::pull(aou)

# Drop all but the first two words to get the root species name,
# then find the AOU code
new_subspecies_ids = species_table %>%
dplyr::slice(match(stringr::word(subspecies_names, 1,2),
species_table$spanish_common_name)) %>%
dplyr::pull(aou)

# replace the full subspecies names with species-level names
for (i in seq_along(subspecies_ids)) {
df$species_id[df$species_id == subspecies_ids[i]] = new_subspecies_ids[i]
}

df_grouped <- df %>%
dplyr::group_by_at(dplyr::vars(-abundance)) %>%
dplyr::summarise(abundance = sum(abundance)) %>%
dplyr::ungroup() %>%
dplyr::distinct()
}
42 changes: 40 additions & 2 deletions R/build_plans.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,16 @@ build_analyses_plan <- function(methods, datasets, ...)
#'
#' @param data_path where to get the downloaded retriever datasets
#' @param include_downloaded_data whether to also include downloadable datasets
#' @param include_bbs_data whether to include BBS data
#'
#' @return a drake plan (i.e. a tibble) specifying the targets and commands
#' for gathering datasets
#'
#' @export
#'
build_datasets_plan <- function(data_path = get_default_data_path(),
include_downloaded_data = FALSE)
include_downloaded_data = FALSE,
include_bbs_data = FALSE)
{
datasets <- drake::drake_plan(
maizuru_data = get_maizuru_data(),
Expand All @@ -88,11 +90,47 @@ build_datasets_plan <- function(data_path = get_default_data_path(),
dplyr::bind_rows(
drake::drake_plan(
portal_data = get_portal_rodents(),
bbs_data = get_bbs_data(region = 7, path = !!data_path),
sdl_data = get_sdl_data(path = !!data_path),
mtquad_data = get_mtquad_data(path = !!data_path)
)
)
}

if (include_bbs_data) {
bbs_ts_data = get_bbs_ts_data()
bbs_datasets = build_bbs_datasets_plan(bbs_ts_data)
datasets <- datasets %>%
dplyr::bind_rows(bbs_datasets)
}

return(datasets)
}

#' @title Generate a Drake Plan for BBS Datasets
#'
#' @param data_path path
#' @param from_raw whether to re-prep BBS data
#'
#' @return a drake plan (i.e. a tibble) specifying the targets and commands
#' for gathering BBS datasets
#'
#' @export
#'
build_bbs_datasets_plan <- function(path = get_default_data_path(), from_raw = FALSE)
{
# if(from_raw) {
# prepare_bbs_ts_data()
# }
# load(paste0(path, '/breed-bird-survey-prepped/bbs_ts_data.Rds'))
#
bbs_ts_data = prepare_bbs_ts_data()

bbs_datasets <- drake::drake_plan(
bbs_data_rtrg = target(get_bbs_route_region_data(route, region, bbs_ts_data),
transform = map(route = !!rlang::syms(bbs_ts_data$routes_and_regions$route),
region = !!rlang::syms(bbs_ts_data$routes_and_regions$bcr)
)
)
)
return(bbs_datasets)
}
65 changes: 48 additions & 17 deletions R/get_retriever_data.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,19 @@
#' @title Create BBS population time-series data
#' @title Prepare BBS population time-series data
#'
#' @description Modified from https://github.com/weecology/bbs-forecasting
#' and https://github.com/weecology/MATSS-community-change
#' Selects sites with data spanning start_yr through end_yr containing at least min_num_yrs of data
#' samples during that period.
#'
#' @param start_yr num first year of time-series
#' @param end_yr num last year of time-series
#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
#' @param region region code of data to return (currently uses state codes)
#' @inheritParams get_mtquad_data
#'
#' @return list of two dataframes (one with abundance data, the other with covariate data)
#' and one list of metadata.
#'
#' @examples
#' \dontrun{
#' get_bbs_data(region=7)
#' }
#' @return list of three dataframes (one with bbs data filtered to time series that meet the criteria, one with the BBS species table, and one with the routes and regions represented in the first dataframe).
#' @export

get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, region,
path = get_default_data_path())
prepare_bbs_ts_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10,
path = get_default_data_path())
{
bbs_data_tables <- import_retriever_data("breed-bird-survey", path = path)

Expand All @@ -31,7 +23,6 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
by = c('statenum', 'route', 'rpid', 'year', 'routedataid', 'countrynum')) %>%
dplyr::left_join(bbs_data_tables$breed_bird_survey_routes,
by = c('statenum', 'route', 'countrynum')) %>%
dplyr::filter(bcr == region) %>%
dplyr::mutate(site_id = statenum*1000 + route,
starttemp = dplyr::case_when(tempscale=='F' ~ c((starttemp - 32)*5/9),
tempscale=='C' ~ as.double(starttemp)),
Expand All @@ -41,20 +32,57 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
long = longitude,
species_id = aou,
abundance = speciestotal) %>%
MATSS::filter_ts(start_yr, end_yr, min_num_yrs)


bbs_routes_regions <- bbs_data %>%
dplyr::select(bcr, route) %>%
dplyr::distinct() %>%
dplyr::mutate(bcr = as.character(bcr), route = as.character(route)) %>%
dplyr::mutate(name = paste0("bbs_bcr", bcr, "_route", route))

bbs_ts_data = list(bbs_data = bbs_data, species_table = bbs_data_tables$breed_bird_survey_species, routes_and_regions = bbs_routes_regions)

# if(!dir.exists(paste0(path, '/breed-bird-survey-prepped'))) {
# dir.create(paste0(path, '/breed-bird-survey-prepped'))
# }

# save(bbs_ts_data, file = paste0(path, '/breed-bird-survey-prepped/bbs_ts_data.Rds'))

return(bbs_ts_data)
}


#' Get BBS data by route and reigon
#'
#' @param route route
#' @param region region
#' @param bbs_ts_data list of three dataframes (output of get_bbs_ts_data)
#' @return list of two dataframes (one with abundance data, the other with covariate data)
#' and one list of metadata.
#' @export
get_bbs_route_region_data <- function(route, region, bbs_ts_data) {

route = as.numeric(route)
region = as.numeric(region)

this_bbs_data <- bbs_ts_data$bbs_data %>%
dplyr::filter(bcr == region, route == route) %>%
combine_subspecies(species_table = bbs_ts_data$species_table) %>%
filter_bbs_species(species_table = bbs_ts_data$species_table) %>%
dplyr::mutate(species_id = paste('sp', species_id, sep=''),
date = as.Date(paste(year, month, day, sep = "-"))) %>%
filter_ts(start_yr, end_yr, min_num_yrs) %>%
dplyr::ungroup()

abundance <- bbs_data %>%
abundance <- this_bbs_data %>%
dplyr::group_by(year, species_id) %>%
dplyr::summarise(abundance = sum(abundance)) %>%
dplyr::ungroup() %>%
tidyr::spread(key = species_id, value = abundance, fill = 0) %>%
dplyr::arrange(year) %>%
dplyr::select(-year)

covariates <- bbs_data %>%
covariates <- this_bbs_data %>%
dplyr::group_by(year) %>%
dplyr::summarise(effort = dplyr::n_distinct(site_id),
starttemp = mean(starttemp), endtemp = mean(endtemp),
Expand All @@ -63,11 +91,13 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
lat = mean(lat), long = mean(long), mean_date = mean(date)) %>%
dplyr::arrange(year)

metadata <- list(timename = 'year', effort = 'effort')
metadata <- list(timename = 'year', effort = 'effort', route = this_route, region = this_bcr)

return(list('abundance' = abundance, 'covariates' = covariates, 'metadata' = metadata))

}


#' @title Filter BBS to specified time series period and number of samples
#'
#' @description Modified from https://github.com/weecology/bbs-forecasting
Expand All @@ -79,6 +109,7 @@ get_bbs_data <- function(start_yr = 1965, end_yr = 2017, min_num_yrs = 10, regio
#' @param min_num_yrs num minimum number of years of data between start_yr & end_yr
#'
#' @return dataframe with original data and associated environmental data
#' @export
filter_ts <- function(bbs_data, start_yr, end_yr, min_num_yrs) {
sites_to_keep = bbs_data %>%
dplyr::filter(year >= start_yr, year <= end_yr) %>%
Expand Down
21 changes: 21 additions & 0 deletions man/build_bbs_datasets_plan.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/build_datasets_plan.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions man/combine_subspecies.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/filter_bbs_species.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/get_bbs_route_region_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading