diff --git a/.gitignore b/.gitignore index d30a6da..ca5a933 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ # Session Data files .RData .RDataTmp -.Rproj +*.Rproj # User-specific files .Ruserdata diff --git a/DESCRIPTION b/DESCRIPTION index 791fdbf..cb915b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: travelSurveyTools Title: travelSurveyTools -Version: 2.4.2 +Version: 2.4.3 Authors@R: c( person("RSG", "Inc.", , "rsg@rsginc.com", role = c("aut", "cre")), person("Ashley", "Asmus", , "ashley.asmus@rsginc.com", role = "aut"), diff --git a/NEWS.md b/NEWS.md index e4c6b74..959c4ba 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# travelSurveyTools 2.4.3 + +- Resolve bugs and updated function syntax in `hts_prep_triprate` and `hts_prep_variable` +- Updated `remove_outliers` parameter to default to FALSE in `hts_prep_triprate` +- Added a warning message in `hts_remove_outliers` + # travelSurveyTools 2.4.2 - Add summarize_var and summarize_by to output. Select weight automatically in `hts_summary_wrapper` diff --git a/R/hts_prep_triprate.R b/R/hts_prep_triprate.R index 0d9b107..f83e486 100644 --- a/R/hts_prep_triprate.R +++ b/R/hts_prep_triprate.R @@ -55,10 +55,16 @@ hts_prep_triprate = function(summarize_by = NULL, day_name = "day", ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), wts = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), - remove_outliers = TRUE, + remove_outliers = FALSE, threshold = 0.975, weighted = TRUE, - hts_data) { + hts_data = list( + "hh" = hh, + "person" = person, + "day" = day, + "trip" = trip, + "vehicle" = vehicle)) { + # Check variable_list first variables_dt = hts_validate_variable_list(variables_dt, hts_data) @@ -135,151 +141,151 @@ hts_prep_triprate = function(summarize_by = NULL, shared_name %in% summarize_by & get(day_name) == 1 & get(trip_name) == 1, shared_name] - -if (length(day_trip_vars) > 0) { - - setnames(variables_dt, trip_name, 'trip_table') - - variables_dt[shared_name %in% day_trip_vars, trip_table := 0] - - setnames(variables_dt, 'trip_table', trip_name) - -} - -if (length(summarize_by) > 0) { - byvar_dt = hts_prep_byvar(summarize_by, - variables_dt = variables_dt, - hts_data = hts_data, - byvar_ids = ids, - byvar_wts = wts - ) - - merge_cols = names(byvar_dt)[names(byvar_dt) %in% names(trip_control)] - triprate_dt = merge(trip_control, byvar_dt, by = merge_cols, all.x = TRUE) - - triprate_cols = intersect(names(triprate_dt), c(ids, wts)) - - triprate_cols = triprate_cols[!triprate_cols %in% c(trip_id, trip_wt)] - - triprate_cols_all = c(triprate_cols, summarize_by) - - if (weighted) { - triprate_dt = triprate_dt[, .(num_trips = sum(get(trip_wt))), - by = triprate_cols_all - ] - } - - if (!weighted) { - triprate_dt = triprate_dt[, .(num_trips = sum(!is.na(get(trip_id)))), - by = triprate_cols_all - ] + if (length(day_trip_vars) > 0) { + + setnames(variables_dt, trip_name, 'trip_table') + + variables_dt[shared_name %in% day_trip_vars, trip_table := 0] + + setnames(variables_dt, 'trip_table', trip_name) + } - # fill in with zeros for zero trips on a given day: - triprate_dt[, `:=`( - num_trips = nafill(num_trips, fill = 0) - )] - - # If one of the by-variables is in trip table, need to expand to - # include all levels of the variable for every trip, and fill with zeros: - if (trip_id %in% names(byvar_dt)) { - # fill in with zeros for zero trips for a given level of xt_var using dcast: - dcast_formula = - paste0( - paste0(triprate_cols, collapse = " + "), - " ~ ", - paste0(summarize_by, collapse = " + ") - ) - - triprate_cast = dcast(triprate_dt, - dcast_formula, - value.var = "num_trips", - fill = 0 + if (length(summarize_by) > 0) { + byvar_dt = hts_prep_byvar(summarize_by, + variables_dt = variables_dt, + hts_data = hts_data, + byvar_ids = ids, + byvar_wts = wts ) - # Remove columns where NA levels of factors were generated during dcast: - na_filled_cols = names(triprate_cast)[names(triprate_cast) %like% "_NA"] + merge_cols = names(byvar_dt)[names(byvar_dt) %in% names(trip_control)] - if (length(na_filled_cols) > 0) { - triprate_cast[, c(na_filled_cols) := NULL] - } + triprate_dt = merge(trip_control, byvar_dt, by = merge_cols, all.x = TRUE) - # transform back to long format, with separate cols for weighted & unwt. trip rates: - triprate_dt = data.table::melt( - triprate_cast, - id.vars = triprate_cols, - value.name = "num_trips" - ) + triprate_cols = intersect(names(triprate_dt), c(ids, wts)) + + triprate_cols = triprate_cols[!triprate_cols %in% c(trip_id, trip_wt)] + + triprate_cols_all = c(triprate_cols, summarize_by) - # Relabel xtab trip vars after melting: - if (length(summarize_by) > 1) { - triprate_dt[, c(summarize_by) := tstrsplit(variable, "_")] - triprate_dt[, variable := NULL] + if (weighted) { + triprate_dt = triprate_dt[, .(num_trips = sum(get(trip_wt))), + by = triprate_cols_all + ] } - if (length(summarize_by) == 1) { - setnames(triprate_dt, old = "variable", new = summarize_by) + if (!weighted) { + triprate_dt = triprate_dt[, .(num_trips = sum(!is.na(get(..trip_id)))), + by = triprate_cols_all + ] } - triprate_dt = triprate_dt[] + # fill in with zeros for zero trips on a given day: + triprate_dt[, `:=`( + num_trips = nafill(num_trips, fill = 0) + )] + + # If one of the by-variables is in trip table, need to expand to + # include all levels of the variable for every trip, and fill with zeros: + if (trip_id %in% names(byvar_dt)) { + # fill in with zeros for zero trips for a given level of xt_var using dcast: + dcast_formula = + paste0( + paste0(triprate_cols, collapse = " + "), + " ~ ", + paste0(summarize_by, collapse = " + ") + ) + + triprate_cast = dcast(triprate_dt, + dcast_formula, + value.var = "num_trips", + fill = 0 + ) + + # Remove columns where NA levels of factors were generated during dcast: + na_filled_cols = names(triprate_cast)[names(triprate_cast) %like% "_NA" | names(triprate_cast) == "NA"] + + if (length(na_filled_cols) > 0) { + triprate_cast[, c(na_filled_cols) := NULL] + } + + # transform back to long format, with separate cols for weighted & unwt. trip rates: + triprate_dt = data.table::melt( + triprate_cast, + id.vars = triprate_cols, + value.name = "num_trips" + ) + + # Relabel xtab trip vars after melting: + if (length(summarize_by) > 1) { + triprate_dt[, c(summarize_by) := tstrsplit(variable, "_")] + triprate_dt[, variable := NULL] + } + + if (length(summarize_by) == 1) { + setnames(triprate_dt, old = "variable", new = summarize_by) + } + + triprate_dt = triprate_dt[] + } + + if (weighted) { + # calculate trip rate + triprate_dt[, trip_rate := + ifelse(num_trips == 0, 0, num_trips / get(day_wt))] + + # Save counts of trips under a different name + setnames(triprate_dt, "num_trips", "trip_count_wtd") + + setnames(triprate_dt, "trip_rate", "num_trips") + } } - if (weighted) { - # calculate trip rate - triprate_dt[, trip_rate := - ifelse(num_trips == 0, 0, num_trips / get(day_wt))] + # remove outliers + if (remove_outliers) { + out = hts_remove_outliers(triprate_dt, + numvar = "num_trips", + threshold = threshold + ) - # Save counts of trips under a different name - setnames(triprate_dt, "num_trips", "trip_count_wtd") + triprate_dt = out[["dt"]] - setnames(triprate_dt, "trip_rate", "num_trips") + outlier_table = out[["outlier_description"]] } -} - -# remove outliers -if (remove_outliers) { - out = hts_remove_outliers(triprate_dt, - numvar = "num_trips", - threshold = threshold - ) - triprate_dt = out[["dt"]] + # Bin trips: + triprate_binned = hts_bin_var( + prepped_dt = triprate_dt, + numvar = "num_trips", + nbins = 7 + ) - outlier_table = out[["outlier_description"]] -} - -# Bin trips: -triprate_binned = hts_bin_var( - prepped_dt = triprate_dt, - numvar = "num_trips", - nbins = 7 -) - -if (weighted) { - setnames(triprate_dt, "num_trips", "num_trips_wtd", skip_absent = TRUE) - setnames(triprate_binned, "num_trips", "num_trips_wtd", skip_absent = TRUE) -} else { - setnames(triprate_dt, "num_trips", "num_trips_unwtd", skip_absent = TRUE) - setnames(triprate_binned, "num_trips", "num_trips_unwtd", skip_absent = TRUE) -} -prepped_dt_ls = list( - "num" = triprate_dt, - "cat" = triprate_binned -) - -# Append outliers: -if (remove_outliers) { + if (weighted) { + setnames(triprate_dt, "num_trips", "num_trips_wtd", skip_absent = TRUE) + setnames(triprate_binned, "num_trips", "num_trips_wtd", skip_absent = TRUE) + } else { + setnames(triprate_dt, "num_trips", "num_trips_unwtd", skip_absent = TRUE) + setnames(triprate_binned, "num_trips", "num_trips_unwtd", skip_absent = TRUE) + } prepped_dt_ls = list( - "cat" = triprate_binned, "num" = triprate_dt, - "outliers" = outlier_table + "cat" = triprate_binned ) -} - - -return(prepped_dt_ls) + + # Append outliers: + if (remove_outliers) { + prepped_dt_ls = list( + "cat" = triprate_binned, + "num" = triprate_dt, + "outliers" = outlier_table + ) + } + + + return(prepped_dt_ls) } ## quiets concerns of R CMD check -utils::globalVariables(c("trip_weight", "num_trips", "trip_rate", "day_weight", "trip_table")) +utils::globalVariables(c("..trip_id","trip_weight", "num_trips", "trip_rate", "day_weight", "trip_table")) diff --git a/R/hts_prep_variable.R b/R/hts_prep_variable.R index 30625ae..db3059f 100644 --- a/R/hts_prep_variable.R +++ b/R/hts_prep_variable.R @@ -68,37 +68,42 @@ #' ) #' ) hts_prep_variable = function(summarize_var = NULL, - summarize_by = NULL, - variables_dt = variable_list, - data = hts_data, - id_cols = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), - weighted = TRUE, - wt_cols = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), - remove_outliers = TRUE, - threshold = 0.975, - remove_missing = TRUE, - missing_values = c("Missing Response", "995"), - not_imputable = -1, - strataname = NULL) { + summarize_by = NULL, + variables_dt = variable_list, + data = list( + "hh" = hh, + "person" = person, + "day" = day, + "trip" = trip, + "vehicle" = vehicle), + id_cols = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), + weighted = TRUE, + wt_cols = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), + remove_outliers = TRUE, + threshold = 0.975, + remove_missing = TRUE, + missing_values = c("Missing Response", "995"), + not_imputable = -1, + strataname = NULL) { # tictoc::tic("Total Time") - + # Check variable_list first variables_dt = hts_validate_variable_list(variables_dt, data) - + # Check that there is a id and weight for every table if (length(data) != length(id_cols)) { stop("Each table in data must have a corresponding id in id_cols") } - + if (weighted) { if (length(data) != length(wt_cols)) { stop("Each table in data must have a corresponding weight in wt_cols") } } - + # TODO: Could we put id and weight cols in a snippet or some such? # Or in a settings/options for these functions? - + if (remove_missing) { data = hts_remove_missing_data( hts_data = data, @@ -110,78 +115,78 @@ hts_prep_variable = function(summarize_var = NULL, not_imputable = not_imputable ) } - + # Find location of summary variable: var_location = hts_find_var(summarize_var, data = data, variables_dt = variables_dt) - + tbl_idx = which(names(data) == var_location) - + wtname = wt_cols[tbl_idx] - + # Select table where this variable lives: var_dt = data[[var_location]] - + # check for missing weight variables missing_weight_count = var_dt[is.na(get(wtname)), .N] - + if (missing_weight_count > 0) { message( missing_weight_count, " missing observation(s) of ", wtname, " in ", var_location, " setting equal to 0" ) - + setnames(var_dt, wtname, "old_weight") - + var_dt[, old_weight := ifelse(is.na(old_weight), - 0, - old_weight + 0, + old_weight )] - + setnames(var_dt, "old_weight", wtname) } - + # Check that specified id column exists in var_dt stopifnot( "table id not in var_dt, make id_cols ordered list of ids in each table of data" = id_cols[[tbl_idx]] %in% names(var_dt) ) - + # Is this a shared variable? var_is_shared = variables_dt[shared_name == summarize_var, is_checkbox][1] == 1 - + # If yes, expand summarize_var: if (var_is_shared) { summarize_var = variables_dt[shared_name == summarize_var, variable] - + for (i in 1:length(summarize_var)) { if (var_dt[, !"integer" %in% class(get(summarize_var[i]))]) { message("Checkbox variables must have integer values") - + stop() } } } - + # only keep ids that are in var_dt sum_vars_id_cols = intersect(id_cols, names(var_dt)) - + # Subset table to these column(s): - + if (weighted) { subset_cols = c(sum_vars_id_cols, summarize_var, wtname) } else { subset_cols = c(sum_vars_id_cols, summarize_var) } - + var_dt = var_dt[, subset_cols, with = FALSE] - + # If shared variable, melt var_dt: if (var_is_shared) { shared_name = variables_dt[ variable == summarize_var[[1]], shared_name ] - + if (weighted) { var_dt = hts_melt_vars( shared_name = shared_name, @@ -207,43 +212,43 @@ hts_prep_variable = function(summarize_var = NULL, to_single_row = FALSE ) } - - + + summarize_var = shared_name - + setnames(var_dt, shared_name, "shared_name") - + # make factor levels var_dt$shared_name = factor(var_dt$shared_name, levels = unique(var_dt$shared_name)) - + setnames(var_dt, "shared_name", shared_name) } - + # Identify, then bin, if summarize_var is numeric: v_class = variables_dt[shared_name == summarize_var, data_type][[1]] - + if (!v_class %in% c("integer", "numeric")) { var_dt_num = NULL var_dt_cat = var_dt } - + if (v_class %in% c("integer", "numeric")) { # remove outliers if (remove_outliers) { out = hts_remove_outliers(var_dt, - numvar = summarize_var, - threshold = threshold + numvar = summarize_var, + threshold = threshold ) - + var_dt = out[["dt"]] - + outlier_table = out[["outlier_description"]] } - + # save a copy of the un-binned data: var_dt_num = data.table::copy(var_dt) - - + + # bin the data for categorical summaries: var_dt_cat = hts_bin_var( prepped_dt = var_dt, @@ -251,72 +256,72 @@ hts_prep_variable = function(summarize_var = NULL, nbins = 7 ) } - + # Summarize-by variables: if (length(summarize_by) == 0) { num_res = var_dt_num cat_res = var_dt_cat } - + if (length(summarize_by) > 0) { for (i in 1:length(summarize_by)) { var = summarize_by[i] - + byvar_location = hts_find_var(var, data = data, variables_dt = variables_dt) - + # Select table where this variable lives: byvar_table = data[[byvar_location]] - + byvar_is_shared = variables_dt[shared_name == var, is_checkbox][1] == 1 - + if (byvar_is_shared) { var = variables_dt[shared_name == var, variable][1] } - + if (byvar_is_shared & !"integer" %in% byvar_table[, class(get(var))]) { message("Checkbox variables must have integer values") - + stop() } } - + byvar_dt = hts_prep_byvar(summarize_by, - variables_dt = variables_dt, - hts_data = data, - byvar_ids = id_cols, - byvar_wts = wt_cols + variables_dt = variables_dt, + hts_data = data, + byvar_ids = id_cols, + byvar_wts = wt_cols ) - + # Merge by var and summarize var: allow_cartesian_setting = FALSE - + if (var_is_shared == TRUE) { allow_cartesian_setting = TRUE } - + cat_res = merge(var_dt_cat, - byvar_dt, - all.x = FALSE, all.y = FALSE, - allow.cartesian = allow_cartesian_setting + byvar_dt, + all.x = FALSE, all.y = FALSE, + allow.cartesian = allow_cartesian_setting ) - + setcolorder(cat_res, intersect(c(sum_vars_id_cols, wt_cols, summarize_var, summarize_by), names(cat_res))) - + if (v_class %in% c("integer", "numeric")) { num_res = merge(var_dt_num, - byvar_dt, - all.x = FALSE, all.y = FALSE, - allow.cartesian = allow_cartesian_setting + byvar_dt, + all.x = FALSE, all.y = FALSE, + allow.cartesian = allow_cartesian_setting ) - + setcolorder(num_res, intersect(c(sum_vars_id_cols, wt_cols, summarize_var, summarize_by), names(cat_res))) } - + if (!v_class %in% c("integer", "numeric")) { num_res = NULL } } - + if (!is.null(strataname)) { if (!is.null(cat_res)) { cat_res = hts_cbind_var( @@ -328,7 +333,7 @@ hts_prep_variable = function(summarize_var = NULL, cbind_wts = wt_cols ) } - + if (!is.null(num_res)) { num_res = hts_cbind_var( lhs_table = num_res, @@ -340,12 +345,12 @@ hts_prep_variable = function(summarize_var = NULL, ) } } - + prepped_dt_ls = list( "cat" = cat_res, "num" = num_res ) - + # Append outliers: if (v_class %in% c("integer", "numeric") & remove_outliers) { prepped_dt_ls = list( @@ -354,8 +359,8 @@ hts_prep_variable = function(summarize_var = NULL, "outliers" = outlier_table ) } - - + + return(prepped_dt_ls) } diff --git a/R/hts_remove_outliers.R b/R/hts_remove_outliers.R index 9da5c6f..ee65e14 100644 --- a/R/hts_remove_outliers.R +++ b/R/hts_remove_outliers.R @@ -11,10 +11,12 @@ #' @examples #' #' require(data.table) +#' #' hts_remove_outliers(var_dt = trip, numvar = "speed_mph") #' hts_remove_outliers = function(var_dt, numvar = NULL, threshold = 0.975) { + outlier_storage = list() outlier_cutoff = quantile(var_dt[, get(numvar)], threshold, na.rm = TRUE) @@ -26,6 +28,12 @@ hts_remove_outliers = function(var_dt, numvar = NULL, max_outlier = max(var_dt[get(numvar) >= outlier_cutoff, get(numvar)]) ) + if (outlier_table$num_removed > 0) { + warning(stringr::str_glue( + "{outlier_table$num_removed} outliers were removed based on the threshold of {threshold}." + )) + } + outlier_storage[["outlier_description"]] = outlier_table outlier_storage[["dt"]] = var_dt[get(numvar) < outlier_cutoff] diff --git a/docs/404.html b/docs/404.html index c2974ca..a9c3729 100644 --- a/docs/404.html +++ b/docs/404.html @@ -24,7 +24,7 @@ travelSurveyTools - 2.4.2 + 2.4.3