diff --git a/R/cal-apply-multi.R b/R/cal-apply-multi.R
index f540552a..4e4cb1eb 100644
--- a/R/cal-apply-multi.R
+++ b/R/cal-apply-multi.R
@@ -18,13 +18,20 @@ cal_apply_multi.cal_estimate_multinomial <-
 #---------------------------- >> Single Predict --------------------------------
 
 apply_multi_predict <- function(object, .data) {
+  if (inherits(object$estimates[[1]]$estimate, "gam")) {
+    prob_type <- "response"
+  } else {
+    prob_type <- "probs"
+  }
   preds <- object$estimates[[1]]$estimate %>%
-    predict(newdata = .data, type = "probs") %>%
-    dplyr::as_tibble()
+    predict(newdata = .data, type = prob_type)
+
+  colnames(preds) <- as.character(object$levels)
+  preds <- dplyr::as_tibble(preds)
   for (i in seq_along(object$levels)) {
     lev <- object$levels[i]
-    .data[, as.character(lev)] <- preds[, names(lev)]
+    .data[, as.character(lev)] <- preds[, as.character(lev)]
   }
   .data
 }
diff --git a/R/cal-estimate-multinom.R b/R/cal-estimate-multinom.R
index 20d3b68d..98421561 100644
--- a/R/cal-estimate-multinom.R
+++ b/R/cal-estimate-multinom.R
@@ -1,11 +1,52 @@
 #' Uses a Multinomial calibration model to calculate new probabilities
-#' @details It uses the `multinom` function, from the `nnet` package, to
-#' create the calibration.
+#' @details
+#' When `smooth = FALSE`, the [nnet::multinom()] function is used to estimate
+#' the model; otherwise [mgcv::gam()] is used.
 #' @inheritParams cal_estimate_logistic
+#' @examplesIf !probably:::is_cran_check() & rlang::is_installed(c("modeldata", "parsnip", "randomForest"))
+#' library(modeldata)
+#' library(parsnip)
+#' library(dplyr)
+#'
+#' f <-
+#'   list(
+#'     ~ -0.5 + 0.6 * abs(A),
+#'     ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, -2),
+#'     ~ -0.6 * A + 0.50 * B - A * B
+#'   )
+#'
+#' set.seed(1)
+#' tr_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+#' cal_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+#' te_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+#'
+#' set.seed(2)
+#' rf_fit <-
+#'   rand_forest() %>%
+#'   set_mode("classification") %>%
+#'   set_engine("randomForest") %>%
+#'   fit(class ~ ., data = tr_dat)
+#'
+#' cal_pred <-
+#'   predict(rf_fit, cal_dat, type = "prob") %>%
+#'   bind_cols(cal_dat)
+#' te_pred <-
+#'   predict(rf_fit, te_dat, type = "prob") %>%
+#'   bind_cols(te_dat)
+#'
+#' cal_plot_windowed(cal_pred, truth = class, window_size = 0.1, step_size = 0.03)
+#'
+#' smoothed_mn <- cal_estimate_multinomial(cal_pred, truth = class)
+#'
+#' new_test_pred <- cal_apply(te_pred, smoothed_mn)
+#'
+#' cal_plot_windowed(new_test_pred, truth = class, window_size = 0.1, step_size = 0.03)
+#'
 #' @export
 cal_estimate_multinomial <- function(.data,
                                      truth = NULL,
                                      estimate = dplyr::starts_with(".pred_"),
+                                     smooth = TRUE,
                                      parameters = NULL,
                                      ...) {
   UseMethod("cal_estimate_multinomial")
@@ -13,11 +54,13 @@
 
 #' @export
 #' @rdname cal_estimate_multinomial
-cal_estimate_multinomial.data.frame <- function(.data,
-                                                truth = NULL,
-                                                estimate = dplyr::starts_with(".pred_"),
-                                                parameters = NULL,
-                                                ...) {
+cal_estimate_multinomial.data.frame <-
+  function(.data,
+           truth = NULL,
+           estimate = dplyr::starts_with(".pred_"),
+           smooth = TRUE,
+           parameters = NULL,
+           ...) {
   stop_null_parameters(parameters)
 
   truth <- enquo(truth)
@@ -26,18 +69,21 @@ cal_estimate_multinomial.data.frame <- function(.data,
     truth = !!truth,
     estimate = {{ estimate }},
     source_class = cal_class_name(.data),
+    smooth = smooth,
     ...
   )
 }
 
 #' @export
 #' @rdname cal_estimate_multinomial
-cal_estimate_multinomial.tune_results <- function(.data,
-                                                  truth = NULL,
-                                                  estimate = dplyr::starts_with(".pred_"),
-                                                  parameters = NULL,
-                                                  ...) {
-  tune_args <- tune_results_args(
+cal_estimate_multinomial.tune_results <-
+  function(.data,
+           truth = NULL,
+           estimate = dplyr::starts_with(".pred_"),
+           smooth = TRUE,
+           parameters = NULL,
+           ...) {
+    tune_args <- tune_results_args(
     .data = .data,
     truth = {{ truth }},
     estimate = {{ estimate }},
@@ -53,6 +99,7 @@ cal_estimate_multinomial.tune_results <- function(.data,
     truth = !!tune_args$truth,
     estimate = !!tune_args$estimate,
     source_class = cal_class_name(.data),
+    smooth = smooth,
     ...
   )
 }
@@ -64,7 +111,7 @@ required_pkgs.cal_estimate_multinomial <- function(x, ...) {
   c("nnet", "probably")
 }
 
-cal_multinom_impl <- function(.data, truth, estimate, source_class, ...) {
+cal_multinom_impl <- function(.data, truth, estimate, source_class, smooth, ...) {
   truth <- enquo(truth)
 
   levels <- truth_estimate_map(.data, !!truth, {{ estimate }})
@@ -79,6 +126,7 @@ cal_multinom_impl <- function(.data, truth, estimate, source_class, ...) {
       .data = .data,
       truth = !!truth,
       levels = levels,
+      smooth = smooth,
       ...
     )
 
@@ -95,7 +143,7 @@ cal_multinom_impl <- function(.data, truth, estimate, source_class, ...) {
 }
 
 
-cal_multinom_impl_grp <- function(.data, truth, levels, ...) {
+cal_multinom_impl_grp <- function(.data, truth, levels, smooth, ...) {
   truth <- enquo(truth)
   .data %>%
     split_dplyr_groups() %>%
@@ -105,6 +153,7 @@ cal_multinom_impl_grp <- function(.data, truth, levels, ...) {
         .data = x$data,
         truth = !!truth,
         levels = levels,
+        smooth = smooth,
         ... = ...
       )
       list(
@@ -118,21 +167,55 @@ cal_multinom_impl_grp <- function(.data, truth, levels, ...) {
 cal_multinom_impl_single <- function(.data,
                                      truth = NULL,
                                      levels = NULL,
+                                     smooth = TRUE,
                                      ...) {
   truth <- enquo(truth)
+  num_lvls <- length(levels)
+  levels <- levels[1:(length(levels) - 1)]
 
-  levels <- levels[1:length(levels) - 1]
+  if (smooth) {
+    # multinomial gams in mgcv need zero-based integers as the outcome
 
-  levels_formula <- purrr::reduce(
-    levels,
-    function(x, y) expr(!!x + !!y)
-  )
+    class_col <- deparse(ensym(truth))
+    .data[[class_col]] <- as.numeric(.data[[class_col]]) - 1
+    max_int <- max(.data[[class_col]], na.rm = TRUE)
 
-  f_model <- expr(!!ensym(truth) ~ !!levels_formula)
+    # gam() also needs a list of formulas, one for each level, and the first
+    # one requires a left-hand side
+
+    smooths <- purrr::map(levels, ~ call2(.fn = "s", expr(!!.x)))
+    rhs_f <- purrr::reduce(smooths, function(x, y) expr(!!x + !!y))
+    rhs_only <- new_formula(lhs = NULL, rhs = rhs_f)
+    both_sides <- new_formula(lhs = ensym(truth), rhs = rhs_f)
+    all_f <- purrr::map(seq_along(levels), ~ rhs_only)
+    all_f[[1]] <- both_sides
+
+    model <- mgcv::gam(all_f, data = .data, family = mgcv::multinom(max_int))
+
+    # Nuke environments saved in formulas
+    # # TODO This next line causes a failure for unknown reasons. Look into it more
+    # model$formula <- purrr::map(model$formula, clean_env)
+    model$terms <- clean_env(model$terms)
+
+  } else {
+    levels_formula <- purrr::reduce(
+      levels,
+      function(x, y) expr(!!x + !!y)
+    )
+
+    f_model <- expr(!!ensym(truth) ~ !!levels_formula)
+
+    prevent_output <- utils::capture.output(
+      model <- nnet::multinom(formula = f_model, data = .data, ...)
+    )
+    model$terms <- clean_env(model$terms)
+  }
 
-  prevent_output <- utils::capture.output(
-    model <- nnet::multinom(formula = f_model, data = .data, ...)
-  )
 
   model
 }
+
+clean_env <- function(x) {
+  attr(x, ".Environment") <- rlang::base_env()
+  x
+}
diff --git a/man/cal_estimate_multinomial.Rd b/man/cal_estimate_multinomial.Rd
index 5aa7cd61..481e8cd3 100644
--- a/man/cal_estimate_multinomial.Rd
+++ b/man/cal_estimate_multinomial.Rd
@@ -10,6 +10,7 @@ cal_estimate_multinomial(
   .data,
   truth = NULL,
   estimate = dplyr::starts_with(".pred_"),
+  smooth = TRUE,
   parameters = NULL,
   ...
 )
@@ -18,6 +19,7 @@ cal_estimate_multinomial(
   .data,
   truth = NULL,
   estimate = dplyr::starts_with(".pred_"),
+  smooth = TRUE,
   parameters = NULL,
   ...
 )
@@ -26,6 +28,7 @@ cal_estimate_multinomial(
   .data,
   truth = NULL,
   estimate = dplyr::starts_with(".pred_"),
+  smooth = TRUE,
   parameters = NULL,
   ...
 )
@@ -43,6 +46,9 @@ defaults to the prefix used by tidymodels (\code{.pred_}). The order of the
 identifiers will be considered the same as the order of the levels of the
 \code{truth} variable.}
 
+\item{smooth}{Applies to the multinomial models. It switches between a spline-based
+\code{\link[mgcv:gam]{mgcv::gam()}} fit when \code{TRUE}, and \code{\link[nnet:multinom]{nnet::multinom()}} when \code{FALSE}.}
+
 \item{parameters}{(Optional) An optional tibble of tuning parameter values
 that can be used to filter the predicted values before processing. Applies
 only to \code{tune_results} objects.}
@@ -54,6 +60,47 @@ calculate the new probabilities.}
 Uses a Multinomial calibration model to calculate new probabilities
 }
 \details{
-It uses the \code{multinom} function, from the \code{nnet} package, to
-create the calibration.
+When \code{smooth = FALSE}, the \code{\link[nnet:multinom]{nnet::multinom()}} function is used
+to estimate the model; otherwise \code{\link[mgcv:gam]{mgcv::gam()}} is used.
+}
+\examples{
+\dontshow{if (!probably:::is_cran_check() & rlang::is_installed(c("modeldata", "parsnip", "randomForest"))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+library(modeldata)
+library(parsnip)
+library(dplyr)
+
+f <-
+  list(
+    ~ -0.5 + 0.6 * abs(A),
+    ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, -2),
+    ~ -0.6 * A + 0.50 * B - A * B
+  )
+
+set.seed(1)
+tr_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+cal_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+te_dat <- sim_multinomial(500, eqn_1 = f[[1]], eqn_2 = f[[2]], eqn_3 = f[[3]])
+
+set.seed(2)
+rf_fit <-
+  rand_forest() \%>\%
+  set_mode("classification") \%>\%
+  set_engine("randomForest") \%>\%
+  fit(class ~ ., data = tr_dat)
+
+cal_pred <-
+  predict(rf_fit, cal_dat, type = "prob") \%>\%
+  bind_cols(cal_dat)
+te_pred <-
+  predict(rf_fit, te_dat, type = "prob") \%>\%
+  bind_cols(te_dat)
+
+cal_plot_windowed(cal_pred, truth = class, window_size = 0.1, step_size = 0.03)
+
+smoothed_mn <- cal_estimate_multinomial(cal_pred, truth = class)
+
+new_test_pred <- cal_apply(te_pred, smoothed_mn)
+
+cal_plot_windowed(new_test_pred, truth = class, window_size = 0.1, step_size = 0.03)
+\dontshow{\}) # examplesIf}
 }
diff --git a/tests/testthat/_snaps/cal-estimate.md b/tests/testthat/_snaps/cal-estimate.md
index eefd38d3..2fee933f 100644
--- a/tests/testthat/_snaps/cal-estimate.md
+++ b/tests/testthat/_snaps/cal-estimate.md
@@ -193,6 +193,23 @@
       `.pred_coyote`   ==> coyote
       `.pred_gray_fox` ==> gray_fox
 
+---
+
+    Code
+      print(sp_smth_multi)
+    Message
+
+      -- Probability Calibration
+      Method: Multinomial
+      Type: Multiclass
+      Source class: Data Frame
+      Data points: 110
+      Truth variable: `Species`
+      Estimate variables:
+      `.pred_bobcat`   ==> bobcat
+      `.pred_coyote`   ==> coyote
+      `.pred_gray_fox` ==> gray_fox
+
 # Multinomial estimates work - tune_results
 
     Code
@@ -203,14 +220,29 @@
       Method: Multinomial
       Type: Multiclass
       Source class: Tune Results
-      Data points: 2,930, split in 10 groups
-      Truth variable: `Bldg_Type`
+      Data points: 5,000, split in 10 groups
+      Truth variable: `class`
+      Estimate variables:
+      `.pred_one`   ==> one
+      `.pred_two`   ==> two
+      `.pred_three` ==> three
+
+---
+
+    Code
+      print(tl_smth_multi)
+    Message
+
+      -- Probability Calibration
+      Method: Multinomial
+      Type: Multiclass
+      Source class: Tune Results
+      Data points: 5,000, split in 10 groups
+      Truth variable: `class`
       Estimate variables:
-      `.pred_OneFam`   ==> OneFam
-      `.pred_TwoFmCon` ==> TwoFmCon
-      `.pred_Duplex`   ==> Duplex
-      `.pred_Twnhs`    ==> Twnhs
-      `.pred_TwnhsE`   ==> TwnhsE
+      `.pred_one`   ==> one
+      `.pred_two`   ==> two
+      `.pred_three` ==> three
 
 # Linear estimates work - data.frame
 
@@ -228,7 +260,7 @@
     Code
-      print(tl_logistic)
+      print(tl_linear)
     Message
 
       -- Regression Calibration
diff --git a/tests/testthat/_snaps/cal-validate.md b/tests/testthat/_snaps/cal-validate.md
index c1f5c3da..9ef60a34 100644
--- a/tests/testthat/_snaps/cal-validate.md
+++ b/tests/testthat/_snaps/cal-validate.md
@@ -5,19 +5,18 @@
     Output
       # 10-fold cross-validation
       # A tibble: 10 x 6
-        splits id     calibration validation stats_after stats_~1
-
-       1 Fold01
-       2 Fold02
-       3 Fold03
-       4 Fold04
-       5 Fold05
-       6 Fold06
-       7 Fold07
-       8 Fold08
-       9 Fold09
-      10 Fold10
-      # ... with abbreviated variable name 1: stats_before
+        splits id     calibration validation stats_after stats_before
+
+       1 Fold01
+       2 Fold02
+       3 Fold03
+       4 Fold04
+       5 Fold05
+       6 Fold06
+       7 Fold07
+       8 Fold08
+       9 Fold09
+      10 Fold10
 
 # Logistic validation with `fit_resamples`
 
diff --git a/tests/testthat/cal_files/multiclass_ames.rds b/tests/testthat/cal_files/multiclass_ames.rds
index a3126323..821e2688 100644
Binary files a/tests/testthat/cal_files/multiclass_ames.rds and b/tests/testthat/cal_files/multiclass_ames.rds differ
diff --git a/tests/testthat/cal_files/sim_multi.rds b/tests/testthat/cal_files/sim_multi.rds
deleted file mode 100644
index 0fb341a7..00000000
Binary files a/tests/testthat/cal_files/sim_multi.rds and /dev/null differ
diff --git a/tests/testthat/helper-cal.R b/tests/testthat/helper-cal.R
index da4de101..83655164 100644
--- a/tests/testthat/helper-cal.R
+++ b/tests/testthat/helper-cal.R
@@ -72,18 +72,17 @@ testthat_cal_multiclass <- function() {
 
     set.seed(111)
 
-    df <- modeldata::ames %>%
-      dplyr::sample_frac(0.1)
+    df <- sim_multinom_df(500)
 
     ranger_recipe <- recipes::recipe(
-      formula = Bldg_Type ~ .,
+      formula = class ~ .,
       data = df
    )
 
     ranger_spec <- parsnip::rand_forest(
       mtry = tune(),
       min_n = tune(),
-      trees = 1000
+      trees = 200
     ) %>%
       parsnip::set_mode("classification") %>%
       parsnip::set_engine("ranger")
@@ -122,7 +121,7 @@ testthat_cal_sim_multi <- function() {
   }
 
   set.seed(1)
-  train <- sim_multinom_df()
+  train <- sim_multinom_df(200)
   test <- sim_multinom_df()
 
   model <- randomForest::randomForest(class ~ ., train)
diff --git a/tests/testthat/test-cal-estimate.R b/tests/testthat/test-cal-estimate.R
index f54ebcdf..bfdf0f6a 100644
--- a/tests/testthat/test-cal-estimate.R
+++ b/tests/testthat/test-cal-estimate.R
@@ -114,15 +114,21 @@ test_that("Beta estimates work - tune_results", {
 # ------------------------------ Multinomial -----------------------------------
 
 test_that("Multinomial estimates work - data.frame", {
-  sp_multi <- cal_estimate_multinomial(species_probs, Species)
+  sp_multi <- cal_estimate_multinomial(species_probs, Species, smooth = FALSE)
   expect_cal_type(sp_multi, "multiclass")
   expect_cal_method(sp_multi, "Multinomial")
   expect_cal_rows(sp_multi, n = 110)
   expect_snapshot(print(sp_multi))
+
+  sp_smth_multi <- cal_estimate_multinomial(species_probs, Species, smooth = TRUE)
+  expect_cal_type(sp_smth_multi, "multiclass")
+  expect_cal_method(sp_smth_multi, "Multinomial")
+  expect_cal_rows(sp_smth_multi, n = 110)
+  expect_snapshot(print(sp_smth_multi))
 })
 
 test_that("Multinomial estimates work - tune_results", {
-  tl_multi <- cal_estimate_multinomial(testthat_cal_multiclass())
+  tl_multi <- cal_estimate_multinomial(testthat_cal_multiclass(), smooth = FALSE)
   expect_cal_type(tl_multi, "multiclass")
   expect_cal_method(tl_multi, "Multinomial")
   expect_snapshot(print(tl_multi))
@@ -135,6 +141,20 @@ test_that("Multinomial estimates work - tune_results", {
       cal_apply(tl_multi) %>%
       nrow()
   )
+
+  tl_smth_multi <- cal_estimate_multinomial(testthat_cal_multiclass(), smooth = TRUE)
+  expect_cal_type(tl_smth_multi, "multiclass")
+  expect_cal_method(tl_smth_multi, "Multinomial")
+  expect_snapshot(print(tl_smth_multi))
+
+  expect_equal(
+    testthat_cal_multiclass() %>%
+      tune::collect_predictions(summarize = TRUE) %>%
+      nrow(),
+    testthat_cal_multiclass() %>%
+      cal_apply(tl_smth_multi) %>%
+      nrow()
+  )
 })
 
 test_that("Passing a binary outcome causes error", {
@@ -154,11 +174,11 @@ test_that("Linear estimates work - data.frame", {
 })
 
 test_that("Linear estimates work - tune_results", {
-  tl_logistic <- cal_estimate_linear(testthat_cal_reg(), outcome, smooth = FALSE)
-  expect_cal_type(tl_logistic, "regression")
-  expect_cal_method(tl_logistic, "Linear")
-  expect_cal_estimate(tl_logistic, "butchered_glm")
-  expect_snapshot(print(tl_logistic))
+  tl_linear <- cal_estimate_linear(testthat_cal_reg(), outcome, smooth = FALSE)
+  expect_cal_type(tl_linear, "regression")
+  expect_cal_method(tl_linear, "Linear")
+  expect_cal_estimate(tl_linear, "butchered_glm")
+  expect_snapshot(print(tl_linear))
 })
 
 # ----------------------------- Linear Spline --------------------------------
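
Reviewer note, not part of the patch: for anyone who wants to exercise the `smooth = TRUE` mechanics outside the package, the sketch below reproduces the mgcv multinomial-GAM pattern that `cal_multinom_impl_single()` assembles programmatically above. The data frame and the column names `p1`, `p2`, and `y` are hypothetical stand-ins for the calibration columns the package derives at run time.

# Standalone sketch of the smooth = TRUE fitting pattern; all names here are
# illustrative, not package API.
set.seed(3)
n <- 300
df <- data.frame(p1 = runif(n), p2 = runif(n))

# mgcv's multinomial family wants the outcome as zero-based integers
# 0, 1, ..., K -- the reason cal_multinom_impl_single() shifts the factor
# with as.numeric(.) - 1 before fitting.
df$y <- sample(0:2, n, replace = TRUE)

# One formula per linear predictor: K formulas for K + 1 classes, and only
# the first formula carries a left-hand side, mirroring all_f in the patch.
fms <- list(
  y ~ s(p1) + s(p2),
    ~ s(p1) + s(p2)
)

# K is the number of classes minus one (three classes -> K = 2); this
# corresponds to family = mgcv::multinom(max_int) above.
fit <- mgcv::gam(fms, data = df, family = mgcv::multinom(K = 2))

# type = "response" yields an n x 3 matrix of class probabilities -- hence
# the prob_type switch in apply_multi_predict(). The matrix comes back
# without class names, which is what the new colnames(preds) assignment
# in cal-apply-multi.R handles.
probs <- predict(fit, newdata = df, type = "response")
stopifnot(all(abs(rowSums(probs) - 1) < 1e-6))

Fitting a purely random outcome here only demonstrates the mechanics: the smooths should shrink toward flat functions and the predicted probabilities should hover near the class frequencies of roughly 1/3.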