From a8d9c370e6a02ff83f6ad943c6c5f06fd601a3fa Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 16 Jun 2023 15:02:09 -0700 Subject: [PATCH 1/5] have pca steps return data unchanged if `num_comp = 0` --- R/pca_sparse.R | 2 +- R/pca_sparse_bayes.R | 4 ++-- R/pca_truncated.R | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pca_sparse.R b/R/pca_sparse.R index e6b2672..65f0d92 100644 --- a/R/pca_sparse.R +++ b/R/pca_sparse.R @@ -140,7 +140,7 @@ step_pca_sparse_new <- prep.step_pca_sparse <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) - if (length(col_names) > 0) { + if (length(col_names) > 0 && x$num_comp > 0) { check_type(training[, col_names], types = c("double", "integer")) p <- length(col_names) diff --git a/R/pca_sparse_bayes.R b/R/pca_sparse_bayes.R index 332d97b..bad45eb 100644 --- a/R/pca_sparse_bayes.R +++ b/R/pca_sparse_bayes.R @@ -193,12 +193,12 @@ prep.step_pca_sparse_bayes <- function(x, training, info = NULL, ...) { ) res <- rlang::eval_tidy(cl) rotation <- svd(res$loadings)$u + rownames(rotation) <- col_names + colnames(rotation) <- names0(x$num_comp, prefix = x$prefix) } else { # fake a rotation matrix so that the resolved names can be used for tidy() rotation <- matrix(NA, nrow = length(col_names), ncol = p) } - rownames(rotation) <- col_names - colnames(rotation) <- names0(x$num_comp, prefix = x$prefix) } else { rotation <- NA } diff --git a/R/pca_truncated.R b/R/pca_truncated.R index e14df80..b606f33 100644 --- a/R/pca_truncated.R +++ b/R/pca_truncated.R @@ -148,12 +148,13 @@ prep.step_pca_truncated <- function(x, training, info = NULL, ...) { } else { prc_obj <- recipes::pca_wts(training[, col_names, drop = FALSE], wts = wts) } + rownames(prc_obj$rotation) <- col_names } else { prc_obj <- NULL prc_obj$rotation <- matrix(nrow = 0, ncol = 0) } - rownames(prc_obj$rotation) <- col_names + if (is.null(prc_obj$center)) { prc_obj$center <- FALSE From cd90bd552df470823565773466fa908688f831d1 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 16 Jun 2023 15:02:17 -0700 Subject: [PATCH 2/5] test that pca steps return data unchanged if `num_comp = 0` --- tests/testthat/test-pca_sparse.R | 14 ++++++++++++++ tests/testthat/test-pca_sparse_bayes.R | 14 ++++++++++++++ tests/testthat/test-pca_truncated.R | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/tests/testthat/test-pca_sparse.R b/tests/testthat/test-pca_sparse.R index 8c03de1..573405d 100644 --- a/tests/testthat/test-pca_sparse.R +++ b/tests/testthat/test-pca_sparse.R @@ -93,6 +93,20 @@ test_that("tunable", { ) }) +test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { + # https://github.com/tidymodels/recipes/issues/1152 + skip_if_not_installed("irlba") + + rec <- recipe(carb ~ ., data = mtcars) %>% + step_pca_sparse(all_predictors(), + num_comp = 0, keep_original_cols = FALSE) %>% + prep() + + res <- bake(rec, new_data = NULL) + + expect_identical(res, tibble::as_tibble(mtcars)) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { diff --git a/tests/testthat/test-pca_sparse_bayes.R b/tests/testthat/test-pca_sparse_bayes.R index e8d9e46..ebd1f04 100644 --- a/tests/testthat/test-pca_sparse_bayes.R +++ b/tests/testthat/test-pca_sparse_bayes.R @@ -102,6 +102,20 @@ test_that("tunable", { ) }) +test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { + # https://github.com/tidymodels/recipes/issues/1152 + skip_if_not_installed("irlba") + + rec <- recipe(carb ~ ., data = mtcars) %>% + step_pca_sparse_bayes(all_predictors(), + num_comp = 0, keep_original_cols = FALSE) %>% + prep() + + res <- bake(rec, new_data = NULL) + + expect_identical(res, tibble::as_tibble(mtcars)) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { diff --git a/tests/testthat/test-pca_truncated.R b/tests/testthat/test-pca_truncated.R index 27ec9cc..15c06f0 100644 --- a/tests/testthat/test-pca_truncated.R +++ b/tests/testthat/test-pca_truncated.R @@ -71,6 +71,20 @@ test_that("check_name() is used", { ) }) +test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { + # https://github.com/tidymodels/recipes/issues/1152 + skip_if_not_installed("irlba") + + rec <- recipe(carb ~ ., data = mtcars) %>% + step_pca_truncated(all_predictors(), + num_comp = 0, keep_original_cols = FALSE) %>% + prep() + + res <- bake(rec, new_data = NULL) + + expect_identical(res, tibble::as_tibble(mtcars)) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { From 9c7666b1dc503e3b4f0baef9c07b7b7634550831 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 16 Jun 2023 15:06:48 -0700 Subject: [PATCH 3/5] add news bullet --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index fe3f67d..c5e15a3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # embed (development version) +* `step_pca_sparse()`, `step_pca_truncated()` and `step_pca_sparse_bayes()` now returns data unaltered if `num_comp = 0`. This is done to be consistent with recipes steps of the same nature. + # embed 1.1.1 ## Bug Fixes From 88d0ec75dce802e1236433f8cde8e3734b248802 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 16 Jun 2023 15:08:01 -0700 Subject: [PATCH 4/5] have all pca steps inheritParams from recipes::step_pca() --- R/pca_sparse.R | 4 +--- R/pca_sparse_bayes.R | 5 +---- man/step_pca_sparse.Rd | 8 +++++--- man/step_pca_sparse_bayes.Rd | 9 +++++---- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/R/pca_sparse.R b/R/pca_sparse.R index 65f0d92..9106d4f 100644 --- a/R/pca_sparse.R +++ b/R/pca_sparse.R @@ -5,6 +5,7 @@ #' zero coefficients. #' #' @inheritParams step_lencode_bayes +#' @inheritParams recipes::step_pca #' @inherit step_lencode_bayes return #' @param ... One or more selector functions to choose which variables will be #' used to compute the components. See [selections()] for more details. For @@ -13,9 +14,6 @@ #' they be assigned? By default, the function assumes that the new principal #' component columns created by the original variables will be used as #' predictors in a model. -#' @param num_comp The number of PCA components to retain as new predictors. If -#' `num_comp` is greater than the number of columns or the number of possible -#' components, a smaller value will be used. #' @param predictor_prop The maximum number of original predictors that can have #' non-zero coefficients for each PCA component (via regularization). #' @param keep_original_cols A logical to keep the original variables in the diff --git a/R/pca_sparse_bayes.R b/R/pca_sparse_bayes.R index bad45eb..14ed072 100644 --- a/R/pca_sparse_bayes.R +++ b/R/pca_sparse_bayes.R @@ -5,6 +5,7 @@ #' some zero coefficients. #' #' @inheritParams step_lencode_bayes +#' @inheritParams recipes::step_pca #' @inherit step_lencode_bayes return #' @param ... One or more selector functions to choose which variables will be #' used to compute the components. See [selections()] for more details. For @@ -13,10 +14,6 @@ #' they be assigned? By default, the function assumes that the new principal #' component columns created by the original variables will be used as #' predictors in a model. -#' @param num_comp The number of PCA components to retain as new predictors. If -#' `num_comp` is greater than the number of columns or the number of possible -#' components, a smaller value will be used. A value of zero indicates that -#' PCA will _not_ be used on the data. #' @param prior_slab_dispersion This value is proportional to the dispersion (or #' scale) parameter for the slab portion of the prior. Smaller values result #' in an increase in zero coefficients. diff --git a/man/step_pca_sparse.Rd b/man/step_pca_sparse.Rd index a96c3db..7bd1b16 100644 --- a/man/step_pca_sparse.Rd +++ b/man/step_pca_sparse.Rd @@ -35,9 +35,11 @@ predictors in a model.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{num_comp}{The number of PCA components to retain as new predictors. If -\code{num_comp} is greater than the number of columns or the number of possible -components, a smaller value will be used.} +\item{num_comp}{The number of components to retain as new predictors. +If \code{num_comp} is greater than the number of columns or the number of +possible components, a smaller value will be used. If \code{num_comp = 0} +is set then no transformation is done and selected variables will +stay unchanged.} \item{predictor_prop}{The maximum number of original predictors that can have non-zero coefficients for each PCA component (via regularization).} diff --git a/man/step_pca_sparse_bayes.Rd b/man/step_pca_sparse_bayes.Rd index 3b07442..adc7c86 100644 --- a/man/step_pca_sparse_bayes.Rd +++ b/man/step_pca_sparse_bayes.Rd @@ -36,10 +36,11 @@ predictors in a model.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{num_comp}{The number of PCA components to retain as new predictors. If -\code{num_comp} is greater than the number of columns or the number of possible -components, a smaller value will be used. A value of zero indicates that -PCA will \emph{not} be used on the data.} +\item{num_comp}{The number of components to retain as new predictors. +If \code{num_comp} is greater than the number of columns or the number of +possible components, a smaller value will be used. If \code{num_comp = 0} +is set then no transformation is done and selected variables will +stay unchanged.} \item{prior_slab_dispersion}{This value is proportional to the dispersion (or scale) parameter for the slab portion of the prior. Smaller values result From df76b33d390c78ce99030e1c696c79bb54eae264 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 16 Jun 2023 15:09:44 -0700 Subject: [PATCH 5/5] don't need skip_if_not_installed --- tests/testthat/test-pca_sparse.R | 1 - tests/testthat/test-pca_sparse_bayes.R | 3 +-- tests/testthat/test-pca_truncated.R | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/testthat/test-pca_sparse.R b/tests/testthat/test-pca_sparse.R index 573405d..8b31782 100644 --- a/tests/testthat/test-pca_sparse.R +++ b/tests/testthat/test-pca_sparse.R @@ -95,7 +95,6 @@ test_that("tunable", { test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { # https://github.com/tidymodels/recipes/issues/1152 - skip_if_not_installed("irlba") rec <- recipe(carb ~ ., data = mtcars) %>% step_pca_sparse(all_predictors(), diff --git a/tests/testthat/test-pca_sparse_bayes.R b/tests/testthat/test-pca_sparse_bayes.R index ebd1f04..919ff19 100644 --- a/tests/testthat/test-pca_sparse_bayes.R +++ b/tests/testthat/test-pca_sparse_bayes.R @@ -104,8 +104,7 @@ test_that("tunable", { test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { # https://github.com/tidymodels/recipes/issues/1152 - skip_if_not_installed("irlba") - + rec <- recipe(carb ~ ., data = mtcars) %>% step_pca_sparse_bayes(all_predictors(), num_comp = 0, keep_original_cols = FALSE) %>% diff --git a/tests/testthat/test-pca_truncated.R b/tests/testthat/test-pca_truncated.R index 15c06f0..b0dc49d 100644 --- a/tests/testthat/test-pca_truncated.R +++ b/tests/testthat/test-pca_truncated.R @@ -73,7 +73,6 @@ test_that("check_name() is used", { test_that("Do nothing for num_comps = 0 and keep_original_cols = FALSE", { # https://github.com/tidymodels/recipes/issues/1152 - skip_if_not_installed("irlba") rec <- recipe(carb ~ ., data = mtcars) %>% step_pca_truncated(all_predictors(),