From d24260fdad61df58588fe9e9f05337ae8061b31a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 12 Feb 2025 19:43:07 -0600 Subject: [PATCH] [R-package] require lgb.Dataset, remove support for passing 'colnames' and 'categorical_feature' for lgb.train() and lgb.cv() (#6714) --- R-package/R/callback.R | 2 +- R-package/R/lgb.Dataset.R | 2 +- R-package/R/lgb.convert_with_rules.R | 2 +- R-package/R/lgb.cv.R | 63 +-------------------- R-package/R/lgb.train.R | 36 +----------- R-package/R/utils.R | 24 ++------ R-package/man/lgb.cv.Rd | 20 ------- R-package/man/lgb.train.Rd | 14 ----- R-package/src/install.libs.R | 2 +- R-package/tests/testthat/test_basic.R | 4 +- R-package/tests/testthat/test_lgb.Booster.R | 6 +- R-package/tests/testthat/test_parameters.R | 2 +- build_r.R | 6 +- 13 files changed, 20 insertions(+), 163 deletions(-) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index c436409ddafb..ba3f742233b2 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -67,7 +67,7 @@ CB_ENV <- R6::R6Class( } - return(paste0(msg, collapse = " ")) + return(paste(msg, collapse = " ")) } diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index 45dd01bd5045..a0c18f6cfd75 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -457,7 +457,7 @@ Dataset <- R6::R6Class( if (!.is_null_handle(x = private$handle)) { # Merge names with tab separation - merged_name <- paste0(as.list(private$colnames), collapse = "\t") + merged_name <- paste(as.list(private$colnames), collapse = "\t") .Call( LGBM_DatasetSetFeatureNames_R , private$handle diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R index f024e9dfe6e9..1d2748ac1f38 100644 --- a/R-package/R/lgb.convert_with_rules.R +++ b/R-package/R/lgb.convert_with_rules.R @@ -5,7 +5,7 @@ vapply( X = df , FUN = function(x) { - paste0(class(x), collapse = ",") + paste(class(x), collapse = ",") } , FUN.VALUE = character(1L) ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 638d1c628e12..1f6fcf6cd84d 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -25,8 +25,6 @@ CVBooster <- R6::R6Class( #' @description Cross validation logic used by LightGBM #' @inheritParams lgb_shared_params #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. -#' @param label Deprecated. See "Deprecated Arguments" section below. -#' @param weight Deprecated. See "Deprecated Arguments" section below. #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param showsd \code{boolean}, whether to show standard deviation of cross validation. #' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a @@ -36,8 +34,6 @@ CVBooster <- R6::R6Class( #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. -#' @param colnames Deprecated. See "Deprecated Arguments" section below. -#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below. #' @param callbacks List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model #' into a predictor model which frees up memory and the original datasets @@ -69,20 +65,12 @@ CVBooster <- R6::R6Class( #' ) #' } #' -#' @section Deprecated Arguments: -#' -#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset} -#' to argument \code{'data'}. It will also remove support for passing arguments -#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}. -#' #' @importFrom data.table data.table setorderv #' @export lgb.cv <- function(params = list() , data , nrounds = 100L , nfold = 3L - , label = NULL - , weight = NULL , obj = NULL , eval = NULL , verbose = 1L @@ -92,8 +80,6 @@ lgb.cv <- function(params = list() , stratified = TRUE , folds = NULL , init_model = NULL - , colnames = NULL - , categorical_feature = NULL , early_stopping_rounds = NULL , callbacks = list() , reset_data = FALSE @@ -104,33 +90,8 @@ lgb.cv <- function(params = list() if (nrounds <= 0L) { stop("nrounds should be greater than zero") } - - # If 'data' is not an lgb.Dataset, try to construct one using 'label' if (!.is_Dataset(x = data)) { - warning(paste0( - "Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. " - , "Either pass an lgb.Dataset object, or use lightgbm()." - )) - if (is.null(label)) { - stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") - } - data <- lgb.Dataset(data = data, label = label) - } - - # raise deprecation warnings if necessary - # ref: https://github.com/microsoft/LightGBM/issues/6435 - args <- names(match.call()) - if ("categorical_feature" %in% args) { - .emit_dataset_kwarg_warning("categorical_feature", "lgb.cv") - } - if ("colnames" %in% args) { - .emit_dataset_kwarg_warning("colnames", "lgb.cv") - } - if ("label" %in% args) { - .emit_dataset_kwarg_warning("label", "lgb.cv") - } - if ("weight" %in% args) { - .emit_dataset_kwarg_warning("weight", "lgb.cv") + stop("lgb.cv: data must be an lgb.Dataset instance") } # set some parameters, resolving the way they were passed in with other parameters @@ -214,37 +175,17 @@ lgb.cv <- function(params = list() data$construct() # Check interaction constraints - cnames <- NULL - if (!is.null(colnames)) { - cnames <- colnames - } else if (!is.null(data$get_colnames())) { - cnames <- data$get_colnames() - } params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints - , column_names = cnames + , column_names = data$get_colnames() ) - if (!is.null(weight)) { - data$set_field(field_name = "weight", data = weight) - } - # Update parameters with parsed parameters data$update_params(params = params) # Create the predictor set data$.__enclos_env__$private$set_predictor(predictor = predictor) - # Write column names - if (!is.null(colnames)) { - data$set_colnames(colnames = colnames) - } - - # Write categorical features - if (!is.null(categorical_feature)) { - data$set_categorical_feature(categorical_feature = categorical_feature) - } - if (!is.null(folds)) { # Check for list of folds or for single value diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 4d994cfc6f04..4b16cac22515 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -6,8 +6,6 @@ #' @inheritParams lgb_shared_params #' @param valids a list of \code{lgb.Dataset} objects, used for validation #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} -#' @param colnames Deprecated. See "Deprecated Arguments" section below. -#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below. #' @param callbacks List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the #' booster model into a predictor model which frees up memory and the @@ -42,12 +40,6 @@ #' ) #' } #' -#' @section Deprecated Arguments: -#' -#' A future release of \code{lightgbm} will remove support for passing arguments -#' \code{'categorical_feature'} and \code{'colnames'}. Pass those things to -#' \code{lgb.Dataset} instead. -#' #' @export lgb.train <- function(params = list(), data, @@ -59,8 +51,6 @@ lgb.train <- function(params = list(), record = TRUE, eval_freq = 1L, init_model = NULL, - colnames = NULL, - categorical_feature = NULL, early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE, @@ -83,16 +73,6 @@ lgb.train <- function(params = list(), } } - # raise deprecation warnings if necessary - # ref: https://github.com/microsoft/LightGBM/issues/6435 - args <- names(match.call()) - if ("categorical_feature" %in% args) { - .emit_dataset_kwarg_warning("categorical_feature", "lgb.train") - } - if ("colnames" %in% args) { - .emit_dataset_kwarg_warning("colnames", "lgb.train") - } - # set some parameters, resolving the way they were passed in with other parameters # in `params`. # this ensures that the model stored with Booster$save() correctly represents @@ -171,21 +151,12 @@ lgb.train <- function(params = list(), # Construct datasets, if needed data$update_params(params = params) - if (!is.null(categorical_feature)) { - data$set_categorical_feature(categorical_feature) - } data$construct() # Check interaction constraints - cnames <- NULL - if (!is.null(colnames)) { - cnames <- colnames - } else if (!is.null(data$get_colnames())) { - cnames <- data$get_colnames() - } params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints - , column_names = cnames + , column_names = data$get_colnames() ) # Update parameters with parsed parameters @@ -194,11 +165,6 @@ lgb.train <- function(params = list(), # Create the predictor set data$.__enclos_env__$private$set_predictor(predictor) - # Write column names - if (!is.null(colnames)) { - data$set_colnames(colnames) - } - valid_contain_train <- FALSE train_data_name <- "train" reduced_valid_sets <- list() diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9fbdba778cc4..321feb60bb30 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -34,7 +34,7 @@ # If a parameter has multiple values, join those values together with commas. # trimws() is necessary because format() will pad to make strings the same width - val <- paste0( + val <- paste( trimws( format( x = unname(params[[i]]) @@ -46,7 +46,7 @@ if (nchar(val) <= 0L) next # Skip join # Join key value - pair <- paste0(c(param_names[[i]], val), collapse = "=") + pair <- paste(c(param_names[[i]], val), collapse = "=") ret <- c(ret, pair) } @@ -55,7 +55,7 @@ return("") } - return(paste0(ret, collapse = " ")) + return(paste(ret, collapse = " ")) } @@ -115,7 +115,7 @@ # Turn indices 0-based and convert to string for (j in seq_along(interaction_constraints)) { interaction_constraints[[j]] <- paste0( - "[", paste0(interaction_constraints[[j]] - 1L, collapse = ","), "]" + "[", paste(interaction_constraints[[j]] - 1L, collapse = ","), "]" ) } return(interaction_constraints) @@ -258,19 +258,3 @@ return(a == b) } } - -# ref: https://github.com/microsoft/LightGBM/issues/6435 -.emit_dataset_kwarg_warning <- function(calling_function, argname) { - msg <- sprintf( - paste0( - "Argument '%s' to %s() is deprecated and will be removed in a future release. " - , "Set '%s' with lgb.Dataset() instead. " - , "See https://github.com/microsoft/LightGBM/issues/6435." - ) - , argname - , calling_function - , argname - ) - warning(msg) - return(invisible(NULL)) -} diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index cee059d494ca..0fb0363c092e 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -9,8 +9,6 @@ lgb.cv( data, nrounds = 100L, nfold = 3L, - label = NULL, - weight = NULL, obj = NULL, eval = NULL, verbose = 1L, @@ -20,8 +18,6 @@ lgb.cv( stratified = TRUE, folds = NULL, init_model = NULL, - colnames = NULL, - categorical_feature = NULL, early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE, @@ -41,10 +37,6 @@ may allow you to pass other types of data like \code{matrix} and then separately \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} -\item{label}{Deprecated. See "Deprecated Arguments" section below.} - -\item{weight}{Deprecated. See "Deprecated Arguments" section below.} - \item{obj}{objective function, can be character or custom objective function. Examples include \code{regression}, \code{regression_l1}, \code{huber}, \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} @@ -103,10 +95,6 @@ the \code{nfold} and \code{stratified} parameters are ignored.} \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model} -\item{colnames}{Deprecated. See "Deprecated Arguments" section below.} - -\item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.} - \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, training will stop if the evaluation of any metric on any validation set fails to improve for \code{early_stopping_rounds} consecutive boosting rounds. @@ -131,14 +119,6 @@ a trained model \code{lgb.CVBooster}. \description{ Cross validation logic used by LightGBM } -\section{Deprecated Arguments}{ - - -A future release of \code{lightgbm} will require passing an \code{lgb.Dataset} -to argument \code{'data'}. It will also remove support for passing arguments -\code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}. -} - \section{Early Stopping}{ diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index ebbfc206998e..1f6edfa25ba3 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -15,8 +15,6 @@ lgb.train( record = TRUE, eval_freq = 1L, init_model = NULL, - colnames = NULL, - categorical_feature = NULL, early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE, @@ -82,10 +80,6 @@ printing of evaluation during training} \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model} -\item{colnames}{Deprecated. See "Deprecated Arguments" section below.} - -\item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.} - \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, training will stop if the evaluation of any metric on any validation set fails to improve for \code{early_stopping_rounds} consecutive boosting rounds. @@ -109,14 +103,6 @@ Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}}, this function is focused on performance (e.g. speed, memory efficiency). It is also less likely to have breaking API changes in new releases than \code{\link{lightgbm}}. } -\section{Deprecated Arguments}{ - - -A future release of \code{lightgbm} will remove support for passing arguments -\code{'categorical_feature'} and \code{'colnames'}. Pass those things to -\code{lgb.Dataset} instead. -} - \section{Early Stopping}{ diff --git a/R-package/src/install.libs.R b/R-package/src/install.libs.R index c622de927ac5..fa61d014a50e 100644 --- a/R-package/src/install.libs.R +++ b/R-package/src/install.libs.R @@ -51,7 +51,7 @@ inst_dir <- file.path(R_PACKAGE_SOURCE, "inst", fsep = "/") , "make this faster." )) } - cmd <- paste0(cmd, " ", paste0(args, collapse = " ")) + cmd <- paste0(cmd, " ", paste(args, collapse = " ")) exit_code <- system(cmd) } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 06d35a146d66..cb43ba613be9 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { } }) -test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", { +test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset", { bad_values <- list( 4L , "hello" @@ -454,7 +454,7 @@ test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset , 10L , nfold = 5L ) - }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) + }, regexp = "lgb.cv: data must be an lgb.Dataset instance", fixed = TRUE) } }) diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 9197fd7226af..a770b28c415a 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -888,7 +888,7 @@ test_that("Saving a model with different feature importance types works", { .feat_importance_from_string <- function(model_string) { file_lines <- strsplit(model_string, "\n", fixed = TRUE)[[1L]] - start_indx <- which(grepl("^feature_importances\\:$", file_lines)) + 1L + start_indx <- which(file_lines == "feature_importances:") + 1L blank_line_indices <- which(file_lines == "") end_indx <- blank_line_indices[blank_line_indices > start_indx][1L] - 1L importances <- file_lines[start_indx: end_indx] @@ -955,7 +955,7 @@ test_that("Saving a model with unknown importance type fails", { .params_from_model_string <- function(model_str) { file_lines <- strsplit(model_str, "\n", fixed = TRUE)[[1L]] - start_indx <- which(grepl("^parameters\\:$", file_lines)) + 1L + start_indx <- which(file_lines == "parameters:") + 1L blank_line_indices <- which(file_lines == "") end_indx <- blank_line_indices[blank_line_indices > start_indx][1L] - 1L params <- file_lines[start_indx: end_indx] @@ -1532,7 +1532,7 @@ test_that("Booster's print, show, and summary work correctly", { } .has_expected_content_for_finalized_model <- function(printed_txt) { - expect_true(any(grepl("^LightGBM Model$", printed_txt))) + expect_true(any(printed_txt == "LightGBM Model")) expect_true(any(grepl("Booster handle is invalid", printed_txt, fixed = TRUE))) } diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R index 9949ffe646b9..2e3aaa3799c3 100644 --- a/R-package/tests/testthat/test_parameters.R +++ b/R-package/tests/testthat/test_parameters.R @@ -18,7 +18,7 @@ test_that("Feature penalties work properly", { num_leaves = 5L , learning_rate = 0.05 , objective = "binary" - , feature_penalty = paste0(feature_penalties, collapse = ",") + , feature_penalty = paste(feature_penalties, collapse = ",") , metric = "binary_error" , num_threads = .LGB_MAX_THREADS ) diff --git a/build_r.R b/build_r.R index 1d824d60bbba..a680f0a730b2 100644 --- a/build_r.R +++ b/build_r.R @@ -121,7 +121,7 @@ if (length(parsed_args[["make_args"]]) > 0L) { pattern = "make_args_from_build_script <- character(0L)" , replacement = paste0( "make_args_from_build_script <- c(\"" - , paste0(parsed_args[["make_args"]], collapse = "\", \"") + , paste(parsed_args[["make_args"]], collapse = "\", \"") , "\")" ) , x = install_libs_content @@ -167,7 +167,7 @@ if (length(parsed_args[["make_args"]]) > 0L) { , "make this faster." )) } - cmd <- paste0(cmd, " ", paste0(args, collapse = " ")) + cmd <- paste0(cmd, " ", paste(args, collapse = " ")) exit_code <- system(cmd) } @@ -426,6 +426,6 @@ install_args <- c("CMD", "INSTALL", "--no-multiarch", "--with-keep.source", tarb if (INSTALL_AFTER_BUILD) { .run_shell_command(install_cmd, install_args) } else { - cmd <- paste0(install_cmd, " ", paste0(install_args, collapse = " ")) + cmd <- paste0(install_cmd, " ", paste(install_args, collapse = " ")) print(sprintf("Skipping installation. Install the package with command '%s'", cmd)) }