From d24260fdad61df58588fe9e9f05337ae8061b31a Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Wed, 12 Feb 2025 19:43:07 -0600
Subject: [PATCH] [R-package] require lgb.Dataset, remove support for passing
 'colnames' and 'categorical_feature' for lgb.train() and lgb.cv() (#6714)

---
 R-package/R/callback.R                      |  2 +-
 R-package/R/lgb.Dataset.R                   |  2 +-
 R-package/R/lgb.convert_with_rules.R        |  2 +-
 R-package/R/lgb.cv.R                        | 63 +--------------------
 R-package/R/lgb.train.R                     | 36 +-----------
 R-package/R/utils.R                         | 24 ++------
 R-package/man/lgb.cv.Rd                     | 20 -------
 R-package/man/lgb.train.Rd                  | 14 -----
 R-package/src/install.libs.R                |  2 +-
 R-package/tests/testthat/test_basic.R       |  4 +-
 R-package/tests/testthat/test_lgb.Booster.R |  6 +-
 R-package/tests/testthat/test_parameters.R  |  2 +-
 build_r.R                                   |  6 +-
 13 files changed, 20 insertions(+), 163 deletions(-)

diff --git a/R-package/R/callback.R b/R-package/R/callback.R
index c436409ddafb..ba3f742233b2 100644
--- a/R-package/R/callback.R
+++ b/R-package/R/callback.R
@@ -67,7 +67,7 @@ CB_ENV <- R6::R6Class(
 
   }
 
-  return(paste0(msg, collapse = "  "))
+  return(paste(msg, collapse = "  "))
 
 }
 
diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
index 45dd01bd5045..a0c18f6cfd75 100644
--- a/R-package/R/lgb.Dataset.R
+++ b/R-package/R/lgb.Dataset.R
@@ -457,7 +457,7 @@ Dataset <- R6::R6Class(
       if (!.is_null_handle(x = private$handle)) {
 
         # Merge names with tab separation
-        merged_name <- paste0(as.list(private$colnames), collapse = "\t")
+        merged_name <- paste(as.list(private$colnames), collapse = "\t")
         .Call(
           LGBM_DatasetSetFeatureNames_R
           , private$handle
diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R
index f024e9dfe6e9..1d2748ac1f38 100644
--- a/R-package/R/lgb.convert_with_rules.R
+++ b/R-package/R/lgb.convert_with_rules.R
@@ -5,7 +5,7 @@
         vapply(
             X = df
             , FUN = function(x) {
-                paste0(class(x), collapse = ",")
+                paste(class(x), collapse = ",")
             }
             , FUN.VALUE = character(1L)
         )
diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R
index 638d1c628e12..1f6fcf6cd84d 100644
--- a/R-package/R/lgb.cv.R
+++ b/R-package/R/lgb.cv.R
@@ -25,8 +25,6 @@ CVBooster <- R6::R6Class(
 #' @description Cross validation logic used by LightGBM
 #' @inheritParams lgb_shared_params
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
-#' @param label Deprecated. See "Deprecated Arguments" section below.
-#' @param weight Deprecated. See "Deprecated Arguments" section below.
 #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
 #' @param showsd \code{boolean}, whether to show standard deviation of cross validation.
 #'               This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
@@ -36,8 +34,6 @@ CVBooster <- R6::R6Class(
 #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
 #'              (each element must be a vector of test fold's indices). When folds are supplied,
 #'              the \code{nfold} and \code{stratified} parameters are ignored.
-#' @param colnames Deprecated. See "Deprecated Arguments" section below.
-#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
 #' @param callbacks List of callback functions that are applied at each iteration.
 #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
 #'                   into a predictor model which frees up memory and the original datasets
@@ -69,20 +65,12 @@ CVBooster <- R6::R6Class(
 #' )
 #' }
 #'
-#' @section Deprecated Arguments:
-#'
-#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
-#' to argument \code{'data'}. It will also remove support for passing arguments
-#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
-#'
 #' @importFrom data.table data.table setorderv
 #' @export
 lgb.cv <- function(params = list()
                    , data
                    , nrounds = 100L
                    , nfold = 3L
-                   , label = NULL
-                   , weight = NULL
                    , obj = NULL
                    , eval = NULL
                    , verbose = 1L
@@ -92,8 +80,6 @@ lgb.cv <- function(params = list()
                    , stratified = TRUE
                    , folds = NULL
                    , init_model = NULL
-                   , colnames = NULL
-                   , categorical_feature = NULL
                    , early_stopping_rounds = NULL
                    , callbacks = list()
                    , reset_data = FALSE
@@ -104,33 +90,8 @@ lgb.cv <- function(params = list()
   if (nrounds <= 0L) {
     stop("nrounds should be greater than zero")
   }
-
-  # If 'data' is not an lgb.Dataset, try to construct one using 'label'
   if (!.is_Dataset(x = data)) {
-    warning(paste0(
-      "Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. "
-      , "Either pass an lgb.Dataset object, or use lightgbm()."
-    ))
-    if (is.null(label)) {
-      stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
-    }
-    data <- lgb.Dataset(data = data, label = label)
-  }
-
-  # raise deprecation warnings if necessary
-  # ref: https://github.com/microsoft/LightGBM/issues/6435
-  args <- names(match.call())
-  if ("categorical_feature" %in% args) {
-    .emit_dataset_kwarg_warning("categorical_feature", "lgb.cv")
-  }
-  if ("colnames" %in% args) {
-    .emit_dataset_kwarg_warning("colnames", "lgb.cv")
-  }
-  if ("label" %in% args) {
-    .emit_dataset_kwarg_warning("label", "lgb.cv")
-  }
-  if ("weight" %in% args) {
-    .emit_dataset_kwarg_warning("weight", "lgb.cv")
+    stop("lgb.cv: data must be an lgb.Dataset instance")
   }
 
   # set some parameters, resolving the way they were passed in with other parameters
@@ -214,37 +175,17 @@ lgb.cv <- function(params = list()
   data$construct()
 
   # Check interaction constraints
-  cnames <- NULL
-  if (!is.null(colnames)) {
-    cnames <- colnames
-  } else if (!is.null(data$get_colnames())) {
-    cnames <- data$get_colnames()
-  }
   params[["interaction_constraints"]] <- .check_interaction_constraints(
     interaction_constraints = interaction_constraints
-    , column_names = cnames
+    , column_names = data$get_colnames()
   )
 
-  if (!is.null(weight)) {
-    data$set_field(field_name = "weight", data = weight)
-  }
-
   # Update parameters with parsed parameters
   data$update_params(params = params)
 
   # Create the predictor set
   data$.__enclos_env__$private$set_predictor(predictor = predictor)
 
-  # Write column names
-  if (!is.null(colnames)) {
-    data$set_colnames(colnames = colnames)
-  }
-
-  # Write categorical features
-  if (!is.null(categorical_feature)) {
-    data$set_categorical_feature(categorical_feature = categorical_feature)
-  }
-
   if (!is.null(folds)) {
 
     # Check for list of folds or for single value
diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R
index 4d994cfc6f04..4b16cac22515 100644
--- a/R-package/R/lgb.train.R
+++ b/R-package/R/lgb.train.R
@@ -6,8 +6,6 @@
 #' @inheritParams lgb_shared_params
 #' @param valids a list of \code{lgb.Dataset} objects, used for validation
 #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
-#' @param colnames Deprecated. See "Deprecated Arguments" section below.
-#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
 #' @param callbacks List of callback functions that are applied at each iteration.
 #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
 #'                   booster model into a predictor model which frees up memory and the
@@ -42,12 +40,6 @@
 #' )
 #' }
 #'
-#' @section Deprecated Arguments:
-#'
-#' A future release of \code{lightgbm} will remove support for passing arguments
-#' \code{'categorical_feature'} and \code{'colnames'}. Pass those things to
-#' \code{lgb.Dataset} instead.
-#'
 #' @export
 lgb.train <- function(params = list(),
                       data,
@@ -59,8 +51,6 @@ lgb.train <- function(params = list(),
                       record = TRUE,
                       eval_freq = 1L,
                       init_model = NULL,
-                      colnames = NULL,
-                      categorical_feature = NULL,
                       early_stopping_rounds = NULL,
                       callbacks = list(),
                       reset_data = FALSE,
@@ -83,16 +73,6 @@ lgb.train <- function(params = list(),
     }
   }
 
-  # raise deprecation warnings if necessary
-  # ref: https://github.com/microsoft/LightGBM/issues/6435
-  args <- names(match.call())
-  if ("categorical_feature" %in% args) {
-    .emit_dataset_kwarg_warning("categorical_feature", "lgb.train")
-  }
-  if ("colnames" %in% args) {
-    .emit_dataset_kwarg_warning("colnames", "lgb.train")
-  }
-
   # set some parameters, resolving the way they were passed in with other parameters
   # in `params`.
   # this ensures that the model stored with Booster$save() correctly represents
@@ -171,21 +151,12 @@ lgb.train <- function(params = list(),
 
   # Construct datasets, if needed
   data$update_params(params = params)
-  if (!is.null(categorical_feature)) {
-    data$set_categorical_feature(categorical_feature)
-  }
   data$construct()
 
   # Check interaction constraints
-  cnames <- NULL
-  if (!is.null(colnames)) {
-    cnames <- colnames
-  } else if (!is.null(data$get_colnames())) {
-    cnames <- data$get_colnames()
-  }
   params[["interaction_constraints"]] <- .check_interaction_constraints(
     interaction_constraints = interaction_constraints
-    , column_names = cnames
+    , column_names = data$get_colnames()
   )
 
   # Update parameters with parsed parameters
@@ -194,11 +165,6 @@ lgb.train <- function(params = list(),
   # Create the predictor set
   data$.__enclos_env__$private$set_predictor(predictor)
 
-  # Write column names
-  if (!is.null(colnames)) {
-    data$set_colnames(colnames)
-  }
-
   valid_contain_train <- FALSE
   train_data_name <- "train"
   reduced_valid_sets <- list()
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 9fbdba778cc4..321feb60bb30 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -34,7 +34,7 @@
 
     # If a parameter has multiple values, join those values together with commas.
     # trimws() is necessary because format() will pad to make strings the same width
-    val <- paste0(
+    val <- paste(
       trimws(
         format(
           x = unname(params[[i]])
@@ -46,7 +46,7 @@
     if (nchar(val) <= 0L) next # Skip join
 
     # Join key value
-    pair <- paste0(c(param_names[[i]], val), collapse = "=")
+    pair <- paste(c(param_names[[i]], val), collapse = "=")
     ret <- c(ret, pair)
 
   }
@@ -55,7 +55,7 @@
     return("")
   }
 
-  return(paste0(ret, collapse = " "))
+  return(paste(ret, collapse = " "))
 
 }
 
@@ -115,7 +115,7 @@
   # Turn indices 0-based and convert to string
   for (j in seq_along(interaction_constraints)) {
     interaction_constraints[[j]] <- paste0(
-      "[", paste0(interaction_constraints[[j]] - 1L, collapse = ","), "]"
+      "[", paste(interaction_constraints[[j]] - 1L, collapse = ","), "]"
     )
   }
   return(interaction_constraints)
@@ -258,19 +258,3 @@
     return(a == b)
   }
 }
-
-# ref: https://github.com/microsoft/LightGBM/issues/6435
-.emit_dataset_kwarg_warning <- function(calling_function, argname) {
-  msg <- sprintf(
-    paste0(
-      "Argument '%s' to %s() is deprecated and will be removed in a future release. "
-      , "Set '%s' with lgb.Dataset() instead. "
-      , "See https://github.com/microsoft/LightGBM/issues/6435."
-    )
-    , argname
-    , calling_function
-    , argname
-  )
-  warning(msg)
-  return(invisible(NULL))
-}
diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd
index cee059d494ca..0fb0363c092e 100644
--- a/R-package/man/lgb.cv.Rd
+++ b/R-package/man/lgb.cv.Rd
@@ -9,8 +9,6 @@ lgb.cv(
   data,
   nrounds = 100L,
   nfold = 3L,
-  label = NULL,
-  weight = NULL,
   obj = NULL,
   eval = NULL,
   verbose = 1L,
@@ -20,8 +18,6 @@ lgb.cv(
   stratified = TRUE,
   folds = NULL,
   init_model = NULL,
-  colnames = NULL,
-  categorical_feature = NULL,
   early_stopping_rounds = NULL,
   callbacks = list(),
   reset_data = FALSE,
@@ -41,10 +37,6 @@ may allow you to pass other types of data like \code{matrix} and then separately
 
 \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
 
-\item{label}{Deprecated. See "Deprecated Arguments" section below.}
-
-\item{weight}{Deprecated. See "Deprecated Arguments" section below.}
-
 \item{obj}{objective function, can be character or custom objective function. Examples include
 \code{regression}, \code{regression_l1}, \code{huber},
 \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
@@ -103,10 +95,6 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
 
 \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model}
 
-\item{colnames}{Deprecated. See "Deprecated Arguments" section below.}
-
-\item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.}
-
 \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null,
 training will stop if the evaluation of any metric on any validation set
 fails to improve for \code{early_stopping_rounds} consecutive boosting rounds.
@@ -131,14 +119,6 @@ a trained model \code{lgb.CVBooster}.
 \description{
 Cross validation logic used by LightGBM
 }
-\section{Deprecated Arguments}{
-
-
-A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
-to argument \code{'data'}. It will also remove support for passing arguments
-\code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
-}
-
 \section{Early Stopping}{
 
 
diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd
index ebbfc206998e..1f6edfa25ba3 100644
--- a/R-package/man/lgb.train.Rd
+++ b/R-package/man/lgb.train.Rd
@@ -15,8 +15,6 @@ lgb.train(
   record = TRUE,
   eval_freq = 1L,
   init_model = NULL,
-  colnames = NULL,
-  categorical_feature = NULL,
   early_stopping_rounds = NULL,
   callbacks = list(),
   reset_data = FALSE,
@@ -82,10 +80,6 @@ printing of evaluation during training}
 
 \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model}
 
-\item{colnames}{Deprecated. See "Deprecated Arguments" section below.}
-
-\item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.}
-
 \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null,
 training will stop if the evaluation of any metric on any validation set
 fails to improve for \code{early_stopping_rounds} consecutive boosting rounds.
@@ -109,14 +103,6 @@ Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}},
              this function is focused on performance (e.g. speed, memory efficiency). It is also
              less likely to have breaking API changes in new releases than \code{\link{lightgbm}}.
 }
-\section{Deprecated Arguments}{
-
-
-A future release of \code{lightgbm} will remove support for passing arguments
-\code{'categorical_feature'} and \code{'colnames'}. Pass those things to
-\code{lgb.Dataset} instead.
-}
-
 \section{Early Stopping}{
 
 
diff --git a/R-package/src/install.libs.R b/R-package/src/install.libs.R
index c622de927ac5..fa61d014a50e 100644
--- a/R-package/src/install.libs.R
+++ b/R-package/src/install.libs.R
@@ -51,7 +51,7 @@ inst_dir <- file.path(R_PACKAGE_SOURCE, "inst", fsep = "/")
           , "make this faster."
         ))
       }
-      cmd <- paste0(cmd, " ", paste0(args, collapse = " "))
+      cmd <- paste0(cmd, " ", paste(args, collapse = " "))
       exit_code <- system(cmd)
     }
 
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 06d35a146d66..cb43ba613be9 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", {
   }
 })
 
-test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", {
+test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset", {
   bad_values <- list(
     4L
     , "hello"
@@ -454,7 +454,7 @@ test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset
         , 10L
         , nfold = 5L
       )
-    }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE)
+    }, regexp = "lgb.cv: data must be an lgb.Dataset instance", fixed = TRUE)
   }
 })
 
diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R
index 9197fd7226af..a770b28c415a 100644
--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -888,7 +888,7 @@ test_that("Saving a model with different feature importance types works", {
 
     .feat_importance_from_string <- function(model_string) {
         file_lines <- strsplit(model_string, "\n", fixed = TRUE)[[1L]]
-        start_indx <- which(grepl("^feature_importances\\:$", file_lines)) + 1L
+        start_indx <- which(file_lines == "feature_importances:") + 1L
         blank_line_indices <- which(file_lines == "")
         end_indx <- blank_line_indices[blank_line_indices > start_indx][1L] - 1L
         importances <- file_lines[start_indx: end_indx]
@@ -955,7 +955,7 @@ test_that("Saving a model with unknown importance type fails", {
 
 .params_from_model_string <- function(model_str) {
     file_lines <- strsplit(model_str, "\n", fixed = TRUE)[[1L]]
-    start_indx <- which(grepl("^parameters\\:$", file_lines)) + 1L
+    start_indx <- which(file_lines == "parameters:") + 1L
     blank_line_indices <- which(file_lines == "")
     end_indx <- blank_line_indices[blank_line_indices > start_indx][1L] - 1L
     params <- file_lines[start_indx: end_indx]
@@ -1532,7 +1532,7 @@ test_that("Booster's print, show, and summary work correctly", {
     }
 
     .has_expected_content_for_finalized_model <- function(printed_txt) {
-      expect_true(any(grepl("^LightGBM Model$", printed_txt)))
+      expect_true(any(printed_txt == "LightGBM Model"))
       expect_true(any(grepl("Booster handle is invalid", printed_txt, fixed = TRUE)))
     }
 
diff --git a/R-package/tests/testthat/test_parameters.R b/R-package/tests/testthat/test_parameters.R
index 9949ffe646b9..2e3aaa3799c3 100644
--- a/R-package/tests/testthat/test_parameters.R
+++ b/R-package/tests/testthat/test_parameters.R
@@ -18,7 +18,7 @@ test_that("Feature penalties work properly", {
         num_leaves = 5L
         , learning_rate = 0.05
         , objective = "binary"
-        , feature_penalty = paste0(feature_penalties, collapse = ",")
+        , feature_penalty = paste(feature_penalties, collapse = ",")
         , metric = "binary_error"
         , num_threads = .LGB_MAX_THREADS
       )
diff --git a/build_r.R b/build_r.R
index 1d824d60bbba..a680f0a730b2 100644
--- a/build_r.R
+++ b/build_r.R
@@ -121,7 +121,7 @@ if (length(parsed_args[["make_args"]]) > 0L) {
     pattern = "make_args_from_build_script <- character(0L)"
     , replacement = paste0(
       "make_args_from_build_script <- c(\""
-      , paste0(parsed_args[["make_args"]], collapse = "\", \"")
+      , paste(parsed_args[["make_args"]], collapse = "\", \"")
       , "\")"
     )
     , x = install_libs_content
@@ -167,7 +167,7 @@ if (length(parsed_args[["make_args"]]) > 0L) {
           , "make this faster."
         ))
       }
-      cmd <- paste0(cmd, " ", paste0(args, collapse = " "))
+      cmd <- paste0(cmd, " ", paste(args, collapse = " "))
       exit_code <- system(cmd)
     }
 
@@ -426,6 +426,6 @@ install_args <- c("CMD", "INSTALL", "--no-multiarch", "--with-keep.source", tarb
 if (INSTALL_AFTER_BUILD) {
   .run_shell_command(install_cmd, install_args)
 } else {
-  cmd <- paste0(install_cmd, " ", paste0(install_args, collapse = " "))
+  cmd <- paste0(install_cmd, " ", paste(install_args, collapse = " "))
   print(sprintf("Skipping installation. Install the package with command '%s'", cmd))
 }