diff --git a/ludwig/api.py b/ludwig/api.py
index d1c5e2a3fe1..3302f51d4d8 100644
--- a/ludwig/api.py
+++ b/ludwig/api.py
@@ -831,6 +831,7 @@ def predict(
data_format: str = None,
split: str = FULL,
batch_size: int = 128,
+ generation_config: Optional[Dict] = None,
skip_save_unprocessed_output: bool = True,
skip_save_predictions: bool = True,
output_directory: str = "results",
@@ -840,43 +841,34 @@ def predict(
) -> Tuple[Union[dict, pd.DataFrame], str]:
"""Using a trained model, make predictions from the provided dataset.
- # Inputs
- :param dataset: (Union[str, dict, pandas.DataFrame]) source containing
- the entire dataset to be evaluated.
- :param data_format: (str, default: `None`) format to interpret data
- sources. Will be inferred automatically if not specified. Valid
- formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
- `'fwf'`, `'hdf5'` (cache file produced during previous training),
- `'html'` (file containing a single HTML `
`), `'json'`, `'jsonl'`,
- `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
- `'stata'`, `'tsv'`.
- :param: split: (str, default= `'full'`): if the input dataset contains
- a split column, this parameter indicates which split of the data
- to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
- :param batch_size: (int, default: 128) size of batch to use when making
- predictions.
- :param skip_save_unprocessed_output: (bool, default: `True`) if this
- parameter is `False`, predictions and their probabilities are saved
- in both raw unprocessed numpy files containing tensors and as
- postprocessed CSV files (one for each output feature).
- If this parameter is `True`, only the CSV ones are saved and the
- numpy ones are skipped.
- :param skip_save_predictions: (bool, default: `True`) skips saving
- test predictions CSV files.
- :param output_directory: (str, default: `'results'`) the directory that
- will contain the training statistics, TensorBoard logs, the saved
- model and the training progress files.
- :param return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame)
- indicates the format of the returned predictions.
- :param callbacks: (Optional[List[Callback]], default: None)
- optional list of callbacks to use during this predict operation. Any callbacks
- already registered to the model will be preserved.
-
- # Return
+ Args:
+ dataset: (Union[str, dict, pandas.DataFrame]): source containing the entire dataset to be evaluated.
+ data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not
+ specified. Valid formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`, `'fwf'`,
+ `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML
+ ``), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
+ `'stata'`, `'tsv'`.
+ split: (str, default= `'full'`): if the input dataset contains a split column, this parameter indicates
+ which split of the data to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
+ batch_size: (int, default: 128) size of batch to use when making predictions.
+            generation_config: (Dict, default: `None`) config for the generation of the predictions. If `None`, the
+ config that was used during model training is used.
+ skip_save_unprocessed_output: (bool, default: `True`) if this parameter is `False`, predictions and their
+ probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV
+ files (one for each output feature). If this parameter is `True`, only the CSV ones are saved and the
+ numpy ones are skipped.
+ skip_save_predictions: (bool, default: `True`) skips saving test predictions CSV files.
+ output_directory: (str, default: `'results'`) the directory that will contain the training statistics,
+ TensorBoard logs, the saved model and the training progress files.
+ return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame) indicates the format of the
+ returned predictions.
+ callbacks: (Optional[List[Callback]], default: None) optional list of callbacks to use during this predict
+ operation. Any callbacks already registered to the model will be preserved.
- :return: (Tuple[Union[dict, pd.DataFrame], str]) `(predictions, output_directory)`
- `predictions` predictions from the provided dataset,
- `output_directory` filepath string to where data was stored.
+ Returns:
+ `(predictions, output_directory)`: (Tuple[Union[dict, pd.DataFrame], str])
+ `predictions` predictions from the provided dataset,
+ `output_directory` filepath string to where data was stored.
"""
self._check_initialization()
@@ -893,12 +885,21 @@ def predict(
callbacks=self.callbacks + (callbacks or []),
)
+ # Set the generation config if it exists.
+ # model.reset_generation_config() is called after batch prediction.
+ if generation_config is not None:
+ self.model.set_generation_config(generation_config)
+
logger.debug("Predicting")
with self.backend.create_predictor(self.model, batch_size=batch_size) as predictor:
predictions = predictor.batch_predict(
dataset,
)
+ # If there was a generation config set prior to batch prediction, reset it.
+ if generation_config is not None:
+ self.model.reset_generation_config()
+
if self.backend.is_coordinator():
# if we are skipping all saving,
# there is no need to create a directory that will remain empty
diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py
index 0ef2c9e3200..f3263564755 100644
--- a/ludwig/models/llm.py
+++ b/ludwig/models/llm.py
@@ -1,4 +1,5 @@
import contextlib
+import copy
import logging
import os
import tempfile
@@ -160,6 +161,10 @@ def __init__(
self.generation = GenerationConfig(**self.config_obj.generation.to_dict())
+        # Save the original generation config so that we can reset it if/when self.generation is dynamically
+        # mutated during one-off predict calls after fine-tuning.
+ self.original_generation_config = copy.deepcopy(self.generation)
+
# ================ Inputs ================
try:
self.input_features.update(self.build_inputs(input_feature_configs=self.config_obj.input_features))
@@ -195,6 +200,14 @@ def __init__(
def create_feature_dict(self) -> LudwigFeatureDict:
return DictWrapper(LudwigFeatureDict())
+ def set_generation_config(self, generation_config_dict):
+ """Sets the generation config for the model."""
+ self.generation = GenerationConfig(**generation_config_dict)
+
+ def reset_generation_config(self):
+        """Resets the generation config to the original config used when the model was initialized."""
+ self.generation = self.original_generation_config
+
@property
def output_feature_decoder(self) -> OutputFeature:
return self._output_feature_decoder.module
@@ -375,7 +388,7 @@ def generate(
mask=None,
) -> Dict[str, torch.Tensor]:
"""Generates tokens using the model."""
-
+ logger.info(f"For generating text, using: {self.generation}")
input_ids, _ = self._unpack_inputs(inputs)
with torch.no_grad():
@@ -383,6 +396,10 @@ def generate(
sequences_list = []
for input_ids_sample in input_ids:
input_ids_sample_no_padding = remove_left_padding(input_ids_sample, self.tokenizer)
+ logger.info(
+ "Decoded text inputs for the first example in batch: "
+ f"{self.tokenizer.decode(input_ids_sample_no_padding[0])}"
+ )
if input_ids_sample_no_padding.shape[1] > self.max_input_length:
logger.warning(