diff --git a/ludwig/api.py b/ludwig/api.py
index d1c5e2a3fe1..3302f51d4d8 100644
--- a/ludwig/api.py
+++ b/ludwig/api.py
@@ -831,6 +831,7 @@ def predict(
         data_format: str = None,
         split: str = FULL,
         batch_size: int = 128,
+        generation_config: Optional[Dict] = None,
         skip_save_unprocessed_output: bool = True,
         skip_save_predictions: bool = True,
         output_directory: str = "results",
@@ -840,43 +841,34 @@ def predict(
     ) -> Tuple[Union[dict, pd.DataFrame], str]:
         """Using a trained model, make predictions from the provided dataset.

-        # Inputs
-        :param dataset: (Union[str, dict, pandas.DataFrame]) source containing
-            the entire dataset to be evaluated.
-        :param data_format: (str, default: `None`) format to interpret data
-            sources. Will be inferred automatically if not specified. Valid
-            formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
-            `'fwf'`, `'hdf5'` (cache file produced during previous training),
-            `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
-            `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
-            `'stata'`, `'tsv'`.
-        :param: split: (str, default= `'full'`): if the input dataset contains
-            a split column, this parameter indicates which split of the data
-            to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
-        :param batch_size: (int, default: 128) size of batch to use when making
-            predictions.
-        :param skip_save_unprocessed_output: (bool, default: `True`) if this
-            parameter is `False`, predictions and their probabilities are saved
-            in both raw unprocessed numpy files containing tensors and as
-            postprocessed CSV files (one for each output feature).
-            If this parameter is `True`, only the CSV ones are saved and the
-            numpy ones are skipped.
-        :param skip_save_predictions: (bool, default: `True`) skips saving
-            test predictions CSV files.
-        :param output_directory: (str, default: `'results'`) the directory that
-            will contain the training statistics, TensorBoard logs, the saved
-            model and the training progress files.
-        :param return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame)
-            indicates the format of the returned predictions.
-        :param callbacks: (Optional[List[Callback]], default: None)
-            optional list of callbacks to use during this predict operation. Any callbacks
-            already registered to the model will be preserved.
-
-        # Return
+        Args:
+            dataset: (Union[str, dict, pandas.DataFrame]) source containing the entire dataset to be evaluated.
+            data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not
+                specified. Valid formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`, `'fwf'`,
+                `'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML
+                `<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
+                `'stata'`, `'tsv'`.
+            split: (str, default: `'full'`) if the input dataset contains a split column, this parameter indicates
+                which split of the data to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
+            batch_size: (int, default: 128) size of batch to use when making predictions.
+            generation_config: (Dict, default: `None`) config for the generation of the predictions. If `None`, the
+                config that was used during model training is used.
+            skip_save_unprocessed_output: (bool, default: `True`) if this parameter is `False`, predictions and their
+                probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV
+                files (one for each output feature). If this parameter is `True`, only the CSV ones are saved and the
+                numpy ones are skipped.
+            skip_save_predictions: (bool, default: `True`) skips saving test predictions CSV files.
+            output_directory: (str, default: `'results'`) the directory that will contain the training statistics,
+                TensorBoard logs, the saved model and the training progress files.
+            return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame) indicates the format of the
+                returned predictions.
+            callbacks: (Optional[List[Callback]], default: None) optional list of callbacks to use during this predict
+                operation. Any callbacks already registered to the model will be preserved.

-        :return: (Tuple[Union[dict, pd.DataFrame], str]) `(predictions, output_directory)`
-            `predictions` predictions from the provided dataset,
-            `output_directory` filepath string to where data was stored.
+        Returns:
+            `(predictions, output_directory)`: (Tuple[Union[dict, pd.DataFrame], str])
+                `predictions` predictions from the provided dataset,
+                `output_directory` filepath string to where data was stored.
         """
         self._check_initialization()
@@ -893,12 +885,21 @@ def predict(
             callbacks=self.callbacks + (callbacks or []),
         )

+        # Set the generation config if one was provided.
+        # model.reset_generation_config() is called after batch prediction.
+        if generation_config is not None:
+            self.model.set_generation_config(generation_config)
+
         logger.debug("Predicting")
         with self.backend.create_predictor(self.model, batch_size=batch_size) as predictor:
             predictions = predictor.batch_predict(
                 dataset,
             )

+            # If there was a generation config set prior to batch prediction, reset it.
+            if generation_config is not None:
+                self.model.reset_generation_config()
+
         if self.backend.is_coordinator():
             # if we are skipping all saving,
             # there is no need to create a directory that will remain empty
diff --git a/ludwig/models/llm.py b/ludwig/models/llm.py
index 0ef2c9e3200..f3263564755 100644
--- a/ludwig/models/llm.py
+++ b/ludwig/models/llm.py
@@ -1,4 +1,5 @@
 import contextlib
+import copy
 import logging
 import os
 import tempfile
@@ -160,6 +161,10 @@ def __init__(

         self.generation = GenerationConfig(**self.config_obj.generation.to_dict())

+        # Save the original generation config so that we can reset it if/when self.generation is dynamically
+        # mutated during one-off predict calls after fine-tuning.
+        self.original_generation_config = copy.deepcopy(self.generation)
+
         # ================ Inputs ================
         try:
             self.input_features.update(self.build_inputs(input_feature_configs=self.config_obj.input_features))
@@ -195,6 +200,14 @@
     def create_feature_dict(self) -> LudwigFeatureDict:
         return DictWrapper(LudwigFeatureDict())

+    def set_generation_config(self, generation_config_dict):
+        """Sets the generation config for the model."""
+        self.generation = GenerationConfig(**generation_config_dict)
+
+    def reset_generation_config(self):
+        """Resets the generation config to the one the model was initialized with."""
+        self.generation = self.original_generation_config
+
     @property
     def output_feature_decoder(self) -> OutputFeature:
         return self._output_feature_decoder.module
@@ -375,7 +388,7 @@ def generate(
         mask=None,
     ) -> Dict[str, torch.Tensor]:
         """Generates tokens using the model."""
-
+        logger.info(f"For generating text, using: {self.generation}")
         input_ids, _ = self._unpack_inputs(inputs)

         with torch.no_grad():
@@ -383,6 +396,10 @@
             sequences_list = []
             for input_ids_sample in input_ids:
                 input_ids_sample_no_padding = remove_left_padding(input_ids_sample, self.tokenizer)
+                logger.info(
+                    "Decoded text inputs for the first example in batch: "
+                    f"{self.tokenizer.decode(input_ids_sample_no_padding[0])}"
+                )

                 if input_ids_sample_no_padding.shape[1] > self.max_input_length:
                     logger.warning(
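Taken together, these changes let a caller override generation parameters for a single `predict()` call without permanently mutating the model: `set_generation_config()` is applied before batch prediction, and `reset_generation_config()` restores the training-time config afterwards. A minimal usage sketch follows; the model path, dataset file, and specific generation keys are illustrative assumptions, not part of this diff:

```python
from ludwig.api import LudwigModel

# Load a previously fine-tuned LLM (hypothetical path).
model = LudwigModel.load("results/experiment_run/model")

# Override generation settings for this call only. The keys are standard
# HuggingFace GenerationConfig fields, since the dict is passed straight
# into GenerationConfig(**generation_config_dict).
predictions, output_directory = model.predict(
    dataset="test.csv",
    generation_config={
        "max_new_tokens": 64,
        "temperature": 0.7,
        "do_sample": True,
    },
)

# A subsequent call without generation_config falls back to the config used
# during training, because predict() resets it after batch prediction.
predictions_default, _ = model.predict(dataset="test.csv")
```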