Commit
Add mechanic to override default values for generation during model.predict() (#3520)
justinxzhao authored Aug 11, 2023
1 parent 92d07f6 commit 6552adc
Showing 2 changed files with 55 additions and 37 deletions.
73 changes: 37 additions & 36 deletions ludwig/api.py
@@ -831,6 +831,7 @@ def predict(
data_format: str = None,
split: str = FULL,
batch_size: int = 128,
generation_config: Optional[Dict] = None,
skip_save_unprocessed_output: bool = True,
skip_save_predictions: bool = True,
output_directory: str = "results",
@@ -840,43 +841,34 @@
) -> Tuple[Union[dict, pd.DataFrame], str]:
"""Using a trained model, make predictions from the provided dataset.
# Inputs
:param dataset: (Union[str, dict, pandas.DataFrame]) source containing
the entire dataset to be evaluated.
:param data_format: (str, default: `None`) format to interpret data
sources. Will be inferred automatically if not specified. Valid
formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
`'fwf'`, `'hdf5'` (cache file produced during previous training),
`'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
`'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
`'stata'`, `'tsv'`.
:param: split: (str, default= `'full'`): if the input dataset contains
a split column, this parameter indicates which split of the data
to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
:param batch_size: (int, default: 128) size of batch to use when making
predictions.
:param skip_save_unprocessed_output: (bool, default: `True`) if this
parameter is `False`, predictions and their probabilities are saved
in both raw unprocessed numpy files containing tensors and as
postprocessed CSV files (one for each output feature).
If this parameter is `True`, only the CSV ones are saved and the
numpy ones are skipped.
:param skip_save_predictions: (bool, default: `True`) skips saving
test predictions CSV files.
:param output_directory: (str, default: `'results'`) the directory that
will contain the training statistics, TensorBoard logs, the saved
model and the training progress files.
:param return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame)
indicates the format of the returned predictions.
:param callbacks: (Optional[List[Callback]], default: None)
optional list of callbacks to use during this predict operation. Any callbacks
already registered to the model will be preserved.
# Return
Args:
dataset: (Union[str, dict, pandas.DataFrame]): source containing the entire dataset to be evaluated.
data_format: (str, default: `None`) format to interpret data sources. Will be inferred automatically if not
specified. Valid formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`, `'fwf'`,
`'hdf5'` (cache file produced during previous training), `'html'` (file containing a single HTML
`<table>`), `'json'`, `'jsonl'`, `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
`'stata'`, `'tsv'`.
split: (str, default: `'full'`) if the input dataset contains a split column, this parameter indicates
which split of the data to use. Possible values are `'full'`, `'training'`, `'validation'`, `'test'`.
batch_size: (int, default: 128) size of batch to use when making predictions.
generation_config: (Dict, default: `None`) config to use for generating predictions. If `None`, the
generation config that was used during model training is used.
skip_save_unprocessed_output: (bool, default: `True`) if this parameter is `False`, predictions and their
probabilities are saved in both raw unprocessed numpy files containing tensors and as postprocessed CSV
files (one for each output feature). If this parameter is `True`, only the CSV ones are saved and the
numpy ones are skipped.
skip_save_predictions: (bool, default: `True`) skips saving test predictions CSV files.
output_directory: (str, default: `'results'`) the directory that will contain the training statistics,
TensorBoard logs, the saved model and the training progress files.
return_type: (Union[str, dict, pandas.DataFrame], default: pd.DataFrame) indicates the format of the
returned predictions.
callbacks: (Optional[List[Callback]], default: None) optional list of callbacks to use during this predict
operation. Any callbacks already registered to the model will be preserved.
:return: (Tuple[Union[dict, pd.DataFrame], str]) `(predictions, output_directory)`
`predictions` predictions from the provided dataset,
`output_directory` filepath string to where data was stored.
Returns:
`(predictions, output_directory)`: (Tuple[Union[dict, pd.DataFrame], str])
`predictions` predictions from the provided dataset,
`output_directory` filepath string to where data was stored.
"""
self._check_initialization()

@@ -893,12 +885,21 @@ def predict(
callbacks=self.callbacks + (callbacks or []),
)

# Set the generation config if it exists.
# model.reset_generation_config() is called after batch prediction.
if generation_config is not None:
self.model.set_generation_config(generation_config)

logger.debug("Predicting")
with self.backend.create_predictor(self.model, batch_size=batch_size) as predictor:
predictions = predictor.batch_predict(
dataset,
)

# If there was a generation config set prior to batch prediction, reset it.
if generation_config is not None:
self.model.reset_generation_config()

if self.backend.is_coordinator():
# if we are skipping all saving,
# there is no need to create a directory that will remain empty
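A minimal usage sketch of the new override path (the model path, dataset name, and generation keys below are illustrative assumptions, not taken from the commit; the keys are standard HuggingFace GenerationConfig fields):

from ludwig.api import LudwigModel

model = LudwigModel.load("results/experiment_run/model")

# Override decoding settings for this call only. Per the diff above, the
# training-time generation config is restored right after batch prediction.
predictions, output_directory = model.predict(
    dataset="examples.csv",
    generation_config={
        "max_new_tokens": 64,
        "temperature": 0.2,
        "do_sample": True,
    },
)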
19 changes: 18 additions & 1 deletion ludwig/models/llm.py
@@ -1,4 +1,5 @@
import contextlib
import copy
import logging
import os
import tempfile
@@ -160,6 +161,10 @@ def __init__(

self.generation = GenerationConfig(**self.config_obj.generation.to_dict())

# Save the original generation config so that we can reset it if/when self.generation is dynamically
# mutated during one-off predict calls after fine-tuning.
self.original_generation_config = copy.deepcopy(self.generation)

# ================ Inputs ================
try:
self.input_features.update(self.build_inputs(input_feature_configs=self.config_obj.input_features))
@@ -195,6 +200,14 @@ def __init__(
def create_feature_dict(self) -> LudwigFeatureDict:
return DictWrapper(LudwigFeatureDict())

def set_generation_config(self, generation_config_dict):
"""Sets the generation config for the model."""
self.generation = GenerationConfig(**generation_config_dict)

def reset_generation_config(self):
"""Sets the generation config for th."""
self.generation = self.original_generation_config

@property
def output_feature_decoder(self) -> OutputFeature:
return self._output_feature_decoder.module
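One reading of set_generation_config worth noting (an observation from the diff, not stated by the author): because it builds a fresh GenerationConfig from the dict, an override replaces the active config wholesale rather than patching individual fields, and unspecified keys fall back to HuggingFace defaults. A small standalone sketch, assuming GenerationConfig is transformers.GenerationConfig:

from transformers import GenerationConfig

trained = GenerationConfig(max_new_tokens=256, temperature=0.7)
override = GenerationConfig(**{"temperature": 0.1})

print(trained.max_new_tokens)   # 256
print(override.max_new_tokens)  # None (the HF default), because the
                                # override dict did not set it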
@@ -375,14 +388,18 @@ def generate(
mask=None,
) -> Dict[str, torch.Tensor]:
"""Generates tokens using the model."""

logger.info(f"For generating text, using: {self.generation}")
input_ids, _ = self._unpack_inputs(inputs)

with torch.no_grad():
input_lengths = []
sequences_list = []
for input_ids_sample in input_ids:
input_ids_sample_no_padding = remove_left_padding(input_ids_sample, self.tokenizer)
logger.info(
"Decoded text inputs for the first example in batch: "
f"{self.tokenizer.decode(input_ids_sample_no_padding[0])}"
)

if input_ids_sample_no_padding.shape[1] > self.max_input_length:
logger.warning(
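For context on the per-sample loop above: inputs arrive left-padded to a common batch length, and each sample has its padding stripped before generation. A hedged standalone illustration of the idea (this is not Ludwig's remove_left_padding, which takes the tokenizer; all names here are assumptions):

import torch

def strip_left_padding(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    # Drop leading pad tokens from a single 1-D sequence of token ids.
    non_pad = (input_ids != pad_token_id).nonzero(as_tuple=True)[0]
    if non_pad.numel() == 0:
        return input_ids[:0]  # the sequence is all padding
    return input_ids[non_pad[0]:]

ids = torch.tensor([0, 0, 0, 42, 7, 9])  # 0 = pad token id
print(strip_left_padding(ids, pad_token_id=0))  # tensor([42,  7,  9])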
