QOL: Fail during preprocessing if max sequence lengths are shorter than the prompt template. #3719

Merged · 13 commits · Oct 27, 2023
20 changes: 20 additions & 0 deletions ludwig/config_validation/preprocessing.py
@@ -0,0 +1,20 @@
+def check_global_max_sequence_length_fits_prompt_template(metadata, global_preprocessing_parameters):
+    """Checks that the prompt template fits within the global max sequence length."""
+
+    if (
+        "global_max_sequence_length" in global_preprocessing_parameters
+        and global_preprocessing_parameters["global_max_sequence_length"] is not None
+    ):
+        for feature_name, feature_metadata in metadata.items():
+            if (
+                "prompt_template_num_tokens" in feature_metadata
+                and feature_metadata["prompt_template_num_tokens"]
+                > global_preprocessing_parameters["global_max_sequence_length"]
+            ):
+                raise ValueError(
+                    f'The prompt contains {feature_metadata["prompt_template_num_tokens"]} tokens, which is more '
+                    f"than the global_max_sequence_length "
+                    f'({global_preprocessing_parameters["global_max_sequence_length"]}); this would remove all '
+                    "unique information. Shorten the prompt, or increase the global max sequence length to more "
+                    f'than {feature_metadata["prompt_template_num_tokens"]} to include the full prompt.'
+                )
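To make the behavior concrete, here is a minimal usage sketch (the dicts are hand-built stand-ins for what build_dataset passes in; it assumes the function above is in scope):

metadata = {"question": {"prompt_template_num_tokens": 512}}
global_preprocessing_parameters = {"global_max_sequence_length": 256}

try:
    check_global_max_sequence_length_fits_prompt_template(metadata, global_preprocessing_parameters)
except ValueError as e:
    # The prompt's 512 tokens exceed global_max_sequence_length (256), so the
    # check raises instead of letting preprocessing silently truncate everything.
    print(e)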
12 changes: 8 additions & 4 deletions ludwig/data/preprocessing.py
@@ -25,6 +25,7 @@

 from ludwig.api_annotations import DeveloperAPI
 from ludwig.backend import Backend, LOCAL_BACKEND
+from ludwig.config_validation.preprocessing import check_global_max_sequence_length_fits_prompt_template
 from ludwig.constants import (
     BFILL,
     CHECKSUM,
@@ -1277,10 +1278,12 @@ def build_dataset(
         callback.on_build_metadata_start(dataset_df, mode)

     logger.debug("build metadata")
-    metadata = build_metadata(
-        metadata, feature_name_to_preprocessing_parameters, dataset_cols, feature_configs, backend
+    metadata: TrainingSetMetadataDict = build_metadata(
+        config, metadata, feature_name_to_preprocessing_parameters, dataset_cols, feature_configs, backend
     )

+    check_global_max_sequence_length_fits_prompt_template(metadata, global_preprocessing_parameters)
+
     for callback in callbacks or []:
         callback.on_build_metadata_end(dataset_df, mode)

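End to end, the misconfiguration this guards against looks roughly like the sketch below (hedged: the config keys follow Ludwig's LLM fine-tuning conventions, and the model name and values are illustrative only):

config = {
    "model_type": "llm",
    "base_model": "meta-llama/Llama-2-7b-hf",
    "prompt": {"template": "Below is an instruction, paired with context... {question}"},
    "input_features": [{"name": "question", "type": "text"}],
    "output_features": [{"name": "answer", "type": "text"}],
    # Shorter than the tokenized prompt template, so preprocessing now fails fast.
    "preprocessing": {"global_max_sequence_length": 16},
}
# With this change, LudwigModel(config).train(dataset=...) raises the new
# ValueError from build_dataset instead of silently truncating every example.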
@@ -1346,7 +1349,7 @@ def build_dataset(
     col_name_to_dtype = {}
     for col_name, col in proc_cols.items():
         # if col is a list of list-like objects, we assume the internal dtype of each col[i] remains unchanged.
-        if type(col) == list and type(col[0]) in {list, np.ndarray, torch.Tensor}:
+        if type(col) is list and type(col[0]) in {list, np.ndarray, torch.Tensor}:
             continue
         col_name_to_dtype[col_name] = col.dtype
     dataset = dataset.astype(col_name_to_dtype)
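For context, a small standalone sketch of the dtype-collection pattern this hunk touches (toy columns, assumed shapes; `type(col) is list` is the identity comparison flake8's E721 prefers over `==`):

import numpy as np
import pandas as pd
import torch

proc_cols = {
    "text_out": [torch.tensor([1, 2]), torch.tensor([3])],  # ragged list of tensors
    "num_out": pd.Series([0.1, 0.2]),
}

col_name_to_dtype = {}
for col_name, col in proc_cols.items():
    if type(col) is list and type(col[0]) in {list, np.ndarray, torch.Tensor}:
        continue  # keep per-element dtypes unchanged for ragged columns
    col_name_to_dtype[col_name] = col.dtype
print(col_name_to_dtype)  # {'num_out': dtype('float64')}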
@@ -1521,6 +1524,7 @@ def is_input_feature(feature_config: FeatureConfigDict) -> bool:


 def build_metadata(
+    config: ModelConfigDict,
     metadata: TrainingSetMetadataDict,
     feature_name_to_preprocessing_parameters: Dict[str, PreprocessingConfigDict],
     dataset_cols: Dict[str, Series],
@@ -1536,7 +1540,7 @@ def build_metadata(

         column = dataset_cols[feature_config[COLUMN]]
         metadata[feature_name] = get_from_registry(feature_config[TYPE], get_base_type_registry()).get_feature_meta(
-            column, preprocessing_parameters, backend, is_input_feature(feature_config)
+            config, column, preprocessing_parameters, backend, is_input_feature(feature_config)
         )

         metadata[feature_name][PREPROCESSING] = preprocessing_parameters
8 changes: 6 additions & 2 deletions ludwig/features/audio_feature.py
@@ -26,7 +26,7 @@
 from ludwig.features.base_feature import BaseFeatureMixin
 from ludwig.features.sequence_feature import SequenceInputFeature
 from ludwig.schema.features.audio_feature import AudioInputFeatureConfig
-from ludwig.types import FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import FeatureMetadataDict, ModelConfigDict, PreprocessingConfigDict, TrainingSetMetadataDict
 from ludwig.utils.audio_utils import (
     calculate_mean,
     calculate_var,
@@ -95,7 +95,11 @@ def cast_column(column, backend)

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         first_audio_file_path = column.head(1).iloc[0]
         _, sampling_rate_in_hz = torchaudio.load(first_audio_file_path)
10 changes: 7 additions & 3 deletions ludwig/features/bag_feature.py
@@ -24,7 +24,7 @@
 from ludwig.features.feature_utils import set_str_to_idx
 from ludwig.features.set_feature import _SetPreprocessing
 from ludwig.schema.features.bag_feature import BagInputFeatureConfig
-from ludwig.types import FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import FeatureMetadataDict, ModelConfigDict, PreprocessingConfigDict, TrainingSetMetadataDict
 from ludwig.utils.strings_utils import create_vocabulary

 logger = logging.getLogger(__name__)
@@ -41,7 +41,11 @@ def cast_column(column, backend)

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         vocabulary = create_vocabulary(
             column,
@@ -55,7 +59,7 @@ def get_feature_meta(
             "str2idx": vocabulary.str2idx,
             "str2freq": vocabulary.str2freq,
             "vocab_size": len(vocabulary.str2idx),
-            "max_set_size": vocabulary.line_length_max,
+            "max_set_size": vocabulary.max_sequence_length,
         }

     @staticmethod
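The only behavioral bit in this hunk is the renamed vocabulary attribute; a toy stand-in illustrates what the metadata dict is built from (SimpleNamespace mimicking only the attributes the diff reads off create_vocabulary's result, which is an assumption for illustration):

from types import SimpleNamespace

vocabulary = SimpleNamespace(  # stand-in for create_vocabulary(...)'s return value
    str2idx={"<UNK>": 0, "red": 1, "blue": 2},
    str2freq={"red": 5, "blue": 2},
    max_sequence_length=4,  # attribute was `line_length_max` before this PR
)
feature_meta = {
    "str2idx": vocabulary.str2idx,
    "str2freq": vocabulary.str2freq,
    "vocab_size": len(vocabulary.str2idx),
    "max_set_size": vocabulary.max_sequence_length,
}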
15 changes: 13 additions & 2 deletions ludwig/features/base_feature.py
@@ -39,7 +39,13 @@
 from ludwig.modules.metric_registry import get_metric_classes, get_metric_cls, get_metric_tensor_input
 from ludwig.modules.reduction_modules import SequenceReducer
 from ludwig.schema.features.base import BaseFeatureConfig, BaseOutputFeatureConfig
-from ludwig.types import FeatureConfigDict, FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import (
+    FeatureConfigDict,
+    FeatureMetadataDict,
+    ModelConfigDict,
+    PreprocessingConfigDict,
+    TrainingSetMetadataDict,
+)
 from ludwig.utils import output_feature_utils
 from ludwig.utils.calibration import CalibrationModule
 from ludwig.utils.torch_utils import LudwigModule
@@ -71,11 +77,16 @@ def cast_column(column: DataFrame, backend) -> DataFrame:

     @abstractstaticmethod
     def get_feature_meta(
-        column: DataFrame, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column: DataFrame,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         """Returns a dictionary of feature metadata.

         Args:
+            config: Ludwig model config dict.
             column: Pandas column of values.
             preprocessing_parameters: Preprocessing configuration for this feature.
             backend: (Union[Backend, str]) Backend to use for feature data processing.
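Since every feature mixin must now accept the leading config argument, here is a minimal sketch of a conforming implementation (a hypothetical DummyFeatureMixin, standalone and not part of this PR):

class DummyFeatureMixin:
    @staticmethod
    def get_feature_meta(
        config,  # NEW in this PR: the full model config dict
        column,
        preprocessing_parameters,
        backend,
        is_input_feature,
    ):
        # Most feature types can ignore `config`; text features presumably use it
        # to count prompt-template tokens (prompt_template_num_tokens) at metadata
        # build time, which is what the new preprocessing check consumes.
        return {"preprocessing": preprocessing_parameters}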
7 changes: 6 additions & 1 deletion ludwig/features/binary_feature.py
@@ -27,6 +27,7 @@
     FeatureConfigDict,
     FeatureMetadataDict,
     FeaturePostProcessingOutputDict,
+    ModelConfigDict,
     PreprocessingConfigDict,
     TrainingSetMetadataDict,
 )
@@ -142,7 +143,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column: DataFrame, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column: DataFrame,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         if column.dtype != object:
             return {}
41 changes: 20 additions & 21 deletions ludwig/features/category_feature.py
@@ -45,6 +45,7 @@
 from ludwig.types import (
     FeatureMetadataDict,
     FeaturePostProcessingOutputDict,
+    ModelConfigDict,
     PreprocessingConfigDict,
     TrainingSetMetadataDict,
 )
@@ -145,7 +146,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         idx2str, str2idx, str2freq = create_vocabulary_single_token(
             column,
@@ -255,7 +260,11 @@ def type():

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         idx2str = preprocessing_parameters["vocab"]
         str2idx = {s: i for i, s in enumerate(idx2str)}
@@ -389,29 +398,21 @@ def update_config_with_metadata(feature_config, feature_metadata, *args, **kwarg
     if isinstance(feature_config.loss.class_weights, (list, tuple)):
         if len(feature_config.loss.class_weights) != feature_config.num_classes:
             raise ValueError(
-                "The length of class_weights ({}) is not compatible with "
-                "the number of classes ({}) for feature {}. "
+                f"The length of class_weights ({len(feature_config.loss.class_weights)}) is not compatible with "
+                f"the number of classes ({feature_config.num_classes}) for feature {feature_config.column}. "
                 "Check the metadata JSON file to see the classes "
                 "and their order and consider there needs to be a weight "
-                "for the <UNK> class too.".format(
-                    len(feature_config.loss.class_weights),
-                    feature_config.num_classes,
-                    feature_config.column,
-                )
+                "for the <UNK> class too."
             )

     if isinstance(feature_config.loss.class_weights, dict):
         if feature_metadata["str2idx"].keys() != feature_config.loss.class_weights.keys():
             raise ValueError(
-                "The class_weights keys ({}) are not compatible with "
-                "the classes ({}) of feature {}. "
+                f"The class_weights keys ({feature_config.loss.class_weights.keys()}) are not compatible with "
+                f'the classes ({feature_metadata["str2idx"].keys()}) of feature {feature_config.column}. '
                 "Check the metadata JSON file to see the classes "
                 "and consider there needs to be a weight "
-                "for the <UNK> class too.".format(
-                    feature_config.loss.class_weights.keys(),
-                    feature_metadata["str2idx"].keys(),
-                    feature_config.column,
-                )
+                "for the <UNK> class too."
             )
         else:
             class_weights = feature_config.loss.class_weights
@@ -458,13 +459,11 @@ def update_config_with_metadata(feature_config, feature_metadata, *args, **kwarg

         if all_rows_length != feature_config.num_classes:
             raise ValueError(
-                "The size of the class_similarities matrix of {} is "
-                "{}, different from the number of classes ({}). "
+                f"The size of the class_similarities matrix of {feature_config.column} is "
+                f"{all_rows_length}, different from the number of classes ({feature_config.num_classes}). "
                 "Check the metadata JSON file to see the classes "
                 "and their order and "
-                "consider <UNK> class too.".format(
-                    feature_config.column, all_rows_length, feature_config.num_classes
-                )
+                "consider <UNK> class too."
             )

         similarities = np.array(similarities, dtype=np.float32)
14 changes: 12 additions & 2 deletions ludwig/features/date_feature.py
@@ -23,7 +23,13 @@
 from ludwig.constants import COLUMN, DATE, PROC_COLUMN
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
 from ludwig.schema.features.date_feature import DateInputFeatureConfig
-from ludwig.types import FeatureConfigDict, FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import (
+    FeatureConfigDict,
+    FeatureMetadataDict,
+    ModelConfigDict,
+    PreprocessingConfigDict,
+    TrainingSetMetadataDict,
+)
 from ludwig.utils.date_utils import create_vector_from_datetime_obj, parse_datetime
 from ludwig.utils.types import DataFrame, TorchscriptPreprocessingInput
@@ -57,7 +63,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         return {"preprocessing": preprocessing_parameters}

8 changes: 6 additions & 2 deletions ludwig/features/h3_feature.py
@@ -21,7 +21,7 @@
 from ludwig.constants import COLUMN, H3, PROC_COLUMN
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
 from ludwig.schema.features.h3_feature import H3InputFeatureConfig
-from ludwig.types import FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import FeatureMetadataDict, ModelConfigDict, PreprocessingConfigDict, TrainingSetMetadataDict
 from ludwig.utils.h3_util import h3_to_components
 from ludwig.utils.types import TorchscriptPreprocessingInput

@@ -80,7 +80,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         return {}

8 changes: 6 additions & 2 deletions ludwig/features/image_feature.py
@@ -60,7 +60,7 @@
     RandomVerticalFlipConfig,
 )
 from ludwig.schema.features.image_feature import ImageInputFeatureConfig
-from ludwig.types import FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
+from ludwig.types import FeatureMetadataDict, ModelConfigDict, PreprocessingConfigDict, TrainingSetMetadataDict
 from ludwig.utils.augmentation_utils import get_augmentation_op, register_augmentation_op
 from ludwig.utils.data_utils import get_abs_path
 from ludwig.utils.dataframe_utils import is_dask_series_or_df
@@ -368,7 +368,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         return {PREPROCESSING: preprocessing_parameters}

7 changes: 6 additions & 1 deletion ludwig/features/number_feature.py
@@ -29,6 +29,7 @@
 from ludwig.types import (
     FeatureMetadataDict,
     FeaturePostProcessingOutputDict,
+    ModelConfigDict,
     PreprocessingConfigDict,
     TrainingSetMetadataDict,
 )
@@ -315,7 +316,11 @@ def cast_column(column, backend):

     @staticmethod
     def get_feature_meta(
-        column, preprocessing_parameters: PreprocessingConfigDict, backend, is_input_feature: bool
+        config: ModelConfigDict,
+        column,
+        preprocessing_parameters: PreprocessingConfigDict,
+        backend,
+        is_input_feature: bool,
     ) -> FeatureMetadataDict:
         numeric_transformer: NumberTransformer = get_from_registry(
             preprocessing_parameters.get("normalization", None),