diff --git a/ludwig/schema/metadata/configs/preprocessing.yaml b/ludwig/schema/metadata/configs/preprocessing.yaml
index e0de6f6dac6..688f2084732 100644
--- a/ludwig/schema/metadata/configs/preprocessing.yaml
+++ b/ludwig/schema/metadata/configs/preprocessing.yaml
@@ -150,3 +150,10 @@ cache_encoder_embeddings:
     it's not always the case that you would always want to enable it when possible.
   expected_impact: 1
   ui_display_name: Cache Encoder Embeddings
+global_max_sequence_length:
+  expected_impact: 2
+  ui_display_name: Global Max Sequence Length
+  description_implications:
+    Specifically for LLMs. This is the maximum number of tokens going into the model's forward pass during training. Sequences will be truncated to this length after merging the tokens from the input with the tokens from the target. If not set, the total length of the merged input and target token sequences will be used.
+  example_value:
+    - 512
diff --git a/ludwig/schema/preprocessing.py b/ludwig/schema/preprocessing.py
index c04443dc538..075c63fe4a3 100644
--- a/ludwig/schema/preprocessing.py
+++ b/ludwig/schema/preprocessing.py
@@ -44,6 +44,7 @@ class PreprocessingConfig(schema_utils.BaseMarshmallowConfig):
         description="Specifically for LLMs. This is the maximum length of the input sequence going into the model's "
         "forward pass during training. Sequences will be truncated to this length after merging inputs and targets. "
         "If not set, the total length of the merged input and target token sequences will be used.",
+        parameter_metadata=PREPROCESSING_METADATA["global_max_sequence_length"],
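
For context, a minimal sketch of how a user config would set this parameter: since `global_max_sequence_length` lives on `PreprocessingConfig`, it belongs under the top-level `preprocessing` section rather than under an individual feature. The base model and feature names below are illustrative, not taken from this PR.

```yaml
model_type: llm
base_model: meta-llama/Llama-2-7b-hf  # illustrative choice of base model
input_features:
  - name: prompt
    type: text
output_features:
  - name: completion
    type: text
preprocessing:
  # Per the new metadata: input and target tokens are merged first,
  # then the merged sequence is truncated to this many tokens before
  # the model's forward pass during training. Omit it to keep the
  # full merged input + target length.
  global_max_sequence_length: 512
```

A single global cap (rather than a per-feature `max_sequence_length`) fits the behavior described in the metadata: truncation happens only after input and target tokens are merged, so it is the merged length that bounds the forward pass.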