diff --git a/ludwig/constants.py b/ludwig/constants.py
index 87a7594b805..44720c39b3c 100644
--- a/ludwig/constants.py
+++ b/ludwig/constants.py
@@ -83,6 +83,7 @@
 PREDICTIONS = "predictions"
 TOP_K = "top_k"
 TOP_K_PREDICTIONS = "top_k_predictions"
+TOKENS = "tokens"
 PROBABILITY = "probability"
 PROBABILITIES = "probabilities"
 SPLIT_PROBABILITIES = "split_probabilities"
diff --git a/ludwig/features/sequence_feature.py b/ludwig/features/sequence_feature.py
index fded71f973d..7323cb5285c 100644
--- a/ludwig/features/sequence_feature.py
+++ b/ludwig/features/sequence_feature.py
@@ -25,12 +25,13 @@
     COLUMN,
     LAST_PREDICTIONS,
     LENGTHS,
+    LOGITS,
     NAME,
-    PREDICTIONS,
     PROBABILITIES,
     PROBABILITY,
     PROC_COLUMN,
     SEQUENCE,
+    TOKENS,
 )
 from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
 from ludwig.features.feature_utils import compute_sequence_probability, compute_token_probabilities
@@ -146,7 +147,7 @@ def __init__(self, metadata: TrainingSetMetadataDict):
         self.max_sequence_length = int(metadata["max_sequence_length"])
         self.idx2str = metadata["idx2str"]
         self.unknown_symbol = UNKNOWN_SYMBOL
-        self.predictions_key = PREDICTIONS
+        self.predictions_key = TOKENS
         self.probabilities_key = PROBABILITIES
         self.probability_key = PROBABILITY
 
@@ -176,7 +177,21 @@ def forward(self, preds: Dict[str, torch.Tensor], feature_name: str) -> FeatureP
         }
 
 
-class _SequencePredict(PredictModule):
+class SequencePredictModule(PredictModule):
+    """Overrides PredictModule for sequence, text and timeseries features.
+
+    Explicit member variables needed here for scripting, as Torchscript will not be able to recognize global variables
+    during scripting.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.predictions_key = TOKENS
+        self.probabilities_key = PROBABILITIES
+        self.logits_key = LOGITS
+
+
+class _SequencePredict(SequencePredictModule):
     def forward(self, inputs: Dict[str, torch.Tensor], feature_name: str) -> Dict[str, torch.Tensor]:
         logits = output_feature_utils.get_output_feature_tensor(inputs, feature_name, self.logits_key)
         probabilities = torch.softmax(logits, -1)
@@ -471,7 +486,7 @@ def postprocess_predictions(
         result,
         metadata,
     ):
-        predictions_col = f"{self.feature_name}_{PREDICTIONS}"
+        predictions_col = f"{self.feature_name}_{TOKENS}"
         lengths_col = f"{self.feature_name}_{LENGTHS}"
         if predictions_col in result:
             if "idx2str" in metadata:
diff --git a/ludwig/features/text_feature.py b/ludwig/features/text_feature.py
index 43c73bfb199..78c55de5e3b 100644
--- a/ludwig/features/text_feature.py
+++ b/ludwig/features/text_feature.py
@@ -24,12 +24,12 @@
     LAST_PREDICTIONS,
     LENGTHS,
     NAME,
-    PREDICTIONS,
     PREPROCESSING,
     PROBABILITIES,
     PROBABILITY,
     PROC_COLUMN,
     TEXT,
+    TOKENS,
 )
 from ludwig.features.base_feature import BaseFeatureMixin, OutputFeature
 from ludwig.features.feature_utils import compute_sequence_probability, compute_token_probabilities
@@ -323,7 +323,7 @@ def postprocess_predictions(
         metadata,
     ):
         # todo: refactor to reuse SequenceOutputFeature.postprocess_predictions
-        predictions_col = f"{self.feature_name}_{PREDICTIONS}"
+        tokens_col = f"{self.feature_name}_{TOKENS}"
 
         tokenizer = None
         if metadata["preprocessing"]["tokenizer"] == "hf_tokenizer":
@@ -333,9 +333,10 @@ def postprocess_predictions(
                 metadata["preprocessing"]["pretrained_model_name_or_path"],
             )
 
-        if predictions_col in result:
+        if tokens_col in result:
+            predicted_tokens = result[tokens_col]
 
-            def idx2str(pred):
+            def idx2token(pred):
                 if tokenizer is None:
                     return [
                         metadata["idx2str"][token] if token < len(metadata["idx2str"]) else UNKNOWN_SYMBOL
@@ -343,7 +344,21 @@ def idx2str(pred):
                     ]
                 return tokenizer.tokenizer.batch_decode(pred, skip_special_tokens=True)
 
-            result[predictions_col] = result[predictions_col].map(idx2str)
+            result[tokens_col] = predicted_tokens.map(idx2token)
+
+            # Add concatenated tokens with spacing into a new column in the results dataframe
+
+            def idx2prediction(pred):
+                if tokenizer is None:
+                    return " ".join(
+                        [
+                            metadata["idx2str"][token] if token < len(metadata["idx2str"]) else UNKNOWN_SYMBOL
+                            for token in pred
+                        ]
+                    )
+                return tokenizer.tokenizer.batch_decode([pred], skip_special_tokens=True)
+
+            result[f"{self.feature_name}_predictions"] = predicted_tokens.map(idx2prediction)
 
         last_preds_col = f"{self.feature_name}_{LAST_PREDICTIONS}"
         if last_preds_col in result:
diff --git a/ludwig/modules/metric_modules.py b/ludwig/modules/metric_modules.py
index 1a9cc5882fb..08997c64311 100644
--- a/ludwig/modules/metric_modules.py
+++ b/ludwig/modules/metric_modules.py
@@ -72,6 +72,7 @@
     TEXT,
     TIMESERIES,
     TOKEN_ACCURACY,
+    TOKENS,
     VECTOR,
 )
 from ludwig.distributed import get_current_dist_strategy
@@ -344,7 +345,7 @@ def get_current_value(self, preds: Tensor, target: Tensor) -> Tensor:
         return self.sigmoid_cross_entropy_function(preds, target)
 
 
-@register_metric(TOKEN_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, PREDICTIONS)
+@register_metric(TOKEN_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, TOKENS)
 class TokenAccuracyMetric(MeanMetric):
     def __init__(self, **kwargs):
         super().__init__()
@@ -356,7 +357,7 @@ def get_current_value(self, preds: Tensor, target: Tensor) -> Tensor:
         return torch.mean(masked_correct_preds)
 
 
-@register_metric(SEQUENCE_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, PREDICTIONS)
+@register_metric(SEQUENCE_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, TOKENS)
 class SequenceAccuracyMetric(MeanMetric):
     def __init__(self, **kwargs):
         super().__init__()
@@ -389,7 +390,7 @@ def get_current_value(self, preds: Tensor, target: Tensor):
         return torch.exp(shifted_loss)
 
 
-@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, PREDICTIONS)
+@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, TOKENS)
 class CharErrorRateMetric(CharErrorRate, LudwigMetric):
     def __init__(self, **kwargs):
         super().__init__()
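Note on the SequencePredictModule docstring above (keys held as instance attributes so the module stays scriptable): the toy module below is not Ludwig code, it only sketches that pattern under torch.jit.script. The class name ToyPredict, the stand-in constants, and the input shapes are invented for the example.

# A minimal sketch, not Ludwig code: string keys used inside forward() are bound as
# instance attributes in __init__ so that torch.jit.script can resolve them.
from typing import Dict

import torch
from torch import nn

TOKENS = "tokens"  # stand-ins for the constants in ludwig.constants
PROBABILITIES = "probabilities"
LOGITS = "logits"


class ToyPredict(nn.Module):
    def __init__(self):
        super().__init__()
        # Bound here so the scripted forward() never has to look up module-level globals.
        self.predictions_key = TOKENS
        self.probabilities_key = PROBABILITIES
        self.logits_key = LOGITS

    def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        logits = inputs[self.logits_key]
        probabilities = torch.softmax(logits, -1)
        predictions = torch.argmax(logits, -1)
        return {
            self.predictions_key: predictions,
            self.probabilities_key: probabilities,
            self.logits_key: logits,
        }


scripted = torch.jit.script(ToyPredict())
out = scripted({"logits": torch.randn(2, 5, 8)})  # (batch, sequence, vocab)
print(sorted(out.keys()))  # ['logits', 'probabilities', 'tokens']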
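Note on the text_feature.py change above: the "<feature_name>_tokens" column keeps the per-step token lists, while the new "<feature_name>_predictions" column holds the whitespace-joined string. The standalone sketch below mirrors only the non-HF (idx2str) path; the feature name "summary", the vocabulary, and the token ids are made up for illustration and are not from the PR.

# A toy illustration, not Ludwig code, of the two output columns produced on the idx2str path.
import pandas as pd

UNKNOWN_SYMBOL = "<UNK>"
idx2str = ["<PAD>", "hello", "world"]  # assumed vocabulary from training-set metadata

result = pd.DataFrame({"summary_tokens": [[1, 2], [2, 9]]})  # 9 is out of vocabulary
predicted_tokens = result["summary_tokens"]


def idx2token(pred):
    # Mirrors idx2token in the diff: map each id to a token string.
    return [idx2str[token] if token < len(idx2str) else UNKNOWN_SYMBOL for token in pred]


def idx2prediction(pred):
    # Mirrors idx2prediction in the diff: join the mapped tokens with spaces.
    return " ".join(idx2str[token] if token < len(idx2str) else UNKNOWN_SYMBOL for token in pred)


result["summary_tokens"] = predicted_tokens.map(idx2token)
result["summary_predictions"] = predicted_tokens.map(idx2prediction)

print(result["summary_tokens"].tolist())       # [['hello', 'world'], ['world', '<UNK>']]
print(result["summary_predictions"].tolist())  # ['hello world', 'world <UNK>']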