Return sentences instead of individual tokens for text features during inference #3504

Closed
wants to merge 9 commits
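
In effect, a text output feature now exposes its raw token list under a new <feature>_tokens column, while <feature>_predictions carries the tokens joined into a single sentence. A minimal sketch of what that looks like at inference time, assuming a trained model with a text output feature named "summary" (the model path and example values are invented for illustration):

from ludwig.api import LudwigModel

model = LudwigModel.load("results/experiment_run/model")  # hypothetical path
predictions, _ = model.predict(dataset="test.csv")

# Per-token output, decoded from the predicted token indices:
#   summary_tokens      -> e.g. ["hello", "world"]
# Sentence-level output added by this change:
#   summary_predictions -> e.g. "hello world"
print(predictions["summary_tokens"].head())
print(predictions["summary_predictions"].head())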
1 change: 1 addition & 0 deletions ludwig/constants.py
@@ -83,6 +83,7 @@
PREDICTIONS = "predictions"
TOP_K = "top_k"
TOP_K_PREDICTIONS = "top_k_predictions"
TOKENS = "tokens"
PROBABILITY = "probability"
PROBABILITIES = "probabilities"
SPLIT_PROBABILITIES = "split_probabilities"
23 changes: 19 additions & 4 deletions ludwig/features/sequence_feature.py
@@ -25,12 +25,13 @@
COLUMN,
LAST_PREDICTIONS,
LENGTHS,
LOGITS,
NAME,
PREDICTIONS,
PROBABILITIES,
PROBABILITY,
PROC_COLUMN,
SEQUENCE,
TOKENS,
)
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature, OutputFeature, PredictModule
from ludwig.features.feature_utils import compute_sequence_probability, compute_token_probabilities
@@ -146,7 +147,7 @@ def __init__(self, metadata: TrainingSetMetadataDict):
self.max_sequence_length = int(metadata["max_sequence_length"])
self.idx2str = metadata["idx2str"]
self.unknown_symbol = UNKNOWN_SYMBOL
self.predictions_key = PREDICTIONS
self.predictions_key = TOKENS
self.probabilities_key = PROBABILITIES
self.probability_key = PROBABILITY

@@ -176,7 +177,21 @@ def forward(self, preds: Dict[str, torch.Tensor], feature_name: str) -> FeatureP
}


class _SequencePredict(PredictModule):
class SequencePredictModule(PredictModule):
"""Overrides PredictModule for sequence, text and timeseries features.

Explicit member variables needed here for scripting, as Torchscript will not be able to recognize global variables
during scripting.
"""

def __init__(self):
super().__init__()
self.predictions_key = TOKENS
self.probabilities_key = PROBABILITIES
self.logits_key = LOGITS


class _SequencePredict(SequencePredictModule):
def forward(self, inputs: Dict[str, torch.Tensor], feature_name: str) -> Dict[str, torch.Tensor]:
logits = output_feature_utils.get_output_feature_tensor(inputs, feature_name, self.logits_key)
probabilities = torch.softmax(logits, -1)
@@ -471,7 +486,7 @@ def postprocess_predictions(
result,
metadata,
):
predictions_col = f"{self.feature_name}_{PREDICTIONS}"
predictions_col = f"{self.feature_name}_{TOKENS}"
lengths_col = f"{self.feature_name}_{LENGTHS}"
if predictions_col in result:
if "idx2str" in metadata:
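
The docstring in SequencePredictModule is the crux of this hunk: as it notes, TorchScript will not recognize the global key constants during scripting, so TOKENS, PROBABILITIES and LOGITS are bound as instance attributes in __init__ and the scripted forward() reads them through self. A self-contained sketch of that pattern (class name hypothetical):

from typing import Dict

import torch

class PredictModuleSketch(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # String keys stored on the instance, mirroring TOKENS,
        # PROBABILITIES and LOGITS from ludwig/constants.py.
        self.predictions_key = "tokens"
        self.probabilities_key = "probabilities"
        self.logits_key = "logits"

    def forward(self, logits: torch.Tensor) -> Dict[str, torch.Tensor]:
        probabilities = torch.softmax(logits, dim=-1)
        predictions = torch.argmax(logits, dim=-1)
        return {
            self.predictions_key: predictions,
            self.probabilities_key: probabilities,
            self.logits_key: logits,
        }

scripted = torch.jit.script(PredictModuleSketch())  # scripts without errors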
25 changes: 20 additions & 5 deletions ludwig/features/text_feature.py
@@ -24,12 +24,12 @@
LAST_PREDICTIONS,
LENGTHS,
NAME,
PREDICTIONS,
PREPROCESSING,
PROBABILITIES,
PROBABILITY,
PROC_COLUMN,
TEXT,
TOKENS,
)
from ludwig.features.base_feature import BaseFeatureMixin, OutputFeature
from ludwig.features.feature_utils import compute_sequence_probability, compute_token_probabilities
@@ -323,7 +323,7 @@ def postprocess_predictions(
metadata,
):
# todo: refactor to reuse SequenceOutputFeature.postprocess_predictions
predictions_col = f"{self.feature_name}_{PREDICTIONS}"
tokens_col = f"{self.feature_name}_{TOKENS}"

tokenizer = None
if metadata["preprocessing"]["tokenizer"] == "hf_tokenizer":
@@ -333,17 +333,32 @@
metadata["preprocessing"]["pretrained_model_name_or_path"],
)

if predictions_col in result:
if tokens_col in result:
predicted_tokens = result[tokens_col]

def idx2str(pred):
def idx2token(pred):
if tokenizer is None:
return [
metadata["idx2str"][token] if token < len(metadata["idx2str"]) else UNKNOWN_SYMBOL
for token in pred
]
return tokenizer.tokenizer.batch_decode(pred, skip_special_tokens=True)

result[predictions_col] = result[predictions_col].map(idx2str)
result[tokens_col] = predicted_tokens.map(idx2token)

# Add concatenated tokens with spacing into a new column in the results dataframe

def idx2prediction(pred):
if tokenizer is None:
return " ".join(
[
metadata["idx2str"][token] if token < len(metadata["idx2str"]) else UNKNOWN_SYMBOL
for token in pred
]
)
return tokenizer.tokenizer.batch_decode([pred], skip_special_tokens=True)

result[f"{self.feature_name}_predictions"] = predicted_tokens.map(idx2prediction)

last_preds_col = f"{self.feature_name}_{LAST_PREDICTIONS}"
if last_preds_col in result:
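
The split between idx2token and idx2prediction above is easiest to see with a toy vocabulary. A runnable sketch of the non-HF tokenizer path, with the vocabulary and predicted indices invented for illustration:

idx2str = ["<EOS>", "hello", "world"]
UNKNOWN_SYMBOL = "<UNK>"

def idx2token(pred):
    # Token-level column: one decoded string per predicted index.
    return [idx2str[t] if t < len(idx2str) else UNKNOWN_SYMBOL for t in pred]

def idx2prediction(pred):
    # Sentence-level column: the same tokens joined with spaces.
    return " ".join(idx2token(pred))

pred = [1, 2, 7]
print(idx2token(pred))       # ['hello', 'world', '<UNK>']
print(idx2prediction(pred))  # hello world <UNK>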
7 changes: 4 additions & 3 deletions ludwig/modules/metric_modules.py
@@ -72,6 +72,7 @@
TEXT,
TIMESERIES,
TOKEN_ACCURACY,
TOKENS,
VECTOR,
)
from ludwig.distributed import get_current_dist_strategy
@@ -344,7 +345,7 @@ def get_current_value(self, preds: Tensor, target: Tensor) -> Tensor:
return self.sigmoid_cross_entropy_function(preds, target)


@register_metric(TOKEN_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, PREDICTIONS)
@register_metric(TOKEN_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, TOKENS)
class TokenAccuracyMetric(MeanMetric):
def __init__(self, **kwargs):
super().__init__()
@@ -356,7 +357,7 @@ def get_current_value(self, preds: Tensor, target: Tensor) -> Tensor:
return torch.mean(masked_correct_preds)


@register_metric(SEQUENCE_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, PREDICTIONS)
@register_metric(SEQUENCE_ACCURACY, [SEQUENCE, TEXT], MAXIMIZE, TOKENS)
class SequenceAccuracyMetric(MeanMetric):
def __init__(self, **kwargs):
super().__init__()
@@ -389,7 +390,7 @@ def get_current_value(self, preds: Tensor, target: Tensor):
return torch.exp(shifted_loss)


@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, PREDICTIONS)
@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, TOKENS)
class CharErrorRateMetric(CharErrorRate, LudwigMetric):
def __init__(self, **kwargs):
super().__init__()
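
The three decorator changes above repoint the sequence/text metrics at the "tokens" output instead of "predictions". For readers unfamiliar with the decorator, a hypothetical sketch of a registry keyed this way (not Ludwig's actual implementation):

from typing import Callable, Dict, List, Tuple

_METRIC_REGISTRY: Dict[Tuple[str, str], dict] = {}

def register_metric(name: str, feature_types: List[str], objective: str, output_key: str) -> Callable:
    def wrap(cls):
        for feature_type in feature_types:
            _METRIC_REGISTRY[(name, feature_type)] = {
                "cls": cls,
                "objective": objective,  # MAXIMIZE or MINIMIZE
                # Which entry of the model's output dict the metric reads;
                # "tokens" rather than "predictions" after this change.
                "output_key": output_key,
            }
        return cls
    return wrap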