From fa94f4d0fc07e2b16d357308784ba056ae7a0213 Mon Sep 17 00:00:00 2001
From: connor-mccorm <97468934+connor-mccorm@users.noreply.github.com>
Date: Mon, 16 Oct 2023 09:06:38 -0700
Subject: [PATCH] Set default text preprocessing lowercase behavior to False
 (#3721)

---
 ludwig/schema/features/preprocessing/text.py  |  4 ++--
 tests/integration_tests/test_preprocessing.py | 20 +++++++++----------
 tests/ludwig/utils/test_strings_utils.py      |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/ludwig/schema/features/preprocessing/text.py b/ludwig/schema/features/preprocessing/text.py
index 27b51f60ba1..ec4230fd098 100644
--- a/ludwig/schema/features/preprocessing/text.py
+++ b/ludwig/schema/features/preprocessing/text.py
@@ -89,7 +89,7 @@ class BaseTextPreprocessingConfig(BasePreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )
@@ -205,7 +205,7 @@ class TextOutputPreprocessingConfig(BaseTextPreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )
diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index 671327e8e12..b8a330a88ea 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -937,17 +937,17 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
 You are a helpful chatbot. USER: {__sample__}: {country}, {year:.2f} ASSISTANT: """
 
-expected_task_sample = """instruction: predict the output feature. return only values in {true, false}
+expected_task_sample = """Instruction: predict the output feature. Return only values in {true, false}
 ###
-examples:
+Examples:
 ###
-input: foo bar
-output: true
+Input: foo bar
+Output: true
 ###
-input: baz quc
-output: false
+Input: baz quc
+Output: false
 ###
-input:"""
+Input:"""
 
 
 @pytest.mark.llm
@@ -973,7 +973,7 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
                 category_feature(name="country"),
                 number_feature(name="year"),
             ],
-            ("you are a helpful chatbot. user: "),
+            ("You are a helpful chatbot. USER: "),
         ),
     ],
     ids=["task_sample", "multi_col"],
 )
@@ -988,7 +988,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
     data_df = pd.read_csv(data_csv)
     raw_values = [data_df[input_features[i][COLUMN]].values.tolist() for i in range(len(input_features))]
 
-    # Only use the first input featuere (text) and discard the others, which are only used for data gen
+    # Only use the first input feature (text) and discard the others, which are only used for data gen
     input_features = input_features[:1]
     config = {
         MODEL_TYPE: model_type,
@@ -1032,7 +1032,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
             # Test formatting in parametrize uses 2 decimal places of precision
             raw_text = f"{v:.2f}"
         else:
-            raw_text = str(v).lower()
+            raw_text = str(v)
         assert raw_text in decoded, f"'{raw_text}' not in '{decoded}'"
diff --git a/tests/ludwig/utils/test_strings_utils.py b/tests/ludwig/utils/test_strings_utils.py
index 1d23fde7817..14afce1f88b 100644
--- a/tests/ludwig/utils/test_strings_utils.py
+++ b/tests/ludwig/utils/test_strings_utils.py
@@ -78,7 +78,7 @@ def test_create_vocabulary_chars():
     )
     vocab = vocabulary.vocab
 
-    assert len(vocab) == 24
+    assert len(vocab) == 27
     assert vocab[strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
@@ -231,13 +231,13 @@ def test_create_vocabulary_idf(compute_idf: bool):
     # "sentence" and "and" should be next, as they appear in two docs each
     assert idf_sorted[1][0] > idf_sorted[0][0]
-    assert idf_sorted[1][1] == {"sentence", "and"}
+    assert idf_sorted[1][1] == {"sentence", "And"}
 
     # finally, every token that only appears once
     assert idf_sorted[2][0] > idf_sorted[1][0]
     assert idf_sorted[2][1] == {
         ",",
-        "i",
+        "I",
         "'",
         "one",
         "very",
@@ -246,7 +246,7 @@ def test_create_vocabulary_idf(compute_idf: bool):
         "m",
         "!",
         "last",
-        "hello",
+        "Hello",
         "a",
         "another",
     }
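With this patch, text features are no longer lowercased during preprocessing unless the flag is set explicitly. Below is a minimal sketch of a Ludwig config dict that opts back into the previous behavior for a single feature; the feature names "review" and "label" are hypothetical and used only for illustration.

    from ludwig.api import LudwigModel

    config = {
        "input_features": [
            {
                "name": "review",  # hypothetical text input feature
                "type": "text",
                # Explicitly restore the pre-change behavior for this feature.
                "preprocessing": {"lowercase": True},
            }
        ],
        "output_features": [
            {"name": "label", "type": "category"},  # hypothetical output feature
        ],
    }

    model = LudwigModel(config=config)

Leaving the flag unset now keeps the original casing of the input text, which is what the updated expected values in the tests above reflect.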