From fa94f4d0fc07e2b16d357308784ba056ae7a0213 Mon Sep 17 00:00:00 2001
From: connor-mccorm <97468934+connor-mccorm@users.noreply.github.com>
Date: Mon, 16 Oct 2023 09:06:38 -0700
Subject: [PATCH] Set default text preprocessing lowercase behavior to False
 (#3721)

---
 ludwig/schema/features/preprocessing/text.py  |  4 ++--
 tests/integration_tests/test_preprocessing.py | 20 +++++++++----------
 tests/ludwig/utils/test_strings_utils.py      |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/ludwig/schema/features/preprocessing/text.py b/ludwig/schema/features/preprocessing/text.py
index 27b51f60ba1..ec4230fd098 100644
--- a/ludwig/schema/features/preprocessing/text.py
+++ b/ludwig/schema/features/preprocessing/text.py
@@ -89,7 +89,7 @@ class BaseTextPreprocessingConfig(BasePreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )
@@ -205,7 +205,7 @@ class TextOutputPreprocessingConfig(BaseTextPreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )
diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
index 671327e8e12..b8a330a88ea 100644
--- a/tests/integration_tests/test_preprocessing.py
+++ b/tests/integration_tests/test_preprocessing.py
@@ -937,17 +937,17 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
 You are a helpful chatbot. USER: {__sample__}: {country}, {year:.2f} ASSISTANT: """
 
-expected_task_sample = """instruction: predict the output feature. return only values in {true, false}
+expected_task_sample = """Instruction: predict the output feature. Return only values in {true, false}
 ###
-examples:
+Examples:
 ###
-input: foo bar
-output: true
+Input: foo bar
+Output: true
 ###
-input: baz quc
-output: false
+Input: baz quc
+Output: false
 ###
-input:"""
+Input:"""
 
 
 @pytest.mark.llm
@@ -973,7 +973,7 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
                 category_feature(name="country"),
                 number_feature(name="year"),
             ],
-            ("you are a helpful chatbot. user: "),
+            ("You are a helpful chatbot. USER: "),
         ),
     ],
     ids=["task_sample", "multi_col"],
 )
@@ -988,7 +988,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
     data_df = pd.read_csv(data_csv)
     raw_values = [data_df[input_features[i][COLUMN]].values.tolist() for i in range(len(input_features))]
 
-    # Only use the first input featuere (text) and discard the others, which are only used for data gen
+    # Only use the first input feature (text) and discard the others, which are only used for data gen
     input_features = input_features[:1]
     config = {
         MODEL_TYPE: model_type,
@@ -1032,7 +1032,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
             # Test formatting in parametrize uses 2 decimal places of precision
             raw_text = f"{v:.2f}"
         else:
-            raw_text = str(v).lower()
+            raw_text = str(v)
         assert raw_text in decoded, f"'{raw_text}' not in '{decoded}'"
diff --git a/tests/ludwig/utils/test_strings_utils.py b/tests/ludwig/utils/test_strings_utils.py
index 1d23fde7817..14afce1f88b 100644
--- a/tests/ludwig/utils/test_strings_utils.py
+++ b/tests/ludwig/utils/test_strings_utils.py
@@ -78,7 +78,7 @@ def test_create_vocabulary_chars():
     )
     vocab = vocabulary.vocab
 
-    assert len(vocab) == 24
+    assert len(vocab) == 27
     assert vocab[strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
@@ -231,13 +231,13 @@ def test_create_vocabulary_idf(compute_idf: bool):
     # "sentence" and "and" should be next, as they appear in two docs each
     assert idf_sorted[1][0] > idf_sorted[0][0]
-    assert idf_sorted[1][1] == {"sentence", "and"}
+    assert idf_sorted[1][1] == {"sentence", "And"}
 
     # finally, every token that only appears once
     assert idf_sorted[2][0] > idf_sorted[1][0]
     assert idf_sorted[2][1] == {
         ",",
-        "i",
+        "I",
         "'",
         "one",
         "very",
@@ -246,7 +246,7 @@ def test_create_vocabulary_idf(compute_idf: bool):
         "m",
         "!",
         "last",
-        "hello",
+        "Hello",
         "a",
         "another",
     }
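With this patch, text features are no longer lowercased during preprocessing unless the flag is set explicitly. Below is a minimal sketch of a Ludwig config dict that opts back into the previous behavior for a single feature; the feature names "review" and "label" are hypothetical and used only for illustration.

    from ludwig.api import LudwigModel

    config = {
        "input_features": [
            {
                "name": "review",  # hypothetical text input feature
                "type": "text",
                # Explicitly restore the pre-change behavior for this feature.
                "preprocessing": {"lowercase": True},
            }
        ],
        "output_features": [
            {"name": "label", "type": "category"},  # hypothetical output feature
        ],
    }

    model = LudwigModel(config=config)

Leaving the flag unset now keeps the original casing of the input text, which is what the updated expected values in the tests above reflect.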