Merge branch 'master' into maintenance/alexsherstinsky/use_defined_constants_and_unquoted_typehints-2023_10_15-10
ludwig-ai · Oct 16, 2023 · 3b78b5e · 3b78b5e
2 parents 12d0f55 + fa94f4d
commit 3b78b5e
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 16 deletions.
diff --git a/ludwig/schema/features/preprocessing/text.py b/ludwig/schema/features/preprocessing/text.py
@@ -89,7 +89,7 @@ class BaseTextPreprocessingConfig(BasePreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )
@@ -205,7 +205,7 @@ class TextOutputPreprocessingConfig(BaseTextPreprocessingConfig):
     )
 
     lowercase: bool = schema_utils.Boolean(
-        default=True,
+        default=False,
         description="If true, converts the string to lowercase before tokenizing.",
         parameter_metadata=FEATURE_METADATA[TEXT][PREPROCESSING]["lowercase"],
     )

diff --git a/tests/integration_tests/test_preprocessing.py b/tests/integration_tests/test_preprocessing.py
@@ -937,17 +937,17 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
 You are a helpful chatbot. USER: {__sample__}: {country}, {year:.2f} ASSISTANT:
 """
 
-expected_task_sample = """instruction: predict the output feature. return only values in {true, false}
+expected_task_sample = """Instruction: predict the output feature. Return only values in {true, false}
 ###
-examples:
+Examples:
 ###
-input: foo bar
-output: true
+Input: foo bar
+Output: true
 ###
-input: baz quc
-output: false
+Input: baz quc
+Output: false
 ###
-input:"""
+Input:"""
 
 
 @pytest.mark.llm
@@ -973,7 +973,7 @@ def test_fill_with_mode_different_df_engine(tmpdir, csv_filename, df_engine, ray
                 category_feature(name="country"),
                 number_feature(name="year"),
             ],
-            ("you are a helpful chatbot. user: "),
+            ("You are a helpful chatbot. USER: "),
         ),
     ],
     ids=["task_sample", "multi_col"],
@@ -988,7 +988,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
     data_df = pd.read_csv(data_csv)
     raw_values = [data_df[input_features[i][COLUMN]].values.tolist() for i in range(len(input_features))]
 
-    # Only use the first input featuere (text) and discard the others, which are only used for data gen
+    # Only use the first input feature (text) and discard the others, which are only used for data gen
     input_features = input_features[:1]
     config = {
         MODEL_TYPE: model_type,
@@ -1032,7 +1032,7 @@ def test_prompt_template(input_features, expected, model_type, backend, tmpdir,
                 # Test formatting in parametrize uses 2 decimal places of precision
                 raw_text = f"{v:.2f}"
             else:
-                raw_text = str(v).lower()
+                raw_text = str(v)
             assert raw_text in decoded, f"'{raw_text}' not in '{decoded}'"
 
 

diff --git a/tests/ludwig/utils/test_strings_utils.py b/tests/ludwig/utils/test_strings_utils.py
@@ -78,7 +78,7 @@ def test_create_vocabulary_chars():
     )
     vocab = vocabulary.vocab
 
-    assert len(vocab) == 24
+    assert len(vocab) == 27
     assert vocab[strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
     assert vocab[strings_utils.SpecialSymbol.PADDING.value] == strings_utils.PADDING_SYMBOL
@@ -231,13 +231,13 @@ def test_create_vocabulary_idf(compute_idf: bool):
 
     # "sentence" and "and" should be next, as they appear in two docs each
     assert idf_sorted[1][0] > idf_sorted[0][0]
-    assert idf_sorted[1][1] == {"sentence", "and"}
+    assert idf_sorted[1][1] == {"sentence", "And"}
 
     # finally, every token that only appears once
     assert idf_sorted[2][0] > idf_sorted[1][0]
     assert idf_sorted[2][1] == {
         ",",
-        "i",
+        "I",
         "'",
         "one",
         "very",
@@ -246,7 +246,7 @@ def test_create_vocabulary_idf(compute_idf: bool):
         "m",
         "!",
         "last",
-        "hello",
+        "Hello",
         "a",
         "another",
     }