Add a feature flag that disables the new codec by default and add tests

ecmwf · Aug 1, 2024 · 58bcd0a · 58bcd0a
1 parent 89e8890
commit 58bcd0a
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 31 deletions.
diff --git a/src/pyodc/codec.py b/src/pyodc/codec.py
@@ -1,3 +1,4 @@
+import os
 import struct
 
 import pandas as pd
@@ -601,8 +602,10 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
             codec_class = ShortReal2
 
     elif data_type == DataType.STRING:
-        if data.nunique() == 1 and not data.hasnans:
-            codec_class = ConstantString if len(data.iloc[0]) <= 8 else LongConstantString
+        if data.nunique() == 1 and not data.hasnans and len(data.iloc[0]) <= 8:
+            codec_class = ConstantString
+        elif data.nunique() == 1 and not data.hasnans and "ODC_ENABLE_WRITING_LONG_STRING_CODEC" in os.environ:
+            codec_class = LongConstantString
         elif data.nunique() <= 256:
             codec_class = Int8String
         else:

diff --git a/tests/test_pyodc_codc_interop.py b/tests/test_pyodc_codc_interop.py
@@ -11,6 +11,7 @@
 
 import pandas as pd
 import numpy as np
+import os
 
 # Each case is a single column and the expected codec 
 testcases = [
@@ -26,8 +27,8 @@
     # Constant columns of strings of less than 8 bytes go into ConstantString
     [["abcd"] * 7, codec.ConstantString],
 
-    # Constant columns of strings of more than 8 bytes must be handled by the new LongConstantString codec
-    [["abcdefghi"] * 7, codec.LongConstantString],
+    # Constant columns of strings of more than 8 bytes must be handled differently
+    [["abcdefghi"] * 7, codec.Int8String],
 
     # Columns of strings with less than 2^n unique values go into Int8String or Int16String
     [["aoeu", "aoeu", "aaaaaaaooooooo", "None", "boo", "squiggle", "a"], codec.Int8String],
@@ -67,5 +68,26 @@ def test_codec_choice(testcase, encoder, decoder):
 
     assert type(codec) == expected_codec
 
+    # Check the data round tripped
+    numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)
+
+@pytest.mark.parametrize("encoder", odc_modules)
+@pytest.mark.parametrize("decoder", odc_modules)
+def test_codec_choice_long_string(encoder, decoder):
+    "Check that codc and pyodc choose the same codec for long constant strings in the presence of ODC_ENABLE_WRITING_LONG_STRING_CODEC"
+    testdata, expected_codec = [["abcdefghi"] * 7, codec.LongConstantString]
+    df = pd.DataFrame(dict(column = testdata))
+
+    os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"
+
+    with NamedTemporaryFile() as fencode:
+        encoder.encode_odb(df, fencode.name)
+        round_tripped_data = decoder.read_odb(fencode.name, single = True)
+        chosen_codec = first_codec(fencode.name)
+
+    del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]
+
+    assert type(chosen_codec) == expected_codec
+
     # Check the data round tripped
     numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)
diff --git a/tests/test_string_codecs.py b/tests/test_string_codecs.py
@@ -49,43 +49,50 @@ def test_normal_constant_string():
 
     _check_decode(cdc, encoded, "helloAAA")
 
+def check_codec_choice(testdata, expected_codec):
+    # Check that the correct codec is being selected
+    series = pd.Series(testdata)
+    selected_codec = codec.select_codec("column", series, DataType.STRING, False)
+    assert isinstance(selected_codec, expected_codec)
+
+    # Create a temporary stream
+    f = io.BytesIO()
+    st = LittleEndianStream(f)
+
+    # Encode the header and data for just this column        
+    selected_codec.encode_header(st)
+    for val in testdata: selected_codec.encode(st, val)
+    st.seek(0) # reset the stream to the start
+
+    # Check the header can be decoded correctly
+    decoded_codec = codec.read_codec(st)
+    assert decoded_codec.column_name == "column"
+    assert decoded_codec.type == DataType.STRING
+    assert decoded_codec.name == selected_codec.name
+
+    # Check the encoded data matches        
+    for val in testdata:
+        decoded_val = selected_codec.decode(st)
+        assert val == decoded_val
+
 def test_string_codec_selection():
     # Deliberately using strings on length 7,8,9 to catch edges cases
     testcases = [
         [["constan", "constan"], codec.ConstantString],
         [["constant", "constant"], codec.ConstantString],
-        [["longconst", "longconst"], codec.LongConstantString],
-        [["longconstant", "longconstant"], codec.LongConstantString],
+        [["longconst", "longconst"], codec.Int8String],
+        [["longconstant", "longconstant"], codec.Int8String],
         [["not", "constant", "longnotconstant"], codec.Int8String],
         [["longconstant"] + [str(num) for num in range(256)], codec.Int16String]
     ]
 
-
     for testdata, expected_codec in testcases:
-        # Check that the correct codec is being selected
-        series = pd.Series(testdata)
-        selected_codec = codec.select_codec("column", series, DataType.STRING, False)
-        assert isinstance(selected_codec, expected_codec)
-
-        # Create a temporary stream
-        f = io.BytesIO()
-        st = LittleEndianStream(f)
-
-        # Encode the header and data for just this column        
-        selected_codec.encode_header(st)
-        for val in testdata: selected_codec.encode(st, val)
-        st.seek(0) # reset the stream to the start
-
-        # Check the header can be decoded correctly
-        decoded_codec = codec.read_codec(st)
-        assert decoded_codec.column_name == "column"
-        assert decoded_codec.type == DataType.STRING
-        assert decoded_codec.name == selected_codec.name
-
-        # Check the encoded data matches        
-        for val in testdata:
-            decoded_val = selected_codec.decode(st)
-            assert val == decoded_val
+        check_codec_choice(testdata, expected_codec)
+
+    os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"
+    for testdata, expected_codec in testcases[2:3]:
+        check_codec_choice(testdata, codec.LongConstantString)
+    del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]
 
 
 @pytest.mark.parametrize("odyssey", odc_modules)