ecmwf · simondsmart · Aug 7, 2024 · Jun 18, 2023 · Aug 6, 2024
diff --git a/src/pyodc/codec.py b/src/pyodc/codec.py
@@ -1,3 +1,4 @@
+import os
 import struct
 
 import pandas as pd
@@ -167,7 +168,6 @@ def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType,
     def decode(self, stream):
         return struct.pack("<d", self.min).split(b"\x00", 1)[0].decode("utf-8")
 
-
 class NumericBase(Codec):
     _numChanges = None
     _data = None
@@ -435,6 +435,50 @@ def _decode(stream):
         return stream.readUInt16()
 
 
+class LongConstantString(Codec):  # codec for a string column with one repeated value: the string is written once in the header, nothing per row
+    value: str  # the single string that every row decodes to
+
+    def __init__(self, *args, value, **kwargs):  # `value` is keyword-only; everything else is forwarded to Codec
+        self.value = value
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_dataframe(cls, column_name: str, data: pd.Series, data_type: DataType, bitfields):  # build the codec from a column of data
+        assert not data.hasnans  # preconditions: no missing values, STRING type, no bitfields, exactly one distinct value
+        assert data_type == DataType.STRING
+        assert not bitfields
+        assert data.nunique() == 1
+        return cls(column_name, 0, 0, data_type, value=data.iloc[0])  # 0, 0 occupy the positional slots that from_stream fills with minval, maxval
+
+    @classmethod
+    def from_stream(cls, stream, column_name: str, data_type: DataType, bitfield_names, bitfield_sizes):  # rebuild the codec from an encoded header
+        has_missing, minval, maxval, missing_value = cls.read_core_header(stream)  # missing_value is read but not used here
+        value = stream.readString()  # the constant string follows the core header
+        return cls(
+            column_name,
+            minval,
+            maxval,
+            data_type,
+            value=value,
+            has_missing=has_missing,
+            bitfield_names=bitfield_names,
+            bitfield_sizes=bitfield_sizes,
+        )
+
+    def encode_header(self, stream):
+        super().encode_header(stream)
+        stream.encodeString(self.value)  # written after the base header; presumably the counterpart of readString in from_stream - confirm
+
+    def encode(self, stream, value):
+        pass  # intentionally a no-op: no per-row data is written
+
+    def decode(self, stream):
+        return self.value  # nothing is read from the stream; every row is the stored constant
+
+    @property
+    def numChanges(self):
+        return 0  # always reports zero
+
 def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
     # If data types are not specified, determine them from the pandas Series
 
@@ -497,8 +541,10 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
             codec_class = ShortReal2
 
     elif data_type == DataType.STRING:
-        if data.nunique() == 1 and len(data.iloc[0]) <= 8 and not data.hasnans:
+        if data.nunique() == 1 and not data.hasnans and len(data.iloc[0]) <= 8:
             codec_class = ConstantString
+        elif data.nunique() == 1 and not data.hasnans and "ODC_ENABLE_WRITING_LONG_STRING_CODEC" in os.environ:
+            codec_class = LongConstantString
         elif data.nunique() <= 256:
             codec_class = Int8String
         else:

diff --git a/tests/test_pyodc_codc_interop.py b/tests/test_pyodc_codc_interop.py
@@ -11,6 +11,7 @@
 
 import pandas as pd
 import numpy as np
+import os
 
 # Each case is a single column and the expected codec 
 testcases = [
@@ -26,8 +27,7 @@
     # Constant columns of strings of less than 8 bytes go into ConstantString
     [["abcd"] * 7, codec.ConstantString],
 
-    # Constant columns of strings of more than 8 bytes currently get promoted to 
-    #  codec.Int8String but working on a version of ConstantString that supports longer strings.
+    # Constant columns of strings of more than 8 bytes go into Int8String unless ODC_ENABLE_WRITING_LONG_STRING_CODEC is set
     [["abcdefghi"] * 7, codec.Int8String],
 
     # Columns of strings with less than 2^n unique values go into Int8String or Int16String
@@ -68,5 +68,26 @@ def test_codec_choice(testcase, encoder, decoder):
 
     assert type(codec) == expected_codec
 
+    # Check the data round tripped
+    numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)
+
+@pytest.mark.parametrize("encoder", odc_modules)
+@pytest.mark.parametrize("decoder", odc_modules)
+def test_codec_choice_long_string(encoder, decoder):
+    "Check that codc and pyodc choose the same codec for long constant strings in the presence of ODC_ENABLE_WRITING_LONG_STRING_CODEC"
+    testdata, expected_codec = [["abcdefghi"] * 7, codec.LongConstantString]  # 9 characters: one past the ConstantString limit
+    df = pd.DataFrame(dict(column = testdata))
+
+    os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"  # opt in to the long-string codec for this test only
+    try:  # clear the flag in `finally` so a failure while encoding/reading cannot leak it into later tests
+        with NamedTemporaryFile() as fencode:
+            encoder.encode_odb(df, fencode.name)
+            round_tripped_data = decoder.read_odb(fencode.name, single = True)
+            chosen_codec = first_codec(fencode.name)
+    finally:
+        del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]
+
+    assert type(chosen_codec) == expected_codec  # exact type match on purpose: the test is about which codec class was chosen
+
     # Check the data round tripped
     numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)
diff --git a/tests/test_string_codecs.py b/tests/test_string_codecs.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import numpy
 from tempfile import NamedTemporaryFile
+import io
 
 import pytest
 from conftest import odc_modules
@@ -48,6 +49,51 @@ def test_normal_constant_string():
 
     _check_decode(cdc, encoded, "helloAAA")
 
+def check_codec_choice(testdata, expected_codec):  # helper: select a codec for testdata, then round-trip header and values through an in-memory stream
+    # Check that select_codec picks the expected codec class (the final False means no bitfields)
+    series = pd.Series(testdata)
+    selected_codec = codec.select_codec("column", series, DataType.STRING, False)
+    assert isinstance(selected_codec, expected_codec)
+
+    # Create an in-memory stream to encode into
+    f = io.BytesIO()
+    st = LittleEndianStream(f)
+
+    # Encode the header and data for just this column
+    selected_codec.encode_header(st)
+    for val in testdata: selected_codec.encode(st, val)
+    st.seek(0) # reset the stream to the start
+
+    # Check the header can be read back and describes the same codec
+    decoded_codec = codec.read_codec(st)
+    assert decoded_codec.column_name == "column"
+    assert decoded_codec.type == DataType.STRING
+    assert decoded_codec.name == selected_codec.name
+
+    # Check each value decodes back to the original, reading on from where the header ended
+    for val in testdata:
+        decoded_val = selected_codec.decode(st)  # NOTE(review): decodes with selected_codec, not decoded_codec - confirm this is intended
+        assert val == decoded_val
+
+def test_string_codec_selection():
+    # Deliberately using strings of length 7, 8 and 9 to catch edge cases around the 8-character ConstantString limit
+    testcases = [
+        [["constan", "constan"], codec.ConstantString],
+        [["constant", "constant"], codec.ConstantString],
+        [["longconst", "longconst"], codec.Int8String],
+        [["longconstant", "longconstant"], codec.Int8String],
+        [["not", "constant", "longnotconstant"], codec.Int8String],
+        [["longconstant"] + [str(num) for num in range(256)], codec.Int16String]
+    ]
+    for testdata, expected_codec in testcases:  # default behaviour: the environment flag is not set
+        check_codec_choice(testdata, expected_codec)
+    os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"  # opt in to LongConstantString
+    try:  # clear the flag in `finally` so a failing check cannot leak it into later tests
+        for testdata, _ in testcases[2:4]:  # both long constant columns; the default expectation is overridden
+            check_codec_choice(testdata, codec.LongConstantString)
+    finally:
+        del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]
+
 
 @pytest.mark.parametrize("odyssey", odc_modules)
 def test_decode_odb1_missing_strings(odyssey):