Skip to content

Commit

Permalink
Add a feature flag that disables the new codec by default and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
TomHodson committed Aug 1, 2024
1 parent 89e8890 commit 58bcd0a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 31 deletions.
7 changes: 5 additions & 2 deletions src/pyodc/codec.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import struct

import pandas as pd
Expand Down Expand Up @@ -601,8 +602,10 @@ def select_codec(column_name: str, data: pd.Series, data_type, bitfields):
codec_class = ShortReal2

elif data_type == DataType.STRING:
if data.nunique() == 1 and not data.hasnans:
codec_class = ConstantString if len(data.iloc[0]) <= 8 else LongConstantString
if data.nunique() == 1 and not data.hasnans and len(data.iloc[0]) <= 8:
codec_class = ConstantString
elif data.nunique() == 1 and not data.hasnans and "ODC_ENABLE_WRITING_LONG_STRING_CODEC" in os.environ:
codec_class = LongConstantString
elif data.nunique() <= 256:
codec_class = Int8String
else:
Expand Down
26 changes: 24 additions & 2 deletions tests/test_pyodc_codc_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import pandas as pd
import numpy as np
import os

# Each case is a single column and the expected codec
testcases = [
Expand All @@ -26,8 +27,8 @@
# Constant columns of strings of less than 8 bytes go into ConstantString
[["abcd"] * 7, codec.ConstantString],

# Constant columns of strings of more than 8 bytes must be handled by the new LongConstantString codec
[["abcdefghi"] * 7, codec.LongConstantString],
# Constant columns of strings of more than 8 bytes must be handled differently
[["abcdefghi"] * 7, codec.Int8String],

# Columns of strings with less than 2^n unique values go into Int8String or Int16String
[["aoeu", "aoeu", "aaaaaaaooooooo", "None", "boo", "squiggle", "a"], codec.Int8String],
Expand Down Expand Up @@ -67,5 +68,26 @@ def test_codec_choice(testcase, encoder, decoder):

assert type(codec) == expected_codec

# Check the data round tripped
numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)

@pytest.mark.parametrize("encoder", odc_modules)
@pytest.mark.parametrize("decoder", odc_modules)
def test_codec_choice_long_string(encoder, decoder):
"Check that codc and pyodc choose the same codec for long constant strings in the presence of ODC_ENABLE_WRITING_LONG_STRING_CODEC"
testdata, expected_codec = [["abcdefghi"] * 7, codec.LongConstantString]
df = pd.DataFrame(dict(column = testdata))

os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"

with NamedTemporaryFile() as fencode:
encoder.encode_odb(df, fencode.name)
round_tripped_data = decoder.read_odb(fencode.name, single = True)
chosen_codec = first_codec(fencode.name)

del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]

assert type(chosen_codec) == expected_codec

# Check the data round tripped
numpy.testing.assert_array_equal(df.column.values, round_tripped_data.column.values)
61 changes: 34 additions & 27 deletions tests/test_string_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,43 +49,50 @@ def test_normal_constant_string():

_check_decode(cdc, encoded, "helloAAA")

def check_codec_choice(testdata, expected_codec):
# Check that the correct codec is being selected
series = pd.Series(testdata)
selected_codec = codec.select_codec("column", series, DataType.STRING, False)
assert isinstance(selected_codec, expected_codec)

# Create a temporary stream
f = io.BytesIO()
st = LittleEndianStream(f)

# Encode the header and data for just this column
selected_codec.encode_header(st)
for val in testdata: selected_codec.encode(st, val)
st.seek(0) # reset the stream to the start

# Check the header can be decoded correctly
decoded_codec = codec.read_codec(st)
assert decoded_codec.column_name == "column"
assert decoded_codec.type == DataType.STRING
assert decoded_codec.name == selected_codec.name

# Check the encoded data matches
for val in testdata:
decoded_val = selected_codec.decode(st)
assert val == decoded_val

def test_string_codec_selection():
# Deliberately using strings on length 7,8,9 to catch edges cases
testcases = [
[["constan", "constan"], codec.ConstantString],
[["constant", "constant"], codec.ConstantString],
[["longconst", "longconst"], codec.LongConstantString],
[["longconstant", "longconstant"], codec.LongConstantString],
[["longconst", "longconst"], codec.Int8String],
[["longconstant", "longconstant"], codec.Int8String],
[["not", "constant", "longnotconstant"], codec.Int8String],
[["longconstant"] + [str(num) for num in range(256)], codec.Int16String]
]


for testdata, expected_codec in testcases:
# Check that the correct codec is being selected
series = pd.Series(testdata)
selected_codec = codec.select_codec("column", series, DataType.STRING, False)
assert isinstance(selected_codec, expected_codec)

# Create a temporary stream
f = io.BytesIO()
st = LittleEndianStream(f)

# Encode the header and data for just this column
selected_codec.encode_header(st)
for val in testdata: selected_codec.encode(st, val)
st.seek(0) # reset the stream to the start

# Check the header can be decoded correctly
decoded_codec = codec.read_codec(st)
assert decoded_codec.column_name == "column"
assert decoded_codec.type == DataType.STRING
assert decoded_codec.name == selected_codec.name

# Check the encoded data matches
for val in testdata:
decoded_val = selected_codec.decode(st)
assert val == decoded_val
check_codec_choice(testdata, expected_codec)

os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"] = "true"
for testdata, expected_codec in testcases[2:3]:
check_codec_choice(testdata, codec.LongConstantString)
del os.environ["ODC_ENABLE_WRITING_LONG_STRING_CODEC"]


@pytest.mark.parametrize("odyssey", odc_modules)
Expand Down

0 comments on commit 58bcd0a

Please sign in to comment.