Skip to content

Commit

Permalink
make default string encoding utf-8 to handle non-ascii X's and y's
Browse files: browse the repository at this point in the history
  • Loading branch information
Elijah Rippeth authored and fgregg committed Sep 30, 2024
1 parent 75c3081 commit 4014eb0
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 19 deletions.
2 changes: 1 addition & 1 deletion pycrfsuite/_pycrfsuite.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# cython: embedsignature=True
# cython: c_string_type=str
- # cython: c_string_encoding=ascii
+ # cython: c_string_encoding=utf-8
# cython: profile=False
# distutils: language=c++
from . cimport crfsuite_api
Expand Down
21 changes: 16 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,27 @@ def xseq():
{"clean": 1, "shop": 0.1},
{"walk": 1, "shop": 0.5},
{},
- {'clean': 1},
- {u'солнце': u'не светит'.encode('utf8'), 'clean': 1},
- {'world': 2}
+ {"clean": 1},
+ {"солнце": "не светит".encode(), "clean": 1},
+ {"world": 2},
]


@pytest.fixture
def yseq():
- return ['sunny', 'sunny', u'sunny', 'rainy', 'rainy', 'rainy',
- 'sunny', 'sunny', 'rainy', 'rainy', '好']
+ return [
+ "sunny",
+ "sunny",
+ "sunny",
+ "rainy",
+ "rainy",
+ "rainy",
+ "sunny",
+ "sunny",
+ "rainy",
+ "rainy",
+ "好",
+ ]


@pytest.fixture
Expand Down
31 changes: 18 additions & 13 deletions tests/test_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,17 @@ def test_info(model_filename):
with Tagger().open(model_filename) as tagger:
res = tagger.info()

- assert res.transitions[('sunny', 'sunny')] > res.transitions[('sunny', 'rainy')]
- assert res.state_features[('walk', 'sunny')] > res.state_features[('walk', 'rainy')]
- assert (u'солнце:не светит', u'rainy') in res.state_features
- assert res.header['num_labels'] == '3'
- assert set(res.labels.keys()) == set(['sunny', 'rainy', '好'])
- assert set(res.attributes.keys()) == set(['shop', 'walk', 'clean', u'солнце:не светит', 'world'])
+ assert res.transitions[("sunny", "sunny")] > res.transitions[("sunny", "rainy")]
+ assert (
+ res.state_features[("walk", "sunny")]
+ > res.state_features[("walk", "rainy")]
+ )
+ assert ("солнце:не светит", "rainy") in res.state_features
+ assert res.header["num_labels"] == "3"
+ assert set(res.labels.keys()) == set(["sunny", "rainy", "好"])
+ assert set(res.attributes.keys()) == set(
+ ["shop", "walk", "clean", "солнце:не светит", "world"]
+ )

# it shouldn't segfault on a closed tagger
with pytest.raises(RuntimeError):
Expand Down Expand Up @@ -222,13 +227,13 @@ def test_append_nested_dicts(tmpdir):
with Tagger().open(model_filename) as tagger:
info = tagger.info()
assert set(info.attributes.keys()) == {
- "foo:bar:baz",
- "foo:spam",
- "foo:egg:x",
- "foo:egg:y",
- "foo:ham:x",
- "foo:ham:y",
- "foo:bar:ham",
+ "foo:bar:baz",
+ "foo:spam",
+ "foo:egg:x",
+ "foo:egg:y",
+ "foo:ham:x",
+ "foo:ham:y",
+ "foo:bar:ham",
}

for feat in ["foo:bar:baz", "foo:spam", "foo:egg:x", "foo:egg:y"]:
Expand Down

0 comments on commit 4014eb0

Please sign in to comment.