Fix a bug causing a sentence not to end on an empty line if the previous line ended with whitespace.
foxik committed Nov 15, 2023
1 parent f332b5f commit 217403e
Showing 7 changed files with 133 additions and 137 deletions.
4 changes: 4 additions & 0 deletions CHANGES
@@ -1,5 +1,9 @@
Version 1.11.2-dev
------------------
- Fix a bug in the tokenizers, which did not end a sentence on an
  empty line if the preceding line ended with whitespace. However,
  only the REST server and the API were affected; the `run_*` binaries
  read input a paragraph at a time, and so circumvent the problem.


Version 1.11.1 [18 Feb 23]
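Concretely: given the input "Hello. \n\nWorld." (note the trailing space before the first newline), the old sentence-break rule `eol eol` could not match, because the whitespace run separating the tokens is " \n\n" and does not begin with two adjacent newlines; the run was apparently consumed as ordinary whitespace, so both sentences stayed together. The corrected rule in the two .rl grammars below allows optional whitespace in front of the empty line; a standalone sketch of both rules follows the czech_tokenizer.rl hunk.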
80 changes: 39 additions & 41 deletions src/tokenizer/czech_tokenizer.cpp
@@ -17,13 +17,13 @@ namespace morphodita {
static const char _czech_tokenizer_cond_offsets[] = {
0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2
+ 2, 2, 2, 2
};

static const char _czech_tokenizer_cond_lengths[] = {
0, 0, 0, 0, 0, 0, 0, 2,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0
+ 0, 0, 0, 0
};

static const short _czech_tokenizer_cond_keys[] = {
@@ -36,8 +36,8 @@ static const char _czech_tokenizer_cond_spaces[] = {

static const unsigned char _czech_tokenizer_key_offsets[] = {
0, 0, 17, 29, 43, 46, 51, 54,
- 89, 94, 98, 101, 105, 110, 111, 116,
- 117, 122, 136, 143, 148, 151, 163
+ 89, 94, 99, 100, 105, 106, 111, 125,
+ 132, 137, 140, 152
};

static const short _czech_tokenizer_trans_keys[] = {
@@ -52,34 +52,33 @@ static const short _czech_tokenizer_trans_keys[] = {
131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
- 255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
- 32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
- 147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
- 10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
- 32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
- 93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
- 44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
- 101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
- 131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
- 122u, 142u, 143u, 159u, 48u, 57u, 0
+ 255u, 9u, 10u, 13u, 32u, 147u, 9u, 10u,
+ 13u, 32u, 147u, 13u, 9u, 10u, 13u, 32u,
+ 147u, 10u, 9u, 10u, 13u, 32u, 147u, 13u,
+ 32u, 34u, 39u, 41u, 59u, 93u, 125u, 139u,
+ 141u, 147u, 161u, 9u, 10u, 44u, 46u, 69u,
+ 101u, 159u, 48u, 57u, 69u, 101u, 159u, 48u,
+ 57u, 159u, 48u, 57u, 129u, 131u, 135u, 151u,
+ 155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
+ 159u, 48u, 57u, 0
};

static const char _czech_tokenizer_single_lengths[] = {
0, 13, 10, 12, 1, 3, 1, 21,
- 5, 4, 3, 4, 5, 1, 5, 1,
- 5, 12, 5, 3, 1, 6, 1
+ 5, 5, 1, 5, 1, 5, 12, 5,
+ 3, 1, 6, 1
};

static const char _czech_tokenizer_range_lengths[] = {
0, 2, 1, 1, 1, 1, 1, 7,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 1, 1, 1, 3, 1
+ 0, 0, 0, 0, 0, 0, 1, 1,
+ 1, 1, 3, 1
};

static const unsigned char _czech_tokenizer_index_offsets[] = {
0, 0, 16, 28, 42, 45, 50, 53,
- 82, 88, 93, 97, 102, 108, 110, 116,
- 118, 124, 138, 145, 150, 153, 163
+ 82, 88, 94, 96, 102, 104, 110, 124,
+ 131, 136, 139, 149
};

static const char _czech_tokenizer_indicies[] = {
@@ -93,49 +92,48 @@ static const char _czech_tokenizer_indicies[] = {
10, 13, 9, 13, 9, 13, 16, 16,
16, 16, 10, 16, 15, 13, 9, 17,
9, 17, 9, 15, 9, 16, 9, 16,
- 9, 14, 10, 19, 20, 10, 10, 18,
- 10, 21, 10, 10, 18, 10, 10, 10,
- 18, 10, 21, 10, 10, 18, 10, 22,
- 23, 10, 10, 18, 25, 24, 10, 22,
- 26, 10, 10, 18, 25, 24, 10, 23,
- 26, 10, 10, 18, 4, 4, 5, 5,
- 5, 5, 5, 5, 5, 5, 4, 5,
- 4, 27, 28, 28, 29, 29, 15, 15,
- 27, 29, 29, 6, 6, 27, 8, 8,
- 27, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 27, 15, 15, 27, 0
+ 9, 14, 10, 11, 12, 10, 10, 18,
+ 10, 19, 20, 10, 10, 18, 22, 21,
+ 10, 19, 23, 10, 10, 18, 22, 21,
+ 10, 20, 23, 10, 10, 18, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 4, 5, 4, 24, 25, 25, 26, 26,
+ 15, 15, 24, 26, 26, 6, 6, 24,
+ 8, 8, 24, 16, 16, 16, 16, 16,
+ 16, 16, 16, 24, 15, 15, 24,
+ 0
};

static const char _czech_tokenizer_trans_targs[] = {
- 7, 1, 2, 7, 1, 3, 19, 6,
- 20, 7, 8, 12, 16, 17, 0, 18,
- 21, 22, 7, 9, 11, 10, 13, 14,
- 7, 7, 15, 7, 4, 5
+ 7, 1, 2, 7, 1, 3, 16, 6,
+ 17, 7, 8, 9, 13, 14, 0, 15,
+ 18, 19, 7, 10, 11, 7, 7, 12,
+ 7, 4, 5
};

static const char _czech_tokenizer_trans_actions[] = {
1, 0, 0, 2, 3, 0, 4, 0,
0, 7, 0, 0, 0, 4, 0, 4,
- 0, 0, 8, 0, 0, 0, 0, 0,
- 9, 10, 0, 11, 0, 0
+ 0, 0, 8, 0, 0, 9, 10, 0,
+ 11, 0, 0
};

static const char _czech_tokenizer_to_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 5,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0
+ 0, 0, 0, 0
};

static const char _czech_tokenizer_from_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 6,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0
+ 0, 0, 0, 0
};

static const unsigned char _czech_tokenizer_eof_trans[] = {
0, 1, 1, 1, 1, 1, 1, 0,
- 19, 19, 19, 19, 19, 25, 19, 25,
- 19, 28, 28, 28, 28, 28, 28
+ 19, 19, 22, 19, 22, 19, 25, 25,
+ 25, 25, 25, 25
};

static const int czech_tokenizer_start = 7;
2 changes: 1 addition & 1 deletion src/tokenizer/czech_tokenizer.rl
@@ -152,7 +152,7 @@ bool czech_tokenizer::next_sentence(vector<token_range>& tokens) {
fhold;
};

- eol eol
+ (whitespace* eol -- eol eol) eol
{
if (!tokens.empty()) fbreak;
current = te;
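The one-line grammar change above is the entire fix. In Ragel, `--` is the strong difference operator: the machine on the left must not contain any match of the machine on the right as a substring. `(whitespace* eol -- eol eol) eol` therefore reads: a whitespace run that ends in a newline and contains no two adjacent newlines, followed by one more newline; in other words, optional trailing blanks, the end of the previous line, then a truly empty line. The standalone C++ sketch below mirrors the old and new rules on a single whitespace run. It is an illustration under simplifying assumptions, not MorphoDiTa code: the function names are invented here, whitespace is reduced to space, tab and '\n', and the grammar's richer whitespace/eol classes (carriage returns, Unicode spaces) are ignored.

#include <iostream>
#include <string>

// Sketch only (not MorphoDiTa code): decides whether an entire whitespace
// run between two tokens is a sentence break under each rule. Whitespace
// is simplified to ' ', '\t' and '\n'.

// Old rule "eol eol": the break matched only when the run began with two
// adjacent newlines, so a trailing blank before the empty line hid it.
static bool old_rule_breaks(const std::string& run) {
  return run.size() >= 2 && run[0] == '\n' && run[1] == '\n';
}

// New rule "(whitespace* eol -- eol eol) eol": the run may start with any
// whitespace, must end in two adjacent newlines, and may not contain an
// earlier adjacent pair (the "-- eol eol" exclusion), so the break fires
// exactly at the first empty line.
static bool new_rule_breaks(const std::string& run) {
  size_t n = run.size();
  if (n < 2 || run[n - 1] != '\n' || run[n - 2] != '\n') return false;
  for (size_t i = 0; i + 2 < n; i++) {
    char c = run[i];
    if (c != ' ' && c != '\t' && c != '\n') return false;  // whitespace only
    if (c == '\n' && run[i + 1] == '\n') return false;     // no earlier "\n\n"
  }
  return true;
}

int main() {
  std::cout << old_rule_breaks(" \n\n")    // 0: the reported bug
            << new_rule_breaks(" \n\n")    // 1: fixed, trailing blank allowed
            << new_rule_breaks("\n\n")     // 1: a plain empty line still breaks
            << new_rule_breaks(" \n \n")   // 0: that "empty" line has a space
            << '\n';
}

On the buggy input the run " \n\n" is rejected by the old rule and matched as a whole by the new one, which is exactly what the changelog entry describes.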
101 changes: 49 additions & 52 deletions src/tokenizer/english_tokenizer.cpp
@@ -234,14 +234,14 @@ static const char _english_tokenizer_cond_offsets[] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2
+ 2, 2
};

static const char _english_tokenizer_cond_lengths[] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
+ 0, 0
};

static const short _english_tokenizer_cond_keys[] = {
@@ -254,9 +254,9 @@ static const char _english_tokenizer_cond_spaces[] = {

static const unsigned char _english_tokenizer_key_offsets[] = {
0, 0, 17, 29, 43, 46, 49, 52,
- 55, 60, 63, 98, 103, 107, 110, 114,
- 119, 120, 125, 126, 131, 145, 152, 156,
- 161, 164, 179, 192, 206
+ 55, 60, 63, 98, 103, 108, 109, 114,
+ 115, 120, 134, 141, 145, 150, 153, 168,
+ 181, 195
};

static const short _english_tokenizer_trans_keys[] = {
@@ -273,41 +273,39 @@ static const short _english_tokenizer_trans_keys[] = {
301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u,
58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u,
123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u,
- 13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u,
- 32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u,
- 9u, 10u, 13u, 32u, 147u, 10u, 9u, 10u,
- 13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u,
- 59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u,
- 10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u,
- 44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u,
- 57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u,
+ 10u, 13u, 32u, 147u, 13u, 9u, 10u, 13u,
+ 32u, 147u, 10u, 9u, 10u, 13u, 32u, 147u,
+ 13u, 32u, 34u, 39u, 41u, 59u, 93u, 125u,
+ 139u, 141u, 147u, 161u, 9u, 10u, 44u, 46u,
+ 69u, 101u, 159u, 48u, 57u, 44u, 46u, 69u,
+ 101u, 69u, 101u, 159u, 48u, 57u, 159u, 48u,
+ 57u, 39u, 45u, 129u, 131u, 135u, 151u, 155u,
+ 157u, 161u, 65u, 90u, 97u, 122u, 142u, 143u,
+ 45u, 129u, 131u, 135u, 151u, 155u, 157u, 65u,
+ 90u, 97u, 122u, 142u, 143u, 39u, 129u, 131u,
135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u,
- 122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u,
- 155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
- 39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u,
- 65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u,
- 57u, 0
+ 122u, 142u, 143u, 159u, 48u, 57u, 0
};

static const char _english_tokenizer_single_lengths[] = {
0, 13, 10, 12, 1, 1, 1, 1,
- 3, 1, 21, 5, 4, 3, 4, 5,
- 1, 5, 1, 5, 12, 5, 4, 3,
- 1, 9, 7, 8, 1
+ 3, 1, 21, 5, 5, 1, 5, 1,
+ 5, 12, 5, 4, 3, 1, 9, 7,
+ 8, 1
};

static const char _english_tokenizer_range_lengths[] = {
0, 2, 1, 1, 1, 1, 1, 1,
1, 1, 7, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1, 1, 0, 1,
- 1, 3, 3, 3, 1
+ 0, 1, 1, 0, 1, 1, 3, 3,
+ 3, 1
};

static const unsigned char _english_tokenizer_index_offsets[] = {
0, 0, 16, 28, 42, 45, 48, 51,
- 54, 59, 62, 91, 97, 102, 106, 111,
- 117, 119, 125, 127, 133, 147, 154, 159,
- 164, 167, 180, 191, 203
+ 54, 59, 62, 91, 97, 103, 105, 111,
+ 113, 119, 133, 140, 145, 150, 153, 166,
+ 177, 189
};

static const char _english_tokenizer_indicies[] = {
@@ -322,58 +320,57 @@ static const char _english_tokenizer_indicies[] = {
15, 13, 16, 12, 16, 12, 16, 19,
19, 19, 19, 13, 19, 18, 16, 12,
20, 12, 20, 12, 18, 12, 19, 12,
- 19, 12, 17, 13, 22, 23, 13, 13,
- 21, 13, 24, 13, 13, 21, 13, 13,
- 13, 21, 13, 24, 13, 13, 21, 13,
- 25, 26, 13, 13, 21, 28, 27, 13,
- 25, 29, 13, 13, 21, 28, 27, 13,
- 26, 29, 13, 13, 21, 4, 4, 5,
- 5, 5, 5, 5, 5, 5, 5, 5, 4,
- 5, 4, 30, 31, 32, 33, 33, 18,
- 18, 30, 31, 32, 33, 33, 30, 33,
- 33, 9, 9, 30, 11, 11, 30, 34,
- 35, 19, 19, 19, 19, 19, 19, 34,
- 19, 19, 19, 30, 35, 19, 19, 19,
- 19, 19, 19, 19, 19, 19, 30, 34,
- 19, 19, 19, 19, 19, 19, 34, 19,
- 19, 19, 30, 18, 18, 30, 0
+ 19, 12, 17, 13, 14, 15, 13, 13,
+ 21, 13, 22, 23, 13, 13, 21, 25,
+ 24, 13, 22, 26, 13, 13, 21, 25,
+ 24, 13, 23, 26, 13, 13, 21, 4,
+ 4, 5, 5, 5, 5, 5, 5, 5,
+ 5, 4, 5, 4, 27, 28, 29, 30,
+ 30, 18, 18, 27, 28, 29, 30, 30,
+ 27, 30, 30, 9, 9, 27, 11, 11,
+ 27, 31, 32, 19, 19, 19, 19, 19,
+ 19, 31, 19, 19, 19, 27, 32, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19,
+ 27, 31, 19, 19, 19, 19, 19, 19,
+ 31, 19, 19, 19, 27, 18, 18, 27,
+ 0
};

static const char _english_tokenizer_trans_targs[] = {
10, 1, 2, 10, 1, 3, 5, 6,
- 22, 23, 9, 24, 10, 11, 15, 19,
- 20, 0, 21, 25, 28, 10, 12, 14,
- 13, 16, 17, 10, 10, 18, 10, 4,
- 7, 8, 26, 27
+ 19, 20, 9, 21, 10, 11, 12, 16,
+ 17, 0, 18, 22, 25, 10, 13, 14,
+ 10, 10, 15, 10, 4, 7, 8, 23,
+ 24
};

static const char _english_tokenizer_trans_actions[] = {
1, 0, 0, 2, 3, 0, 0, 0,
4, 4, 0, 0, 7, 0, 0, 0,
4, 0, 4, 0, 0, 8, 0, 0,
- 0, 0, 0, 9, 10, 0, 11, 0,
- 0, 0, 0, 0
+ 9, 10, 0, 11, 0, 0, 0, 0,
+ 0
};

static const char _english_tokenizer_to_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 5, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
+ 0, 0
};

static const char _english_tokenizer_from_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 6, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0
+ 0, 0
};

static const unsigned char _english_tokenizer_eof_trans[] = {
0, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 0, 22, 22, 22, 22, 22,
- 28, 22, 28, 22, 31, 31, 31, 31,
- 31, 31, 31, 31, 31
+ 1, 1, 0, 22, 22, 25, 22, 25,
+ 22, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28
};

static const int english_tokenizer_start = 10;
2 changes: 1 addition & 1 deletion src/tokenizer/english_tokenizer.rl
@@ -137,7 +137,7 @@ bool english_tokenizer::next_sentence(vector<token_range>& tokens) {
fhold;
};

- eol eol
+ (whitespace* eol -- eol eol) eol
{
if (!tokens.empty()) fbreak;
current = te;
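The English grammar receives the identical rule change. Its footprint in the generated code above is mechanical: the reworked pattern lets Ragel fold away three whitespace-handling scanner states per tokenizer, so every per-state table shrinks by three entries (23 to 20 states for Czech, 29 to 26 for English), which is where most of the 133 additions and 137 deletions come from.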