diff --git a/Changes b/Changes index dd48d28..9af4aaa 100644 --- a/Changes +++ b/Changes @@ -1,3 +1,7 @@ +0.2.2 2023-09-06 + - Fix behaviour for end of text character positions + when no end of sentence occurred before. + 0.2.1 2023-09-05 - Add english tokenizer. - Fix buffer bug. diff --git a/datok.go b/datok.go index fba655e..1dd5cbe 100644 --- a/datok.go +++ b/datok.go @@ -1018,6 +1018,10 @@ PARSECHAR: if eot { eot = false + if !sentenceEnd { + sentenceEnd = true + w.SentenceEnd(buffc) + } textEnd = true w.TextEnd(0) if DEBUG { diff --git a/matrix.go b/matrix.go index e2d9858..7eda112 100644 --- a/matrix.go +++ b/matrix.go @@ -592,6 +592,10 @@ PARSECHARM: if eot { eot = false + if !sentenceEnd { + sentenceEnd = true + w.SentenceEnd(buffc) + } textEnd = true w.TextEnd(buffc) rewindBuffer = true diff --git a/token_writer_test.go b/token_writer_test.go index 868e69d..63b9c2b 100644 --- a/token_writer_test.go +++ b/token_writer_test.go @@ -85,6 +85,18 @@ func TestTokenWriterFromOptions(t *testing.T) { matStr = w.String() assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr) + w.Reset() + mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws) + + matStr = w.String() + assert.Equal("0 4\n0 4\n", matStr) + + w.Reset() + mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws) + + matStr = w.String() + assert.Equal("0 4 4 5\n0 5\n", matStr) + // // Write sentence offsets without token offsets tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)