Skip to content

Commit

Permalink
Fix end of text behaviour in case of sentence positions
Browse files Browse the repository at this point in the history
Change-Id: Ic433dd3579d9a79df5734a405e682596c3ccddad
  • Loading branch information
Akron committed Sep 6, 2023
1 parent 78d270d commit f66dc14
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 0 deletions.
4 changes: 4 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.2.2 2023-09-06
- Fix behaviour for end of text character positions
when no end of sentence occurred before.

0.2.1 2023-09-05
- Add english tokenizer.
- Fix buffer bug.
Expand Down
4 changes: 4 additions & 0 deletions datok.go
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,10 @@ PARSECHAR:

if eot {
eot = false
if !sentenceEnd {
sentenceEnd = true
w.SentenceEnd(buffc)
}
textEnd = true
w.TextEnd(0)
if DEBUG {
Expand Down
4 changes: 4 additions & 0 deletions matrix.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,10 @@ PARSECHARM:

if eot {
eot = false
if !sentenceEnd {
sentenceEnd = true
w.SentenceEnd(buffc)
}
textEnd = true
w.TextEnd(buffc)
rewindBuffer = true
Expand Down
12 changes: 12 additions & 0 deletions token_writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,18 @@ func TestTokenWriterFromOptions(t *testing.T) {
matStr = w.String()
assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)

w.Reset()
mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws)

matStr = w.String()
assert.Equal("0 4\n0 4\n", matStr)

w.Reset()
mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws)

matStr = w.String()
assert.Equal("0 4 4 5\n0 5\n", matStr)

//
// Write sentence offsets without token offsets
tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)
Expand Down

0 comments on commit f66dc14

Please sign in to comment.