From f7164bcb545dab662f79729a918c5bd55feb8cc1 Mon Sep 17 00:00:00 2001
From: zzwx <8169082+zzwx@users.noreply.github.com>
Date: Mon, 30 Nov 2020 08:04:02 -0500
Subject: [PATCH] Start, end of tokens in sanitized text

---
 tokenize.go      |  62 +++++++++++++++++-----------
 tokenize_test.go | 104 ++++++++++++++++++++++++++++++++++++++++++-----
 types.go         |   2 +
 utilities.go     |  12 +++---
 4 files changed, 141 insertions(+), 39 deletions(-)

diff --git a/tokenize.go b/tokenize.go
index 0b03a34..d5a15a4 100644
--- a/tokenize.go
+++ b/tokenize.go
@@ -15,12 +15,12 @@ type Tokenizer interface {
 
 // iterTokenizer splits a sentence into words.
 type iterTokenizer struct {
-	specialRE    *regexp.Regexp
-	sanitizer    *strings.Replacer
-	contractions []string
-	suffixes     []string
-	prefixes     []string
-	emoticons    map[string]int
+	specialRE      *regexp.Regexp
+	sanitizer      *strings.Replacer
+	contractions   []string
+	suffixes       []string
+	prefixes       []string
+	emoticons      map[string]int
 	isUnsplittable TokenTester
 }
 
@@ -96,9 +96,9 @@ func NewIterTokenizer(opts ...TokenizerOptFunc) *iterTokenizer {
 	return tok
 }
 
-func addToken(s string, toks []*Token) []*Token {
+func addToken(toks []*Token, s string, from, to int) []*Token {
 	if strings.TrimSpace(s) != "" {
-		toks = append(toks, &Token{Text: s})
+		toks = append(toks, &Token{Text: s, Start: from, End: to})
 	}
 	return toks
 }
@@ -108,7 +108,7 @@ func (t *iterTokenizer) isSpecial(token string) bool {
 	return found || t.specialRE.MatchString(token) || t.isUnsplittable(token)
 }
 
-func (t *iterTokenizer) doSplit(token string) []*Token {
+func (t *iterTokenizer) doSplit(token string, offset int) []*Token {
 	tokens := []*Token{}
 	suffs := []*Token{}
 
@@ -117,36 +117,44 @@ func (t *iterTokenizer) doSplit(token string) []*Token {
 		if t.isSpecial(token) {
 			// We've found a special case (e.g., an emoticon) -- so, we add it as a token without
 			// any further processing.
-			tokens = addToken(token, tokens)
+			tokens = addToken(tokens, token, offset, offset+len(token))
 			break
 		}
 		last = utf8.RuneCountInString(token)
 		lower := strings.ToLower(token)
-		if hasAnyPrefix(token, t.prefixes) {
+		if length := hasAnyPrefix(token, t.prefixes); length > 0 {
 			// Remove prefixes -- e.g., $100 -> [$, 100].
-			tokens = addToken(string(token[0]), tokens)
-			token = token[1:]
+			tokens = addToken(tokens, token[:length], offset, offset+length)
+			token = token[length:]
+			offset += length
 		} else if idx := hasAnyIndex(lower, t.contractions); idx > -1 {
 			// Handle "they'll", "I'll", "Don't", "won't", etc.
 			//
 			// they'll -> [they, 'll].
 			// don't -> [do, n't].
-			tokens = addToken(token[:idx], tokens)
+			tokens = addToken(tokens, token[:idx], offset, offset+idx)
 			token = token[idx:]
-		} else if hasAnySuffix(token, t.suffixes) {
+			offset += idx
+		} else if length := hasAnySuffix(token, t.suffixes); length > 0 {
 			// Remove suffixes -- e.g., Well) -> [Well, )].
 			suffs = append([]*Token{
-				{Text: string(token[len(token)-1])}},
+				{Text: string(token[len(token)-length]),
+					Start: offset + len(token) - length,
+					End:   offset + len(token)}},
 				suffs...)
 			token = token[:len(token)-1]
 		} else {
-			tokens = addToken(token, tokens)
+			tokens = addToken(tokens, token, offset, offset+len(token))
 		}
 	}
-
 	return append(tokens, suffs...)
 }
 
+type tokensOffset struct {
+	tl     []*Token
+	offset int
+}
+
 // tokenize splits a sentence into a slice of words.
 func (t *iterTokenizer) Tokenize(text string) []*Token {
 	tokens := []*Token{}
 
@@ -155,7 +163,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
 	length := len(clean)
 	start, index := 0, 0
 
-	cache := map[string][]*Token{}
+	cache := map[string]tokensOffset{}
 	for index <= length {
 		uc, size := utf8.DecodeRuneInString(clean[index:])
 		if size == 0 {
@@ -167,10 +175,18 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
 		if start < index {
 			span := clean[start:index]
 			if toks, found := cache[span]; found {
-				tokens = append(tokens, toks...)
+				for _, t := range toks.tl {
+					tokens = append(tokens, &Token{
+						Tag:   t.Tag,
+						Text:  t.Text,
+						Label: t.Label,
+						Start: t.Start - toks.offset + start,
+						End:   t.End - toks.offset + start,
+					})
+				}
 			} else {
-				toks := t.doSplit(span)
-				cache[span] = toks
+				toks := t.doSplit(span, start)
+				cache[span] = tokensOffset{toks, start}
 				tokens = append(tokens, toks...)
 			}
 		}
@@ -185,7 +201,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
 	}
 
 	if start < index {
-		tokens = append(tokens, t.doSplit(clean[start:index])...)
+		tokens = append(tokens, t.doSplit(clean[start:index], start)...)
 	}
 
 	return tokens
diff --git a/tokenize_test.go b/tokenize_test.go
index 41e9b90..5736858 100644
--- a/tokenize_test.go
+++ b/tokenize_test.go
@@ -15,7 +15,7 @@ func checkTokens(t *testing.T, tokens []*Token, expected []string, name string) {
 		observed = append(observed, tokens[i].Text)
 	}
 	if !reflect.DeepEqual(observed, expected) {
-		t.Errorf("%v: unexpected tokens", name)
+		t.Errorf("%v: unexpected tokens: %#v", name, observed)
 	}
 }
 
@@ -26,6 +26,18 @@ func checkCase(t *testing.T, doc *Document, expected []string, name string) {
 	}
 }
 
+func checkStartEnd(t *testing.T, token *Token, expectedText string, expectedStart, expectedEnd int) {
+	if token.Text != expectedText {
+		t.Errorf("got %v, want %v", token.Text, expectedText)
+	}
+	if token.Start != expectedStart {
+		t.Errorf("got %v, want %v", token.Start, expectedStart)
+	}
+	if token.End != expectedEnd {
+		t.Errorf("got %v, want %v", token.End, expectedEnd)
+	}
+}
+
 func makeDoc(text string) (*Document, error) {
 	return NewDocument(
 		text,
@@ -157,28 +169,72 @@ func TestTokenizationWebParagraph(t *testing.T) {
 }
 
 func TestTokenizationTwitter(t *testing.T) {
-	doc, _ := makeDoc("@twitter, what time does it start :-)")
+	text := "@twitter, what time does it start :-)"
+	doc, _ := makeDoc(text)
 	expected := []string{"@twitter", ",", "what", "time", "does", "it", "start", ":-)"}
 	checkCase(t, doc, expected, "TokenizationWebParagraph(1)")
-
-	doc, _ = makeDoc("Mr. James plays basketball in the N.B.A., do you?")
+	checkStartEnd(t, doc.tokens[0], "@twitter", 0, 8)
+	checkStartEnd(t, doc.tokens[1], ",", 8, 9)
+	checkStartEnd(t, doc.tokens[2], "what", 10, 14)
+	checkStartEnd(t, doc.tokens[3], "time", 15, 19)
+	checkStartEnd(t, doc.tokens[4], "does", 20, 24)
+	checkStartEnd(t, doc.tokens[5], "it", 25, 27)
+	checkStartEnd(t, doc.tokens[6], "start", 28, 33)
+	checkStartEnd(t, doc.tokens[7], ":-)", 34, len(text))
+
+	text = "Mr. James plays basketball in the N.B.A., do you?"
+	doc, _ = makeDoc(text)
 	expected = []string{
 		"Mr.", "James", "plays", "basketball", "in", "the", "N.B.A.", ",", "do", "you", "?"}
 	checkCase(t, doc, expected, "TokenizationWebParagraph(2)")
-
-	doc, _ = makeDoc("ˌˌ kill the last letter")
+	checkStartEnd(t, doc.tokens[0], "Mr.", 0, 3)
+	checkStartEnd(t, doc.tokens[1], "James", 4, 9)
+	checkStartEnd(t, doc.tokens[2], "plays", 10, 15)
+	checkStartEnd(t, doc.tokens[3], "basketball", 16, 26)
+	checkStartEnd(t, doc.tokens[4], "in", 27, 29)
+	checkStartEnd(t, doc.tokens[5], "the", 30, 33)
+	checkStartEnd(t, doc.tokens[6], "N.B.A.", 34, 40)
+	checkStartEnd(t, doc.tokens[7], ",", 40, 41)
+	checkStartEnd(t, doc.tokens[8], "do", 42, 44)
+	checkStartEnd(t, doc.tokens[9], "you", 45, 48)
+	checkStartEnd(t, doc.tokens[10], "?", 48, len(text))
+
+	text = "ˌˌ kill the last letter"
+	doc, _ = makeDoc(text)
 	expected = []string{"ˌˌ", "kill", "the", "last", "letter"}
 	checkCase(t, doc, expected, "TokenizationWebParagraph(3)")
-
-	doc, _ = makeDoc("ˌˌˌ kill the last letter")
+	checkStartEnd(t, doc.tokens[0], "ˌˌ", 0, 4)
+	checkStartEnd(t, doc.tokens[1], "kill", 5, 9)
+	checkStartEnd(t, doc.tokens[2], "the", 10, 13)
+	checkStartEnd(t, doc.tokens[3], "last", 14, 18)
+	checkStartEnd(t, doc.tokens[4], "letter", 19, len(text))
+
+	text = "ˌˌˌ kill the last letter"
+	doc, _ = makeDoc(text)
 	expected = []string{"ˌˌˌ", "kill", "the", "last", "letter"}
 	checkCase(t, doc, expected, "TokenizationWebParagraph(4)")
-
-	doc, _ = makeDoc("March. July. March. June. January.")
+	checkStartEnd(t, doc.tokens[0], "ˌˌˌ", 0, 6)
+	checkStartEnd(t, doc.tokens[1], "kill", 7, 11)
+	checkStartEnd(t, doc.tokens[2], "the", 12, 15)
+	checkStartEnd(t, doc.tokens[3], "last", 16, 20)
+	checkStartEnd(t, doc.tokens[4], "letter", 21, len(text))
+
+	text = "March. July. March. June. January."
+	doc, _ = makeDoc(text)
 	expected = []string{
 		"March", ".", "July", ".", "March", ".", "June", ".", "January", "."}
 	checkCase(t, doc, expected, "TokenizationWebParagraph(5)")
+	checkStartEnd(t, doc.tokens[0], "March", 0, 5)
+	checkStartEnd(t, doc.tokens[1], ".", 5, 6)
+	checkStartEnd(t, doc.tokens[2], "July", 7, 11)
+	checkStartEnd(t, doc.tokens[3], ".", 11, 12)
+	checkStartEnd(t, doc.tokens[4], "March", 13, 18)
+	checkStartEnd(t, doc.tokens[5], ".", 18, 19)
+	checkStartEnd(t, doc.tokens[6], "June", 20, 24)
+	checkStartEnd(t, doc.tokens[7], ".", 24, 25)
+	checkStartEnd(t, doc.tokens[8], "January", 27, 34)
+	checkStartEnd(t, doc.tokens[9], ".", 34, len(text))
 }
 
 func TestTokenizationContractions(t *testing.T) {
@@ -186,19 +242,47 @@ func TestTokenizationContractions(t *testing.T) {
 	tokens := tokenizer.Tokenize("He's happy")
 	expected := []string{"He", "'s", "happy"}
 	checkTokens(t, tokens, expected, "TokenizationContraction(default-found)")
+	checkStartEnd(t, tokens[0], "He", 0, 2)
+	checkStartEnd(t, tokens[1], "'s", 2, 4)
+	checkStartEnd(t, tokens[2], "happy", 5, 10)
 
 	tokens = tokenizer.Tokenize("I've been better")
 	expected = []string{"I've", "been", "better"}
 	checkTokens(t, tokens, expected, "TokenizationContraction(default-missing)")
+	checkStartEnd(t, tokens[0], "I've", 0, 4)
+	checkStartEnd(t, tokens[1], "been", 5, 9)
+	checkStartEnd(t, tokens[2], "better", 10, 16)
 
 	tokenizer = NewIterTokenizer(UsingContractions([]string{"'ve"}))
 	tokens = tokenizer.Tokenize("I've been better")
 	expected = []string{"I", "'ve", "been", "better"}
 	checkTokens(t, tokens, expected, "TokenizationContraction(custom-found)")
+	checkStartEnd(t, tokens[0], "I", 0, 1)
+	checkStartEnd(t, tokens[1], "'ve", 1, 4)
+	checkStartEnd(t, tokens[2], "been", 5, 9)
+	checkStartEnd(t, tokens[3], "better", 10, 16)
 
 	tokens = tokenizer.Tokenize("He's happy")
 	expected = []string{"He's", "happy"}
 	checkTokens(t, tokens, expected, "TokenizationContraction(custom-missing)")
+	checkStartEnd(t, tokens[0], "He's", 0, 4)
+	checkStartEnd(t, tokens[1], "happy", 5, 10)
+}
+
+func TestTokenizationSuffixes(t *testing.T) {
+	tokenizer := NewIterTokenizer()
+	tokens := tokenizer.Tokenize("(Well,\nthat\twasn't good).")
+	expected := []string{"(", "Well", ",", "that", "was", "n't", "good", ")", "."}
+	checkTokens(t, tokens, expected, "TestTokenizationSuffixes")
+	checkStartEnd(t, tokens[0], "(", 0, 1)
+	checkStartEnd(t, tokens[1], "Well", 1, 5)
+	checkStartEnd(t, tokens[2], ",", 5, 6)
+	checkStartEnd(t, tokens[3], "that", 7, 11)
+	checkStartEnd(t, tokens[4], "was", 12, 15)
+	checkStartEnd(t, tokens[5], "n't", 15, 18)
+	checkStartEnd(t, tokens[6], "good", 19, 23)
+	checkStartEnd(t, tokens[7], ")", 23, 24)
+	checkStartEnd(t, tokens[8], ".", 24, 25)
 }
 
 func BenchmarkTokenization(b *testing.B) {
diff --git a/types.go b/types.go
index c9acbfd..de4c1b5 100644
--- a/types.go
+++ b/types.go
@@ -6,6 +6,8 @@ type Token struct {
 	Tag   string // The token's part-of-speech tag.
 	Text  string // The token's actual content.
 	Label string // The token's IOB label.
+	Start int    // The token's start offset, in bytes, in the sanitized text.
+	End   int    // The token's end offset, in bytes, in the sanitized text.
 }
 
 // An Entity represents an individual named-entity.
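
A minimal usage sketch of the new Start/End fields from a consumer's point of view -- not part of the patch. It assumes the module's import path is github.com/jdkato/prose/v2 and an input that the sanitizer leaves unchanged, so the byte offsets index the original string as well as the sanitized one:

package main

import (
	"fmt"

	prose "github.com/jdkato/prose/v2" // assumed import path
)

func main() {
	text := "@twitter, what time does it start :-)"

	// Each Token now carries the byte span it occupies in the sanitized text.
	for _, tok := range prose.NewIterTokenizer().Tokenize(text) {
		// For this input the sanitizer changes nothing, so the span can be
		// used to slice the original string directly.
		fmt.Printf("%-10q [%2d:%2d) %q\n", tok.Text, tok.Start, tok.End, text[tok.Start:tok.End])
	}
}

For this input the patch's own test expects "@twitter" at [0:8) and ":-)" at [34:37), so each slice reproduces the token's text.
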
diff --git a/utilities.go b/utilities.go
index eafe1b5..8f1aad9 100644
--- a/utilities.go
+++ b/utilities.go
@@ -52,24 +52,24 @@ func getDiskAsset(path string) *gob.Decoder {
 	return gob.NewDecoder(f)
 }
 
-func hasAnyPrefix(s string, prefixes []string) bool {
+func hasAnyPrefix(s string, prefixes []string) int {
 	n := len(s)
 	for _, prefix := range prefixes {
 		if n > len(prefix) && strings.HasPrefix(s, prefix) {
-			return true
+			return len(prefix)
 		}
 	}
-	return false
+	return -1
}
 
-func hasAnySuffix(s string, suffixes []string) bool {
+func hasAnySuffix(s string, suffixes []string) int {
 	n := len(s)
 	for _, suffix := range suffixes {
 		if n > len(suffix) && strings.HasSuffix(s, suffix) {
-			return true
+			return len(suffix)
 		}
 	}
-	return false
+	return -1
 }
 
 func hasAnyIndex(s string, suffixes []string) int {
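
The subtlest part of the change is the span cache in Tokenize: a span's sub-tokens are cached together with the offset at which the span was first split, so a later cache hit at a different position has to be rebased by (new start - cached offset). Below is a standalone sketch of that arithmetic using the "March." spans from the new test; the names cachedAt and hitAt are illustrative and do not appear in the patch.

package main

import "fmt"

// span is a pared-down stand-in for Token: just the fields involved in rebasing.
type span struct {
	Text       string
	Start, End int
}

func main() {
	// "March." is first split at byte offset 0, so its sub-tokens are cached
	// with spans relative to that position.
	cachedAt := 0
	cached := []span{
		{Text: "March", Start: 0, End: 5},
		{Text: ".", Start: 5, End: 6},
	}

	// The same span appears again at byte offset 13 in
	// "March. July. March. June. January."; shifting by (hitAt - cachedAt)
	// moves each cached span to its new position, e.g. "March" -> [13:18).
	hitAt := 13
	for _, s := range cached {
		fmt.Printf("%q cached at [%d:%d) -> reused at [%d:%d)\n",
			s.Text, s.Start, s.End,
			s.Start-cachedAt+hitAt, s.End-cachedAt+hitAt)
	}
}

The same length information drives doSplit: hasAnyPrefix and hasAnySuffix now report the byte length of the matched affix (or -1 when nothing matches), which is what lets doSplit assign each piece its span without re-scanning the token.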