This repository has been archived by the owner on May 14, 2023. It is now read-only.

Start, end of tokens in sanitized text #69

Open
wants to merge 1 commit into base: master
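The change adds Start and End byte offsets to each Token, measured against the sanitized text that the tokenizer actually splits. As a minimal sketch of how the new fields could be consumed, the in-package test below assumes the package is named prose and that the sanitizer leaves this plain-ASCII input unchanged, so the offsets index directly into the original string; it is an illustration, not part of the PR.

package prose

import "testing"

// Sketch only, not part of the PR: with ASCII input and no sanitizer rewrites,
// text[tok.Start:tok.End] should reproduce each token's Text.
func TestTokenSpansSketch(t *testing.T) {
    text := "He's happy, isn't he?"
    for _, tok := range NewIterTokenizer().Tokenize(text) {
        if got := text[tok.Start:tok.End]; got != tok.Text {
            t.Errorf("span [%d:%d] = %q, want %q", tok.Start, tok.End, got, tok.Text)
        }
    }
}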
62 changes: 39 additions & 23 deletions tokenize.go
@@ -15,12 +15,12 @@ type Tokenizer interface {

// iterTokenizer splits a sentence into words.
type iterTokenizer struct {
specialRE *regexp.Regexp
sanitizer *strings.Replacer
contractions []string
suffixes []string
prefixes []string
emoticons map[string]int
specialRE *regexp.Regexp
sanitizer *strings.Replacer
contractions []string
suffixes []string
prefixes []string
emoticons map[string]int
isUnsplittable TokenTester
}

@@ -96,9 +96,9 @@ func NewIterTokenizer(opts ...TokenizerOptFunc) *iterTokenizer {
return tok
}

func addToken(s string, toks []*Token) []*Token {
func addToken(toks []*Token, s string, from, to int) []*Token {
if strings.TrimSpace(s) != "" {
toks = append(toks, &Token{Text: s})
toks = append(toks, &Token{Text: s, Start: from, End: to})
}
return toks
}
@@ -108,7 +108,7 @@ func (t *iterTokenizer) isSpecial(token string) bool {
return found || t.specialRE.MatchString(token) || t.isUnsplittable(token)
}

func (t *iterTokenizer) doSplit(token string) []*Token {
func (t *iterTokenizer) doSplit(token string, offset int) []*Token {
tokens := []*Token{}
suffs := []*Token{}

@@ -117,36 +117,44 @@ func (t *iterTokenizer) doSplit(token string) []*Token {
if t.isSpecial(token) {
// We've found a special case (e.g., an emoticon) -- so, we add it as a token without
// any further processing.
tokens = addToken(token, tokens)
tokens = addToken(tokens, token, offset, offset+len(token))
break
}
last = utf8.RuneCountInString(token)
lower := strings.ToLower(token)
if hasAnyPrefix(token, t.prefixes) {
if length := hasAnyPrefix(token, t.prefixes); length > 0 {
// Remove prefixes -- e.g., $100 -> [$, 100].
tokens = addToken(string(token[0]), tokens)
token = token[1:]
tokens = addToken(tokens, token[:length], offset, offset+length)
token = token[length:]
offset += length
} else if idx := hasAnyIndex(lower, t.contractions); idx > -1 {
// Handle "they'll", "I'll", "Don't", "won't", etc.
//
// they'll -> [they, 'll].
// don't -> [do, n't].
tokens = addToken(token[:idx], tokens)
tokens = addToken(tokens, token[:idx], offset, offset+idx)
token = token[idx:]
} else if hasAnySuffix(token, t.suffixes) {
offset += idx
} else if length := hasAnySuffix(token, t.suffixes); length > 0 {
// Remove suffixes -- e.g., Well) -> [Well, )].
suffs = append([]*Token{
{Text: string(token[len(token)-1])}},
{Text: string(token[len(token)-length]),
Start: offset + len(token) - length,
End: offset + len(token)}},
suffs...)
token = token[:len(token)-1]
} else {
tokens = addToken(token, tokens)
tokens = addToken(tokens, token, offset, offset+len(token))
}
}

return append(tokens, suffs...)
}

type tokensOffset struct {
tl []*Token
offset int
}

// tokenize splits a sentence into a slice of words.
func (t *iterTokenizer) Tokenize(text string) []*Token {
tokens := []*Token{}
@@ -155,7 +163,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
length := len(clean)

start, index := 0, 0
cache := map[string][]*Token{}
cache := map[string]tokensOffset{}
for index <= length {
uc, size := utf8.DecodeRuneInString(clean[index:])
if size == 0 {
@@ -167,10 +175,18 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
if start < index {
span := clean[start:index]
if toks, found := cache[span]; found {
tokens = append(tokens, toks...)
for _, t := range toks.tl {
tokens = append(tokens, &Token{
Tag: t.Tag,
Text: t.Text,
Label: t.Label,
Start: t.Start - toks.offset + start,
End: t.End - toks.offset + start,
})
}
} else {
toks := t.doSplit(span)
cache[span] = toks
toks := t.doSplit(span, start)
cache[span] = tokensOffset{toks, start}
tokens = append(tokens, toks...)
}
}
@@ -185,7 +201,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
}

if start < index {
tokens = append(tokens, t.doSplit(clean[start:index])...)
tokens = append(tokens, t.doSplit(clean[start:index], start)...)
}

return tokens
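The Tokenize change above caches the token slice for a span together with the offset at which that span was first seen, then rebases Start/End on cache hits. Below is a standalone sketch of that rebasing arithmetic, using made-up numbers rather than code from the PR.

package main

import "fmt"

func main() {
    // Say the span "better" was first split at byte offset 10, so its cached
    // token carries Start=10, End=16, and the cache records offset=10.
    cachedStart, cachedEnd, cachedOffset := 10, 16, 10

    // The same span shows up again starting at byte offset 42. The cache-hit
    // branch rebases: new = old - cachedOffset + start.
    start := 42
    fmt.Println(cachedStart-cachedOffset+start, cachedEnd-cachedOffset+start) // 42 48
}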
104 changes: 94 additions & 10 deletions tokenize_test.go
@@ -15,7 +15,7 @@ func checkTokens(t *testing.T, tokens []*Token, expected []string, name string)
observed = append(observed, tokens[i].Text)
}
if !reflect.DeepEqual(observed, expected) {
t.Errorf("%v: unexpected tokens", name)
t.Errorf("%v: unexpected tokens: %#v", name, observed)
}
}

@@ -26,6 +26,18 @@ func checkCase(t *testing.T, doc *Document, expected []string, name string) {
}
}

func checkStartEnd(t *testing.T, token *Token, expectedText string, expectedStart, expectedEnd int) {
if token.Text != expectedText {
t.Errorf("got %v, want %v", token.Text, expectedText)
}
if token.Start != expectedStart {
t.Errorf("got %v, want %v", token.Start, expectedStart)
}
if token.End != expectedEnd {
t.Errorf("got %v, want %v", token.End, expectedEnd)
}
}

func makeDoc(text string) (*Document, error) {
return NewDocument(
text,
@@ -157,48 +169,120 @@ func TestTokenizationWebParagraph(t *testing.T) {
}

func TestTokenizationTwitter(t *testing.T) {
doc, _ := makeDoc("@twitter, what time does it start :-)")
text := "@twitter, what time does it start :-)"
doc, _ := makeDoc(text)
expected := []string{"@twitter", ",", "what", "time", "does", "it", "start", ":-)"}
checkCase(t, doc, expected, "TokenizationWebParagraph(1)")

doc, _ = makeDoc("Mr. James plays basketball in the N.B.A., do you?")
checkStartEnd(t, doc.tokens[0], "@twitter", 0, 8)
checkStartEnd(t, doc.tokens[1], ",", 8, 9)
checkStartEnd(t, doc.tokens[2], "what", 10, 14)
checkStartEnd(t, doc.tokens[3], "time", 15, 19)
checkStartEnd(t, doc.tokens[4], "does", 20, 24)
checkStartEnd(t, doc.tokens[5], "it", 25, 27)
checkStartEnd(t, doc.tokens[6], "start", 28, 33)
checkStartEnd(t, doc.tokens[7], ":-)", 34, len(text))

text = "Mr. James plays basketball in the N.B.A., do you?"
doc, _ = makeDoc(text)
expected = []string{
"Mr.", "James", "plays", "basketball", "in", "the", "N.B.A.", ",",
"do", "you", "?"}
checkCase(t, doc, expected, "TokenizationWebParagraph(2)")

doc, _ = makeDoc("ˌˌ kill the last letter")
checkStartEnd(t, doc.tokens[0], "Mr.", 0, 3)
checkStartEnd(t, doc.tokens[1], "James", 4, 9)
checkStartEnd(t, doc.tokens[2], "plays", 10, 15)
checkStartEnd(t, doc.tokens[3], "basketball", 16, 26)
checkStartEnd(t, doc.tokens[4], "in", 27, 29)
checkStartEnd(t, doc.tokens[5], "the", 30, 33)
checkStartEnd(t, doc.tokens[6], "N.B.A.", 34, 40)
checkStartEnd(t, doc.tokens[7], ",", 40, 41)
checkStartEnd(t, doc.tokens[8], "do", 42, 44)
checkStartEnd(t, doc.tokens[9], "you", 45, 48)
checkStartEnd(t, doc.tokens[10], "?", 48, len(text))

text = "ˌˌ kill the last letter"
doc, _ = makeDoc(text)
expected = []string{"ˌˌ", "kill", "the", "last", "letter"}
checkCase(t, doc, expected, "TokenizationWebParagraph(3)")

doc, _ = makeDoc("ˌˌˌ kill the last letter")
checkStartEnd(t, doc.tokens[0], "ˌˌ", 0, 4)
checkStartEnd(t, doc.tokens[1], "kill", 5, 9)
checkStartEnd(t, doc.tokens[2], "the", 10, 13)
checkStartEnd(t, doc.tokens[3], "last", 14, 18)
checkStartEnd(t, doc.tokens[4], "letter", 19, len(text))

text = "ˌˌˌ kill the last letter"
doc, _ = makeDoc(text)
expected = []string{"ˌˌˌ", "kill", "the", "last", "letter"}
checkCase(t, doc, expected, "TokenizationWebParagraph(4)")

doc, _ = makeDoc("March. July. March. June. January.")
checkStartEnd(t, doc.tokens[0], "ˌˌˌ", 0, 6)
checkStartEnd(t, doc.tokens[1], "kill", 7, 11)
checkStartEnd(t, doc.tokens[2], "the", 12, 15)
checkStartEnd(t, doc.tokens[3], "last", 16, 20)
checkStartEnd(t, doc.tokens[4], "letter", 21, len(text))

text = "March. July. March. June. January."
doc, _ = makeDoc(text)
expected = []string{
"March", ".", "July", ".", "March", ".", "June", ".", "January", "."}
checkCase(t, doc, expected, "TokenizationWebParagraph(5)")
checkStartEnd(t, doc.tokens[0], "March", 0, 5)
checkStartEnd(t, doc.tokens[1], ".", 5, 6)
checkStartEnd(t, doc.tokens[2], "July", 7, 11)
checkStartEnd(t, doc.tokens[3], ".", 11, 12)
checkStartEnd(t, doc.tokens[4], "March", 13, 18)
checkStartEnd(t, doc.tokens[5], ".", 18, 19)
checkStartEnd(t, doc.tokens[6], "June", 20, 24)
checkStartEnd(t, doc.tokens[7], ".", 24, 25)
checkStartEnd(t, doc.tokens[8], "January", 27, 34)
checkStartEnd(t, doc.tokens[9], ".", 34, len(text))
}

func TestTokenizationContractions(t *testing.T) {
tokenizer := NewIterTokenizer()
tokens := tokenizer.Tokenize("He's happy")
expected := []string{"He", "'s", "happy"}
checkTokens(t, tokens, expected, "TokenizationContraction(default-found)")
checkStartEnd(t, tokens[0], "He", 0, 2)
checkStartEnd(t, tokens[1], "'s", 2, 4)
checkStartEnd(t, tokens[2], "happy", 5, 10)

tokens = tokenizer.Tokenize("I've been better")
expected = []string{"I've", "been", "better"}
checkTokens(t, tokens, expected, "TokenizationContraction(default-missing)")
checkStartEnd(t, tokens[0], "I've", 0, 4)
checkStartEnd(t, tokens[1], "been", 5, 9)
checkStartEnd(t, tokens[2], "better", 10, 16)

tokenizer = NewIterTokenizer(UsingContractions([]string{"'ve"}))
tokens = tokenizer.Tokenize("I've been better")
expected = []string{"I", "'ve", "been", "better"}
checkTokens(t, tokens, expected, "TokenizationContraction(custom-found)")
checkStartEnd(t, tokens[0], "I", 0, 1)
checkStartEnd(t, tokens[1], "'ve", 1, 4)
checkStartEnd(t, tokens[2], "been", 5, 9)
checkStartEnd(t, tokens[3], "better", 10, 16)

tokens = tokenizer.Tokenize("He's happy")
expected = []string{"He's", "happy"}
checkTokens(t, tokens, expected, "TokenizationContraction(custom-missing)")
checkStartEnd(t, tokens[0], "He's", 0, 4)
checkStartEnd(t, tokens[1], "happy", 5, 10)
}

func TestTokenizationSuffixes(t *testing.T) {
tokenizer := NewIterTokenizer()
tokens := tokenizer.Tokenize("(Well,\nthat\twasn't good).")
expected := []string{"(", "Well", ",", "that", "was", "n't", "good", ")", "."}
checkTokens(t, tokens, expected, "TestTokenizationSuffixes")
checkStartEnd(t, tokens[0], "(", 0, 1)
checkStartEnd(t, tokens[1], "Well", 1, 5)
checkStartEnd(t, tokens[2], ",", 5, 6)
checkStartEnd(t, tokens[3], "that", 7, 11)
checkStartEnd(t, tokens[4], "was", 12, 15)
checkStartEnd(t, tokens[5], "n't", 15, 18)
checkStartEnd(t, tokens[6], "good", 19, 23)
checkStartEnd(t, tokens[7], ")", 23, 24)
checkStartEnd(t, tokens[8], ".", 24, 25)
}

func BenchmarkTokenization(b *testing.B) {
2 changes: 2 additions & 0 deletions types.go
@@ -6,6 +6,8 @@ type Token struct {
Tag string // The token's part-of-speech tag.
Text string // The token's actual content.
Label string // The token's IOB label.
Start int // The token's start in bytes in sanitized text.
End int // The token's end in bytes in sanitized text.
Review comment on lines +9 to +10 (Collaborator): confusing comments; please make them simpler, as the concept of "sanitized" is outside this struct. (One possible rewording is sketched after this file's diff.)
}

// An Entity represents an individual named-entity.
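One possible rewording of the two field comments that avoids the word "sanitized", sketched here as a suggestion rather than a change in the PR; the offsets would still be relative to the text the tokenizer is given:

type Token struct {
    Tag   string // The token's part-of-speech tag.
    Text  string // The token's actual content.
    Label string // The token's IOB label.
    Start int    // Byte offset at which the token begins in the tokenized input.
    End   int    // Byte offset just past the token's last byte in the tokenized input.
}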
12 changes: 6 additions & 6 deletions utilities.go
@@ -52,24 +52,24 @@ func getDiskAsset(path string) *gob.Decoder {
return gob.NewDecoder(f)
}

func hasAnyPrefix(s string, prefixes []string) bool {
func hasAnyPrefix(s string, prefixes []string) int {
Review comment on hasAnyPrefix (Collaborator): please add some comment explaining that -1 is the same as false. (A sketch of such a comment follows this diff.)

n := len(s)
for _, prefix := range prefixes {
if n > len(prefix) && strings.HasPrefix(s, prefix) {
return true
return len(prefix)
}
}
return false
return -1
}

func hasAnySuffix(s string, suffixes []string) bool {
func hasAnySuffix(s string, suffixes []string) int {
n := len(s)
for _, suffix := range suffixes {
if n > len(suffix) && strings.HasSuffix(s, suffix) {
return true
return len(suffix)
}
}
return false
return -1
}

func hasAnyIndex(s string, suffixes []string) int {
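A sketch of the kind of comment the reviewer is asking for, reusing the hasAnyPrefix body shown in the diff (hasAnySuffix would carry a matching note); the wording is a suggestion, not part of the PR:

// hasAnyPrefix returns the byte length of the first prefix in prefixes that s
// starts with, or -1 if none matches (-1 stands in for the old "false").
func hasAnyPrefix(s string, prefixes []string) int {
    n := len(s)
    for _, prefix := range prefixes {
        if n > len(prefix) && strings.HasPrefix(s, prefix) {
            return len(prefix)
        }
    }
    return -1
}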