This repository has been archived by the owner on May 14, 2023. It is now read-only.

Start, end of tokens in sanitized text #69

Open
wants to merge 1 commit into base: master
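The change adds Start and End byte offsets to each Token, measured against the sanitized text that the tokenizer actually splits. As a minimal sketch of how the new fields could be consumed, the in-package test below assumes the package is named prose and that the sanitizer leaves this plain-ASCII input unchanged, so the offsets index directly into the original string; it is an illustration, not part of the PR.

package prose

import "testing"

// Sketch only, not part of the PR: with ASCII input and no sanitizer rewrites,
// text[tok.Start:tok.End] should reproduce each token's Text.
func TestTokenSpansSketch(t *testing.T) {
    text := "He's happy, isn't he?"
    for _, tok := range NewIterTokenizer().Tokenize(text) {
        if got := text[tok.Start:tok.End]; got != tok.Text {
            t.Errorf("span [%d:%d] = %q, want %q", tok.Start, tok.End, got, tok.Text)
        }
    }
}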
62 changes: 39 additions & 23 deletions tokenize.go
@@ -15,12 +15,12 @@ type Tokenizer interface {

// iterTokenizer splits a sentence into words.
type iterTokenizer struct {
specialRE *regexp.Regexp
sanitizer *strings.Replacer
contractions []string
suffixes []string
prefixes []string
emoticons map[string]int
specialRE *regexp.Regexp
sanitizer *strings.Replacer
contractions []string
suffixes []string
prefixes []string
emoticons map[string]int
isUnsplittable TokenTester
}

@@ -96,9 +96,9 @@ func NewIterTokenizer(opts ...TokenizerOptFunc) *iterTokenizer {
return tok
}

func addToken(s string, toks []*Token) []*Token {
func addToken(toks []*Token, s string, from, to int) []*Token {
if strings.TrimSpace(s) != "" {
toks = append(toks, &Token{Text: s})
toks = append(toks, &Token{Text: s, Start: from, End: to})
}
return toks
}
@@ -108,7 +108,7 @@ func (t *iterTokenizer) isSpecial(token string) bool {
return found || t.specialRE.MatchString(token) || t.isUnsplittable(token)
}

func (t *iterTokenizer) doSplit(token string) []*Token {
func (t *iterTokenizer) doSplit(token string, offset int) []*Token {
tokens := []*Token{}
suffs := []*Token{}

@@ -117,36 +117,44 @@ func (t *iterTokenizer) doSplit(token string) []*Token {
if t.isSpecial(token) {
// We've found a special case (e.g., an emoticon) -- so, we add it as a token without
// any further processing.
tokens = addToken(token, tokens)
tokens = addToken(tokens, token, offset, offset+len(token))
break
}
last = utf8.RuneCountInString(token)
lower := strings.ToLower(token)
if hasAnyPrefix(token, t.prefixes) {
if length := hasAnyPrefix(token, t.prefixes); length > 0 {
// Remove prefixes -- e.g., $100 -> [$, 100].
tokens = addToken(string(token[0]), tokens)
token = token[1:]
tokens = addToken(tokens, token[:length], offset, offset+length)
token = token[length:]
offset += length
} else if idx := hasAnyIndex(lower, t.contractions); idx > -1 {
// Handle "they'll", "I'll", "Don't", "won't", etc.
//
// they'll -> [they, 'll].
// don't -> [do, n't].
tokens = addToken(token[:idx], tokens)
tokens = addToken(tokens, token[:idx], offset, offset+idx)
token = token[idx:]
} else if hasAnySuffix(token, t.suffixes) {
offset += idx
} else if length := hasAnySuffix(token, t.suffixes); length > 0 {
// Remove suffixes -- e.g., Well) -> [Well, )].
suffs = append([]*Token{
{Text: string(token[len(token)-1])}},
{Text: string(token[len(token)-length]),
Start: offset + len(token) - length,
End: offset + len(token)}},
suffs...)
token = token[:len(token)-1]
} else {
tokens = addToken(token, tokens)
tokens = addToken(tokens, token, offset, offset+len(token))
}
}

return append(tokens, suffs...)
}

type tokensOffset struct {
tl []*Token
offset int
}

// tokenize splits a sentence into a slice of words.
func (t *iterTokenizer) Tokenize(text string) []*Token {
tokens := []*Token{}
@@ -155,7 +163,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
length := len(clean)

start, index := 0, 0
cache := map[string][]*Token{}
cache := map[string]tokensOffset{}
for index <= length {
uc, size := utf8.DecodeRuneInString(clean[index:])
if size == 0 {
@@ -167,10 +175,18 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
if start < index {
span := clean[start:index]
if toks, found := cache[span]; found {
tokens = append(tokens, toks...)
for _, t := range toks.tl {
tokens = append(tokens, &Token{
Tag: t.Tag,
Text: t.Text,
Label: t.Label,
Start: t.Start - toks.offset + start,
End: t.End - toks.offset + start,
})
}
} else {
toks := t.doSplit(span)
cache[span] = toks
toks := t.doSplit(span, start)
cache[span] = tokensOffset{toks, start}
tokens = append(tokens, toks...)
}
}
@@ -185,7 +201,7 @@ func (t *iterTokenizer) Tokenize(text string) []*Token {
}

if start < index {
tokens = append(tokens, t.doSplit(clean[start:index])...)
tokens = append(tokens, t.doSplit(clean[start:index], start)...)
}

return tokens
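The Tokenize change above caches the token slice for a span together with the offset at which that span was first seen, then rebases Start/End on cache hits. Below is a standalone sketch of that rebasing arithmetic, using made-up numbers rather than code from the PR.

package main

import "fmt"

func main() {
    // Say the span "better" was first split at byte offset 10, so its cached
    // token carries Start=10, End=16, and the cache records offset=10.
    cachedStart, cachedEnd, cachedOffset := 10, 16, 10

    // The same span shows up again starting at byte offset 42. The cache-hit
    // branch rebases: new = old - cachedOffset + start.
    start := 42
    fmt.Println(cachedStart-cachedOffset+start, cachedEnd-cachedOffset+start) // 42 48
}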
104 changes: 94 additions & 10 deletions tokenize_test.go
@@ -15,7 +15,7 @@ func checkTokens(t *testing.T, tokens []*Token, expected []string, name string)
observed = append(observed, tokens[i].Text)
}
if !reflect.DeepEqual(observed, expected) {
t.Errorf("%v: unexpected tokens", name)
t.Errorf("%v: unexpected tokens: %#v", name, observed)
}
}

@@ -26,6 +26,18 @@ func checkCase(t *testing.T, doc *Document, expected []string, name string) {
}
}

func checkStartEnd(t *testing.T, token *Token, expectedText string, expectedStart, expectedEnd int) {
if token.Text != expectedText {
t.Errorf("got %v, want %v", token.Text, expectedText)
}
if token.Start != expectedStart {
t.Errorf("got %v, want %v", token.Start, expectedStart)
}
if token.End != expectedEnd {
t.Errorf("got %v, want %v", token.End, expectedEnd)
}
}

func makeDoc(text string) (*Document, error) {
return NewDocument(
text,
@@ -157,48 +169,120 @@ func TestTokenizationWebParagraph(t *testing.T) {
}

func TestTokenizationTwitter(t *testing.T) {
doc, _ := makeDoc("@twitter, what time does it start :-)")
text := "@twitter, what time does it start :-)"
doc, _ := makeDoc(text)
expected := []string{"@twitter", ",", "what", "time", "does", "it", "start", ":-)"}
checkCase(t, doc, expected, "TokenizationWebParagraph(1)")

doc, _ = makeDoc("Mr. James plays basketball in the N.B.A., do you?")
checkStartEnd(t, doc.tokens[0], "@twitter", 0, 8)
checkStartEnd(t, doc.tokens[1], ",", 8, 9)
checkStartEnd(t, doc.tokens[2], "what", 10, 14)
checkStartEnd(t, doc.tokens[3], "time", 15, 19)
checkStartEnd(t, doc.tokens[4], "does", 20, 24)
checkStartEnd(t, doc.tokens[5], "it", 25, 27)
checkStartEnd(t, doc.tokens[6], "start", 28, 33)
checkStartEnd(t, doc.tokens[7], ":-)", 34, len(text))

text = "Mr. James plays basketball in the N.B.A., do you?"
doc, _ = makeDoc(text)
expected = []string{
"Mr.", "James", "plays", "basketball", "in", "the", "N.B.A.", ",",
"do", "you", "?"}
checkCase(t, doc, expected, "TokenizationWebParagraph(2)")

doc, _ = makeDoc("ˌˌ kill the last letter")
checkStartEnd(t, doc.tokens[0], "Mr.", 0, 3)
checkStartEnd(t, doc.tokens[1], "James", 4, 9)
checkStartEnd(t, doc.tokens[2], "plays", 10, 15)
checkStartEnd(t, doc.tokens[3], "basketball", 16, 26)
checkStartEnd(t, doc.tokens[4], "in", 27, 29)
checkStartEnd(t, doc.tokens[5], "the", 30, 33)
checkStartEnd(t, doc.tokens[6], "N.B.A.", 34, 40)
checkStartEnd(t, doc.tokens[7], ",", 40, 41)
checkStartEnd(t, doc.tokens[8], "do", 42, 44)
checkStartEnd(t, doc.tokens[9], "you", 45, 48)
checkStartEnd(t, doc.tokens[10], "?", 48, len(text))

text = "ˌˌ kill the last letter"
doc, _ = makeDoc(text)
expected = []string{"ˌˌ", "kill", "the", "last", "letter"}
checkCase(t, doc, expected, "TokenizationWebParagraph(3)")

doc, _ = makeDoc("ˌˌˌ kill the last letter")
checkStartEnd(t, doc.tokens[0], "ˌˌ", 0, 4)
checkStartEnd(t, doc.tokens[1], "kill", 5, 9)
checkStartEnd(t, doc.tokens[2], "the", 10, 13)
checkStartEnd(t, doc.tokens[3], "last", 14, 18)
checkStartEnd(t, doc.tokens[4], "letter", 19, len(text))

text = "ˌˌˌ kill the last letter"
doc, _ = makeDoc(text)
expected = []string{"ˌˌˌ", "kill", "the", "last", "letter"}
checkCase(t, doc, expected, "TokenizationWebParagraph(4)")

doc, _ = makeDoc("March. July. March. June. January.")
checkStartEnd(t, doc.tokens[0], "ˌˌˌ", 0, 6)
checkStartEnd(t, doc.tokens[1], "kill", 7, 11)
checkStartEnd(t, doc.tokens[2], "the", 12, 15)
checkStartEnd(t, doc.tokens[3], "last", 16, 20)
checkStartEnd(t, doc.tokens[4], "letter", 21, len(text))

text = "March. July. March. June. January."
doc, _ = makeDoc(text)
expected = []string{
"March", ".", "July", ".", "March", ".", "June", ".", "January", "."}
checkCase(t, doc, expected, "TokenizationWebParagraph(5)")
checkStartEnd(t, doc.tokens[0], "March", 0, 5)
checkStartEnd(t, doc.tokens[1], ".", 5, 6)
checkStartEnd(t, doc.tokens[2], "July", 7, 11)
checkStartEnd(t, doc.tokens[3], ".", 11, 12)
checkStartEnd(t, doc.tokens[4], "March", 13, 18)
checkStartEnd(t, doc.tokens[5], ".", 18, 19)
checkStartEnd(t, doc.tokens[6], "June", 20, 24)
checkStartEnd(t, doc.tokens[7], ".", 24, 25)
checkStartEnd(t, doc.tokens[8], "January", 27, 34)
checkStartEnd(t, doc.tokens[9], ".", 34, len(text))
}

func TestTokenizationContractions(t *testing.T) {
tokenizer := NewIterTokenizer()
tokens := tokenizer.Tokenize("He's happy")
expected := []string{"He", "'s", "happy"}
checkTokens(t, tokens, expected, "TokenizationContraction(default-found)")
checkStartEnd(t, tokens[0], "He", 0, 2)
checkStartEnd(t, tokens[1], "'s", 2, 4)
checkStartEnd(t, tokens[2], "happy", 5, 10)

tokens = tokenizer.Tokenize("I've been better")
expected = []string{"I've", "been", "better"}
checkTokens(t, tokens, expected, "TokenizationContraction(default-missing)")
checkStartEnd(t, tokens[0], "I've", 0, 4)
checkStartEnd(t, tokens[1], "been", 5, 9)
checkStartEnd(t, tokens[2], "better", 10, 16)

tokenizer = NewIterTokenizer(UsingContractions([]string{"'ve"}))
tokens = tokenizer.Tokenize("I've been better")
expected = []string{"I", "'ve", "been", "better"}
checkTokens(t, tokens, expected, "TokenizationContraction(custom-found)")
checkStartEnd(t, tokens[0], "I", 0, 1)
checkStartEnd(t, tokens[1], "'ve", 1, 4)
checkStartEnd(t, tokens[2], "been", 5, 9)
checkStartEnd(t, tokens[3], "better", 10, 16)

tokens = tokenizer.Tokenize("He's happy")
expected = []string{"He's", "happy"}
checkTokens(t, tokens, expected, "TokenizationContraction(custom-missing)")
checkStartEnd(t, tokens[0], "He's", 0, 4)
checkStartEnd(t, tokens[1], "happy", 5, 10)
}

func TestTokenizationSuffixes(t *testing.T) {
tokenizer := NewIterTokenizer()
tokens := tokenizer.Tokenize("(Well,\nthat\twasn't good).")
expected := []string{"(", "Well", ",", "that", "was", "n't", "good", ")", "."}
checkTokens(t, tokens, expected, "TestTokenizationSuffixes")
checkStartEnd(t, tokens[0], "(", 0, 1)
checkStartEnd(t, tokens[1], "Well", 1, 5)
checkStartEnd(t, tokens[2], ",", 5, 6)
checkStartEnd(t, tokens[3], "that", 7, 11)
checkStartEnd(t, tokens[4], "was", 12, 15)
checkStartEnd(t, tokens[5], "n't", 15, 18)
checkStartEnd(t, tokens[6], "good", 19, 23)
checkStartEnd(t, tokens[7], ")", 23, 24)
checkStartEnd(t, tokens[8], ".", 24, 25)
}

func BenchmarkTokenization(b *testing.B) {
2 changes: 2 additions & 0 deletions types.go
@@ -6,6 +6,8 @@ type Token struct {
Tag string // The token's part-of-speech tag.
Text string // The token's actual content.
Label string // The token's IOB label.
Start int // The token's start in bytes in sanitized text.
End int // The token's end in bytes in sanitized text.
Review comment on lines +9 to +10 (Collaborator): confusing comments; please make them simpler, as the concept of "sanitized" is outside this struct. (One possible rewording is sketched after this file's diff.)
}

// An Entity represents an individual named-entity.
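One possible rewording of the two field comments that avoids the word "sanitized", sketched here as a suggestion rather than a change in the PR; the offsets would still be relative to the text the tokenizer is given:

type Token struct {
    Tag   string // The token's part-of-speech tag.
    Text  string // The token's actual content.
    Label string // The token's IOB label.
    Start int    // Byte offset at which the token begins in the tokenized input.
    End   int    // Byte offset just past the token's last byte in the tokenized input.
}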
12 changes: 6 additions & 6 deletions utilities.go
@@ -52,24 +52,24 @@ func getDiskAsset(path string) *gob.Decoder {
return gob.NewDecoder(f)
}

func hasAnyPrefix(s string, prefixes []string) bool {
func hasAnyPrefix(s string, prefixes []string) int {
Review comment on hasAnyPrefix (Collaborator): please add some comment explaining that -1 is the same as false. (A sketch of such a comment follows this diff.)

n := len(s)
for _, prefix := range prefixes {
if n > len(prefix) && strings.HasPrefix(s, prefix) {
return true
return len(prefix)
}
}
return false
return -1
}

func hasAnySuffix(s string, suffixes []string) bool {
func hasAnySuffix(s string, suffixes []string) int {
n := len(s)
for _, suffix := range suffixes {
if n > len(suffix) && strings.HasSuffix(s, suffix) {
return true
return len(suffix)
}
}
return false
return -1
}

func hasAnyIndex(s string, suffixes []string) int {
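A sketch of the kind of comment the reviewer is asking for, reusing the hasAnyPrefix body shown in the diff (hasAnySuffix would carry a matching note); the wording is a suggestion, not part of the PR:

// hasAnyPrefix returns the byte length of the first prefix in prefixes that s
// starts with, or -1 if none matches (-1 stands in for the old "false").
func hasAnyPrefix(s string, prefixes []string) int {
    n := len(s)
    for _, prefix := range prefixes {
        if n > len(prefix) && strings.HasPrefix(s, prefix) {
            return len(prefix)
        }
    }
    return -1
}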