Regular expression NER #118
base: develop
Changes from all commits
7d818c6
2e912ee
45cf828
a9e5d70
34ca219
c817913
iepy/preprocess/ner/regexp.py (new file)
@@ -0,0 +1,107 @@
import re
import codecs

from nltk.text import TokenSearcher as NLTKTokenSearcher

from iepy.preprocess.ner.base import BaseNERRunner


class RegExpNERRunner(BaseNERRunner):

    def __init__(self, label, regexp, override=False):
        super(RegExpNERRunner, self).__init__(override=override)
        self.label = label
        self.regexp = regexp

    def run_ner(self, doc):
        entities = []
        tokens = doc.tokens
        searcher = TokenSearcher(tokens)
        for match in searcher.finditer(self.regexp):
            entity_oc = self.process_match(match)
            if isinstance(entity_oc, list):
                entities.extend(entity_oc)
            else:
                entities.append(entity_oc)
        return entities

    def process_match(self, match):
        name = ' '.join(match.group())
        kind = self.label
        offset, offset_end = match.span()
        entity_oc = self.build_occurrence(name, kind, name, offset, offset_end)
        return entity_oc


class TokenSearcher(NLTKTokenSearcher):
    """
    From nltk.text.TokenSearcher documentation:

    A class that makes it easier to use regular expressions to search
    over tokenized strings. The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``. The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        # replace < and > inside tokens with \< and \>
        _raw = '><'.join(w.replace('<', '\\<').replace('>', '\\>') for w in tokens)
Review comment: I'm not completely sure, but would it be a problem if there is a token with …?
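For context, a bracket inside a token is escaped before the raw string is built; tracing the constructor above by hand (illustrative output, not from the PR):

    >>> TokenSearcher(['5<6', 'ok'])._raw
    '><5\\<6><ok>'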
        # prepend >< instead of < for easier token counting
        self._raw = '><' + _raw + '>'

    def finditer(self, regexp):
        regexp = preprocess_regexp(regexp)
        i = re.finditer(regexp, self._raw)
        while True:
            try:
                m = next(i)
                start, end = m.span()
                token_start = self._raw[:start].count('><')
                token_end = self._raw[:end].count('><')
                yield MatchObject(m, token_start, token_end)
            except:
                return

Review comment: this bare `except` will catch any exception, not just the end of iteration. Why not use `except StopIteration:`?
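As an alternative sketch (untested, same counting logic as the method above), a plain for loop ends on its own when re.finditer is exhausted and needs no except clause at all:

    def finditer(self, regexp):
        regexp = preprocess_regexp(regexp)
        # the loop simply stops when the underlying iterator is exhausted
        for m in re.finditer(regexp, self._raw):
            start, end = m.span()
            token_start = self._raw[:start].count('><')
            token_end = self._raw[:end].count('><')
            yield MatchObject(m, token_start, token_end)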

class MatchObject:

    def __init__(self, m, token_start, token_end):
        self.m = m
        self.all = m.group()
        self.all_start, self.all_end = m.span()
        self.token_start = token_start
        self.token_end = token_end

    def group(self, *args):
        result = self.m.group(*args)
        if result:
            return result[1:-1].split('><')
        else:
            return None

    def span(self, *args):
        start, end = self.m.span(*args)
        span_start = self.all[:start - self.all_start].count('<')
        span_end = self.all[:end - self.all_start].count('<')
        return (self.token_start + span_start, self.token_start + span_end)


def preprocess_regexp(regexp):
    # strip whitespace from the regular expression
    regexp = re.sub(r'\s', '', regexp)
    # replace < and > only if not doubled (<< or >>):
    regexp = re.sub(r'(?<!<)<(?!<)', '(?:<(?:', regexp)
    regexp = re.sub(r'(?<!>)>(?!>)', ')>)', regexp)
    # now, replace << and >> with < and >, respectively
    regexp = re.sub(r'<<', '<', regexp)
    regexp = re.sub(r'>>', '>', regexp)
    # replace . (if not preceded by \) with [^>]
    regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
    return regexp
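To make the bracket rewriting concrete, here is a hand-traced session against the code above (the output strings are traced by hand, not taken from the PR):

    >>> from iepy.preprocess.ner.regexp import preprocess_regexp, TokenSearcher
    >>> preprocess_regexp('<HIV>|<Hepatitis><[A-C]>')
    '(?:<(?:HIV)>)|(?:<(?:Hepatitis)>)(?:<(?:[A-C])>)'
    >>> searcher = TokenSearcher("for HIV and Hepatitis C".split())
    >>> searcher._raw
    '><for><HIV><and><Hepatitis><C>'
    >>> [(m.group(), m.span()) for m in searcher.finditer('<HIV>|<Hepatitis><[A-C]>')]
    [(['HIV'], (1, 2)), (['Hepatitis', 'C'], (3, 5))]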
New test file
@@ -0,0 +1,44 @@
from unittest import TestCase

from iepy.preprocess.ner.regexp import RegExpNERRunner, TokenSearcher

from .factories import SentencedIEDocFactory
from .manager_case import ManagerTestCase
from .test_ner import NERTestMixin


class TestRegExpNERRunner(ManagerTestCase, NERTestMixin):

    def test(self):
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        ner_runner = RegExpNERRunner('disease', '<HIV>|<Hepatitis><[A-C]>')
        ner_runner(doc)
        # (the tokenizer splits "she's" in two parts)
        entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, entities_triples)

        doc = SentencedIEDocFactory(
            text="Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can't undergo an MRI")
        ner_runner = RegExpNERRunner('MEDICAL_TEST', '<[A-Z]+><scan>|<MRI>')
        ner_runner(doc)
        # (the tokenizer splits "can't" in two parts)
        entities_triples = [(5, 7, 'MEDICAL_TEST'), (22, 23, 'MEDICAL_TEST')]
        self.check_ner_result(doc, entities_triples)


class TestTokenSearcher(TestCase):

    def test(self):
        sent = "Chase notes she 's negative for HIV and Hepatitis C"
        regexp = '<HIV>|<Hepatitis><[A-C]>'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group(), m.span()) for m in searcher.finditer(regexp)]
        self.assertEqual(matches, [(['HIV'], (6, 7)), (['Hepatitis', 'C'], (8, 10))])

    def test_named_group(self):
        sent = "Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can 't undergo an MRI"
        regexp = '(<an>|<the>) (?P<<name>> <[A-Z]+><scan>|<MRI>)'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group('name'), m.span('name')) for m in searcher.finditer(regexp)]
        self.assertEqual(matches, [(['CT', 'scan'], (5, 7)), (['MRI'], (22, 23))])
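The doubled brackets in (?P<<name>> ...) are how a literal < or > survives preprocess_regexp, so the pattern above compiles to an ordinary named group. Traced by hand (not output from the PR):

    >>> preprocess_regexp('(<an>|<the>) (?P<<name>> <[A-Z]+><scan>|<MRI>)')
    '((?:<(?:an)>)|(?:<(?:the)>))(?P<name>(?:<(?:[A-Z]+)>)(?:<(?:scan)>)|(?:<(?:MRI)>))'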
Review comment: this import looks unused, since (apparently) no method from NLTKTokenSearcher is used.