Regular expression NER #118
base: develop
Changes from all commits
7d818c6
2e912ee
45cf828
a9e5d70
34ca219
c817913
iepy/preprocess/ner/regexp.py (new file)
@@ -0,0 +1,107 @@
import re
import codecs

from nltk.text import TokenSearcher as NLTKTokenSearcher

from iepy.preprocess.ner.base import BaseNERRunner


class RegExpNERRunner(BaseNERRunner):

    def __init__(self, label, regexp, override=False):
        super(RegExpNERRunner, self).__init__(override=override)
        self.label = label
        self.regexp = regexp

    def run_ner(self, doc):
        entities = []
        tokens = doc.tokens
        searcher = TokenSearcher(tokens)
        for match in searcher.finditer(self.regexp):
            entity_oc = self.process_match(match)
            if isinstance(entity_oc, list):
                entities.extend(entity_oc)
            else:
                entities.append(entity_oc)
        return entities

    def process_match(self, match):
        name = ' '.join(match.group())
        kind = self.label
        offset, offset_end = match.span()
        entity_oc = self.build_occurrence(name, kind, name, offset, offset_end)
        return entity_oc


class TokenSearcher(NLTKTokenSearcher):
    """
    From nltk.text.TokenSearcher documentation:

    A class that makes it easier to use regular expressions to search
    over tokenized strings. The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``. The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        # replace < and > inside tokens with \< and \>
        _raw = '><'.join(w.replace('<', '\\<').replace('>', '\\>') for w in tokens)
Review comment: I'm not completely sure, but would it be a problem if there is a token with …?
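For context, a bracket inside a token is escaped before the raw string is built; tracing the constructor above by hand (illustrative output, not from the PR):

    >>> TokenSearcher(['5<6', 'ok'])._raw
    '><5\\<6><ok>'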
        # prepend >< instead of < for easier token counting
        self._raw = '><' + _raw + '>'

    def finditer(self, regexp):
        regexp = preprocess_regexp(regexp)
        i = re.finditer(regexp, self._raw)
        while True:
            try:
                m = next(i)
                start, end = m.span()
                token_start = self._raw[:start].count('><')
                token_end = self._raw[:end].count('><')
                yield MatchObject(m, token_start, token_end)
            except:
                return

Review comment: this bare `except` will catch any exception, not just the end of iteration. Why not use `except StopIteration:`?
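As an alternative sketch (untested, same counting logic as the method above), a plain for loop ends on its own when re.finditer is exhausted and needs no except clause at all:

    def finditer(self, regexp):
        regexp = preprocess_regexp(regexp)
        # the loop simply stops when the underlying iterator is exhausted
        for m in re.finditer(regexp, self._raw):
            start, end = m.span()
            token_start = self._raw[:start].count('><')
            token_end = self._raw[:end].count('><')
            yield MatchObject(m, token_start, token_end)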

class MatchObject:

    def __init__(self, m, token_start, token_end):
        self.m = m
        self.all = m.group()
        self.all_start, self.all_end = m.span()
        self.token_start = token_start
        self.token_end = token_end

    def group(self, *args):
        result = self.m.group(*args)
        if result:
            return result[1:-1].split('><')
        else:
            return None

    def span(self, *args):
        start, end = self.m.span(*args)
        span_start = self.all[:start - self.all_start].count('<')
        span_end = self.all[:end - self.all_start].count('<')
        return (self.token_start + span_start, self.token_start + span_end)


def preprocess_regexp(regexp):
    # strip whitespace from the regular expression
    regexp = re.sub(r'\s', '', regexp)
    # replace < and > only if not doubled (<< or >>):
    regexp = re.sub(r'(?<!<)<(?!<)', '(?:<(?:', regexp)
    regexp = re.sub(r'(?<!>)>(?!>)', ')>)', regexp)
    # now, replace << and >> with < and >, respectively
    regexp = re.sub(r'<<', '<', regexp)
    regexp = re.sub(r'>>', '>', regexp)
    # replace . (if not preceded by \) with [^>]
    regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
    return regexp
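To make the bracket rewriting concrete, here is a hand-traced session against the code above (the output strings are traced by hand, not taken from the PR):

    >>> from iepy.preprocess.ner.regexp import preprocess_regexp, TokenSearcher
    >>> preprocess_regexp('<HIV>|<Hepatitis><[A-C]>')
    '(?:<(?:HIV)>)|(?:<(?:Hepatitis)>)(?:<(?:[A-C])>)'
    >>> searcher = TokenSearcher("for HIV and Hepatitis C".split())
    >>> searcher._raw
    '><for><HIV><and><Hepatitis><C>'
    >>> [(m.group(), m.span()) for m in searcher.finditer('<HIV>|<Hepatitis><[A-C]>')]
    [(['HIV'], (1, 2)), (['Hepatitis', 'C'], (3, 5))]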
New test file
@@ -0,0 +1,44 @@
from unittest import TestCase

from iepy.preprocess.ner.regexp import RegExpNERRunner, TokenSearcher

from .factories import SentencedIEDocFactory
from .manager_case import ManagerTestCase
from .test_ner import NERTestMixin


class TestRegExpNERRunner(ManagerTestCase, NERTestMixin):

    def test(self):
        doc = SentencedIEDocFactory(
            text="Chase notes she's negative for HIV and Hepatitis C")
        ner_runner = RegExpNERRunner('disease', '<HIV>|<Hepatitis><[A-C]>')
        ner_runner(doc)
        # (the tokenizer splits "she's" in two parts)
        entities_triples = [(6, 7, 'DISEASE'), (8, 10, 'DISEASE')]
        self.check_ner_result(doc, entities_triples)

        doc = SentencedIEDocFactory(
            text="Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can't undergo an MRI")
        ner_runner = RegExpNERRunner('MEDICAL_TEST', '<[A-Z]+><scan>|<MRI>')
        ner_runner(doc)
        # (the tokenizer splits "can't" in two parts)
        entities_triples = [(5, 7, 'MEDICAL_TEST'), (22, 23, 'MEDICAL_TEST')]
        self.check_ner_result(doc, entities_triples)


class TestTokenSearcher(TestCase):

    def test(self):
        sent = "Chase notes she 's negative for HIV and Hepatitis C"
        regexp = '<HIV>|<Hepatitis><[A-C]>'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group(), m.span()) for m in searcher.finditer(regexp)]
        self.assertEqual(matches, [(['HIV'], (6, 7)), (['Hepatitis', 'C'], (8, 10))])

    def test_named_group(self):
        sent = "Cuddy points out that the CT scan showed the patient has a metal pin in her arm and can 't undergo an MRI"
        regexp = '(<an>|<the>) (?P<<name>> <[A-Z]+><scan>|<MRI>)'
        searcher = TokenSearcher(sent.split())
        matches = [(m.group('name'), m.span('name')) for m in searcher.finditer(regexp)]
        self.assertEqual(matches, [(['CT', 'scan'], (5, 7)), (['MRI'], (22, 23))])
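The doubled brackets in (?P<<name>> ...) are how a literal < or > survives preprocess_regexp, so the pattern above compiles to an ordinary named group. Traced by hand (not output from the PR):

    >>> preprocess_regexp('(<an>|<the>) (?P<<name>> <[A-Z]+><scan>|<MRI>)')
    '((?:<(?:an)>)|(?:<(?:the)>))(?P<name>(?:<(?:[A-Z]+)>)(?:<(?:scan)>)|(?:<(?:MRI)>))'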
Review comment: this import looks unused, since (apparently) no method from NLTKTokenSearcher is used.