This repository has been archived by the owner on Nov 30, 2022. It is now read-only.
/ hmrb Public archive

[NLP-1955] Implement SpacyCore for spaCy 3.0+ (#16)
* feat: add spacy2 and spacy3 nox sessions

* tests: implement spacy3 tests and update spacy2 tests

* tests: refactor spacy tests

* feat: implement spacy 3.0 support

* fix: lint and typing

* docs: update spacy documentation

* refactor: spacy2 example

* feat: implement spacy3 example

* tests: lint spacy tests

* docs: fixes

* refactor: consistent references to hmrb in code
Kristian Boda authored May 14, 2021
1 parent c69a015 commit 038484c
Showing 6 changed files with 240 additions and 42 deletions.
51 changes: 44 additions & 7 deletions docs/advanced.rst
@@ -1,11 +1,11 @@
🤖 Advanced Usage
=================
🤖 spaCy and callbacks
======================


Hammurabi in spaCy pipelines
----------------------------
Hammurabi in spaCy 2.X pipelines
---------------------------------
We provide native support for spaCy through the ``SpacyCore`` object.
The ``SpacyCore`` object can simply be integrated into your existing spaCy pipelines.
The ``SpacyCore`` object can simply be integrated into your existing spaCy 2.X pipelines.


.. code-block:: python
@@ -18,7 +18,45 @@ The ``SpacyCore`` object can simply be integrated into your existing spaCy pipel
core.load(rules)
nlp.add_pipe(core)
``SpacyCore`` takes a *dict* of callbacks, an optional *function* that converts input (to_json) and a *bool* whether to sort and execute in ascending order according to match length.
``SpacyCore`` takes a *dict* of callbacks, an optional *function* (``to_json``) that converts the spaCy doc type to a representation that corresponds to your rules, and a *bool* controlling whether matches are sorted and executed in ascending order of match length.

Once the object is instantiated, you can load rules using the ``.load`` method.
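A minimal sketch of the kind of mapping function ``SpacyCore`` expects as ``map_doc`` — here with a hypothetical ``StubToken`` standing in for spaCy tokens, so it runs without spaCy or a model download:

```python
from dataclasses import dataclass


# Hypothetical stand-in for a spaCy Token; real tokens expose the same attributes.
@dataclass
class StubToken:
    lemma_: str
    pos_: str
    lower_: str


def jsonify_span(span):
    # Convert each token into the flat dict shape the rules match against.
    return [
        {"lemma": t.lemma_, "pos": t.pos_, "lower": t.lower_}
        for t in span
    ]


tokens = [StubToken("love", "VERB", "loves"), StubToken("gorilla", "NOUN", "gorillas")]
print(jsonify_span(tokens))
```

The rules then match against keys like ``lemma`` and ``pos`` in each dict, rather than against spaCy objects directly.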


Hammurabi in spaCy 3.X pipelines
---------------------------------
We also provide native support for spaCy 3.0+. You still have to import the ``SpacyCore`` object to run the component registration, and the configuration syntax is slightly different from 2.x.

We follow the new custom pipeline component API under ``spacy.language`` `[Link] <https://spacy.io/usage/processing-pipelines#custom-components>`_:

First, we have to register both our augmenter function (``map_doc``) and any callback functions we want to call in spaCy's registry.

Second, we have to create a configuration dictionary that contains the rules and references the callbacks and mapping functions as shown in the example below.

Finally, we can add the ``"hmrb"`` pipeline component using our configuration to the spaCy pipeline.

.. code-block:: python

    from hmrb.core import SpacyCore

    @spacy.registry.augmenters("jsonify_span")
    def jsonify_span(span):
        return [
            {"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_}
            for token in span
        ]

    @spacy.registry.callbacks("dummy_callback")
    def dummy_callback(seq: list, span: slice, data: dict) -> None:
        print("OK")

    conf = {
        "rules": GRAMMAR,
        "callbacks": {"my_callback": "callbacks.dummy_callback"},
        "map_doc": "augmenters.jsonify_span",
    }

    nlp.add_pipe("hmrb", config=conf)
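The dotted names in the configuration (e.g. ``"callbacks.dummy_callback"``) are resolved against spaCy's registry via ``registry.get(*name.split("."))``. A toy registry showing the same lookup pattern (the registry contents here are illustrative, not hmrb internals):

```python
# A plain-dict stand-in for spaCy's registry, keyed by "<table>.<name>".
REGISTRY = {
    "callbacks": {"dummy_callback": lambda seq, span, data: print("OK")},
    "augmenters": {"jsonify_span": lambda span: list(span)},
}


def resolve(dotted: str):
    # "callbacks.dummy_callback" -> REGISTRY["callbacks"]["dummy_callback"]
    table, name = dotted.split(".")
    return REGISTRY[table][name]


fn = resolve("callbacks.dummy_callback")
```

Because the config stores strings rather than function objects, it stays serializable, which is what spaCy 3's config system requires.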
Handling Callbacks
------------------
@@ -165,4 +203,3 @@ The ordinal validation behaviour is logically separated from the sentence valida
Note that ``validate_ordinal`` is only responsible for validating the abbreviated ordinal.
If successful, it persists its results in the ``doc`` object. These are picked up by ``validate_Nth_or_Nth_icecream``, which does not perform any additional validation of the ordinal syntax; instead, it checks that the two compared ordinals are different.
This example shows how callbacks can be used to achieve a better segregation of responsibilities.
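The persist-then-pick-up flow described above can be sketched in plain Python (the ``Doc`` class and both functions are simplified, hypothetical stand-ins for the real callbacks):

```python
class Doc:
    # Hypothetical stand-in for a spaCy Doc carrying shared state between callbacks.
    def __init__(self):
        self.results = {}


def validate_ordinal(doc, ordinal_text):
    # First callback: validate the abbreviated ordinal and persist the result.
    # Crude suffix strip for illustration ("3rd" -> "3", "1st" -> "1").
    doc.results["ordinal"] = ordinal_text.rstrip("stndrh")


def compare_ordinals(doc, other_ordinal):
    # Second callback: reuse the persisted result instead of re-validating,
    # and only check that the two ordinals differ.
    return doc.results["ordinal"] != other_ordinal
```

The second callback stays small because it trusts the state the first one left behind — the division of labour the section describes.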
33 changes: 15 additions & 18 deletions examples/spacy2.py
@@ -1,7 +1,7 @@
import spacy

nlp = spacy.load("en_core_web_sm")
sentences = 'I love gorillas. Peter loves gorillas. Jane loves Tarzan.'
sentences = "I love gorillas. Peter loves gorillas. Jane loves Tarzan."


def conj_be(subj: str) -> str:
Expand All @@ -21,44 +21,41 @@ def gorilla_clb(seq: list, span: slice, data: dict) -> None:

def lover_clb(seq: list, span: slice, data: dict) -> None:
print(
f'{seq[span][-1]["text"]} is a love interest of'
f'{seq[span.start]["text"]}.'
f"{seq[span][-1].text} is a love interest of "
f"{seq[span.start].text}."
)


clbs = {"gorilla people": gorilla_clb, "lover": lover_clb}
clbs = {"loves_gorilla": gorilla_clb, "loves_someone": lover_clb}

grammar = """
Law:
- callback: "gorilla people"
- callback: "loves_gorilla"
(
((pos: "PROPN") or (pos: "PRON"))
(lemma: "love")
(lemma: "gorilla")
)
Law:
- callback: "lover"
- callback: "loves_someone"
(
(pos: "PROPN")
(text: "loves")
(lower: "loves")
(pos: "PROPN")
)
"""


def jsonify_span(span):
jsn = []
for token in span:
jsn.append({
'lemma': token.lemma_,
'pos': token.pos_,
'lower': token.lower_,
})
return jsn
return [
{"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_}
for token in span
]


from hmrb.core import SpacyCore
core = SpacyCore(callbacks=clbs,
map_doc=jsonify_span,
sort_length=True)

core = SpacyCore(callbacks=clbs, map_doc=jsonify_span, sort_length=True)

core.load(grammar)
nlp.add_pipe(core)
70 changes: 70 additions & 0 deletions examples/spacy3.py
@@ -0,0 +1,70 @@
import spacy

nlp = spacy.load("en_core_web_sm")
sentences = "I love gorillas. Peter loves gorillas. Jane loves Tarzan."


def conj_be(subj: str) -> str:
if subj == "I":
return "am"
elif subj == "you":
return "are"
else:
return "is"


@spacy.registry.callbacks("gorilla_callback")
def gorilla_clb(seq: list, span: slice, data: dict) -> None:
subj = seq[span.start].text
be = conj_be(subj)
print(f"{subj} {be} a gorilla person.")


@spacy.registry.callbacks("lover_callback")
def lover_clb(seq: list, span: slice, data: dict) -> None:
print(
f"{seq[span][-1].text} is a love interest of "
f"{seq[span.start].text}."
)


grammar = """
Law:
- callback: "loves_gorilla"
(
((pos: "PROPN") or (pos: "PRON"))
(lemma: "love")
(lemma: "gorilla")
)
Law:
- callback: "loves_someone"
(
(pos: "PROPN")
(lower: "loves")
(pos: "PROPN")
)
"""


@spacy.registry.augmenters("jsonify_span")
def jsonify_span(span):
return [
{"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_}
for token in span
]


from hmrb.core import SpacyCore

conf = {
"rules": grammar,
"callbacks": {
"loves_gorilla": "callbacks.gorilla_callback",
"loves_someone": "callbacks.lover_callback",
},
"map_doc": "augmenters.jsonify_span",
"sort_length": True,
}

nlp.add_pipe("hmrb", config=conf)
nlp(sentences)
36 changes: 36 additions & 0 deletions hmrb/core.py
@@ -220,3 +220,39 @@ def __call__(self, doc: Any) -> Any:
logging.info(f"call: {len(protobuf)} match(es)")
super()._execute(protobuf, doc)
return doc


try:
from spacy.language import Language
from spacy import registry

def spacy_factory(
nlp: object,
name: str,
callbacks: dict,
sets: dict,
map_doc: str,
sort_length: bool,
rules: str,
) -> SpacyCore:
map_fn = registry.get(*map_doc.split("."))
callbacks = {
key: registry.get(*value.split(".")) for key, value in callbacks.items()
}
core = SpacyCore(callbacks, sets, map_fn, sort_length)
core.load(rules)
return core

Language.factory(
"hmrb",
default_config={
"callbacks": {},
"sets": {},
"map_doc": _default_map,
"sort_length": False,
"rules": "",
},
func=spacy_factory,
)
except (ImportError, AttributeError):
logging.debug("disabling support for spaCy 3.0+")
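The ``try``/``except`` above registers the spaCy 3 factory only when a compatible spaCy is importable. The same graceful-degradation pattern in isolation (``try_enable`` and the module names are illustrative, not part of hmrb):

```python
import importlib
import logging


def try_enable(module_name: str) -> bool:
    # Attempt to import an optional dependency; degrade gracefully when it is
    # missing or too old, instead of failing at import time.
    try:
        importlib.import_module(module_name)
    except (ImportError, AttributeError):
        logging.debug("optional feature disabled: %s not available", module_name)
        return False
    return True
```

Catching ``AttributeError`` as well covers the case where the module imports but lacks the expected API (e.g. spaCy 2.x has no ``Language.factory`` decorator).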
26 changes: 26 additions & 0 deletions noxfile.py
@@ -62,6 +62,32 @@ def tests(session: Session) -> None:
session.run("coverage", "xml")


@nox.session(python=["3.9"])
def test_spacy2(session: Session) -> None:
session.install("pytest")
session.install("spacy<3.0.0")
session.run("spacy", "download", "en_core_web_sm")
session.install("-r", "requirements.txt")
session.install("-e", ".")
session.run(
"pytest",
"tests/test_spacy.py",
)


@nox.session(python=["3.9"])
def test_spacy3(session: Session) -> None:
session.install("pytest")
session.install("spacy>=3.0.0")
session.run("spacy", "download", "en_core_web_sm")
session.install("-r", "requirements.txt")
session.install("-e", ".")
session.run(
"pytest",
"tests/test_spacy.py",
)


@nox.session(python=["3.7"])
def changelog(session: Session) -> None:
args = session.posargs or ["--unreleased"]
66 changes: 49 additions & 17 deletions tests/test_spacy.py
@@ -1,18 +1,19 @@
import pytest

spacy = pytest.importorskip("spacy")


def jsonify_span(span):
jsn = []
for token in span:
jsn.append({
'lemma': token.lemma_,
'pos': token.pos_,
'lower': token.lower_,
})
return jsn
return [
{"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_}
for token in span
]


def dummy_callback(seq: list, span: slice, data: dict) -> None:
print("OK")


TEXT = "I feel great today."
TEXT2 = "I love icecream."
GRAMMAR = """
@@ -24,22 +25,53 @@ def dummy_callback(seq: list, span: slice, data: dict) -> None:
(lemma: "great")
)
"""
CLBS = {"pytest": dummy_callback}


def test_spacyV2(capsys):
spacy = pytest.importorskip("spacy")
assert spacy.__version__ == "2.3.5"
if spacy.__version__ >= "3.0.0":
pytest.skip(f"Invalid spacy version {spacy.__version__}")
nlp = spacy.load("en_core_web_sm")

from hmrb.core import SpacyCore
core = SpacyCore(callbacks=CLBS,
map_doc=jsonify_span,
sort_length=True)

core = SpacyCore(
callbacks={"pytest": dummy_callback},
map_doc=jsonify_span,
sort_length=True,
)
core.load(GRAMMAR)
nlp.add_pipe(core)
nlp(TEXT)
captured = capsys.readouterr()
assert captured[0] == 'OK\n'
assert captured[0] == "OK\n"
nlp(TEXT2)
captured = capsys.readouterr()
assert captured[0] == ""


def test_spacyV3(capsys):
spacy = pytest.importorskip("spacy")
if spacy.__version__ <= "3.0.0":
pytest.skip(f"Invalid spacy version {spacy.__version__}")
nlp = spacy.load("en_core_web_sm")

@spacy.registry.augmenters("jsonify_span")
def jsonify_span_pointer(span):
return jsonify_span(span)

@spacy.registry.callbacks("dummy_callback")
def dummy_callback_pointer(*args, **kwargs):
return dummy_callback(*args, **kwargs)

conf = {}
conf["rules"] = GRAMMAR
conf["callbacks"] = {"pytest": "callbacks.dummy_callback"}
conf["map_doc"] = "augmenters.jsonify_span"
conf["sort_length"] = True

nlp.add_pipe("hmrb", config=conf)
nlp(TEXT)
captured = capsys.readouterr()
assert captured[0] == "OK\n"
nlp(TEXT2)
captured = capsys.readouterr()
assert captured[0] == ''
assert captured[0] == ""
