From 038484caaa073ce9525fdd73eef3aa53fe35ee83 Mon Sep 17 00:00:00 2001 From: Kristian Boda Date: Fri, 14 May 2021 15:54:23 +0100 Subject: [PATCH] [NLP-1955] Implement SpacyCore for spaCy 3.0+ (#16) * feat: add spacy2 and spacy3 nox sessions * tests: implement spacy3 tests and update spacy2 tests * tests: refactor spacy tests * feat: implement spacy 3.0 support * fix: lint and typing * docs: update spacy documentation * refactor: spacy2 example * feat: implement spacy3 example * tests: lint spacy tests * docs: fixes * refactor: consistence references to hmrb in code --- docs/advanced.rst | 51 ++++++++++++++++++++++++++++----- examples/spacy2.py | 33 ++++++++++----------- examples/spacy3.py | 70 +++++++++++++++++++++++++++++++++++++++++++++ hmrb/core.py | 36 +++++++++++++++++++++++ noxfile.py | 26 +++++++++++++++++ tests/test_spacy.py | 66 +++++++++++++++++++++++++++++++----------- 6 files changed, 240 insertions(+), 42 deletions(-) create mode 100644 examples/spacy3.py diff --git a/docs/advanced.rst b/docs/advanced.rst index 1d59cb8..27c329b 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -1,11 +1,11 @@ -🤖 Advanced Usage -================= +🤖 spaCy and callbacks +====================== -Hammurabi in spaCy pipelines ----------------------------- +Hammurabi in spaCy 2.X pipelines +--------------------------------- We provide native support for spaCy through the ``SpacyCore`` object. -The ``SpacyCore`` object can simply be integrated into your existing spaCy pipelines. +The ``SpacyCore`` object can simply be integrated into your existing spaCy 2.X pipelines. .. code-block:: python @@ -18,7 +18,45 @@ The ``SpacyCore`` object can simply be integrated into your existing spaCy pipel core.load(rules) nlp.add_pipe(core) -``SpacyCore`` takes a *dict* of callbacks, an optional *function* that converts input (to_json) and a *bool* whether to sort and execute in ascending order according to match length. 
+``SpacyCore`` takes a *dict* of callbacks, an optional *function* that converts spaCy doc type (to_json) to a representation that corresponds to your rules and a *bool* whether to sort and execute in ascending order according to match length. + +Once the object is instantiated, you can load rules using the ``.load`` method. + + +Hammurabi in spaCy 3.X pipelines +--------------------------------- +We also provide native support for spaCy 3.0+. You still have to import the `SpacyCore` object to run the component registration and the configuration syntax is slightly different versus 2.0. + +We follow the new custom pipeline component API under ``spacy.language`` `[Link] <https://spacy.io/usage/processing-pipelines#custom-components>`_: + +First, we have to register both our augmenter functions `map_doc` and any callback functions we would call in spaCy's registry. + +Second, we have to create a configuration dictionary that contains the rules and references the callbacks and mapping functions as shown in the example below. + +Finally, we can add the ``"hmrb"`` pipeline component using our configuration to the spaCy pipeline. + +.. code-block:: python + + from hmrb.core import SpacyCore + + @spacy.registry.augmenters("jsonify_span") + def jsonify_span(span): + return [ + {"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_} + for token in span + ] + + @spacy.registry.callbacks("dummy_callback") + def dummy_callback(seq: list, span: slice, data: dict) -> None: + print("OK") + + conf = { + "rules": GRAMMAR, + "callbacks": {"my_callback": "callbacks.dummy_callback"}, + "map_doc": "augmenters.jsonify_span" + } + nlp.add_pipe("hmrb", config=conf) + Handling Callbacks ------------------ @@ -165,4 +203,3 @@ The ordinal validation behaviour is logically separated from the sentence valida Note that `validate_ordinal` is only responsible for validating the abbreviated ordinal. If successful, it persists its results in the `doc` object. 
These will be picked up by `validate_Nth_or_Nth_icecream`, which does not perform any additional validation of the ordinal syntax. Instead, it checks that the two compared ordinals are different. This example shows how frequent callback usage can be used to achieve better segregation of responsibility. - diff --git a/examples/spacy2.py b/examples/spacy2.py index b3998b2..3751403 100644 --- a/examples/spacy2.py +++ b/examples/spacy2.py @@ -1,7 +1,7 @@ import spacy nlp = spacy.load("en_core_web_sm") -sentences = 'I love gorillas. Peter loves gorillas. Jane loves Tarzan.' +sentences = "I love gorillas. Peter loves gorillas. Jane loves Tarzan." def conj_be(subj: str) -> str: @@ -21,44 +21,41 @@ def gorilla_clb(seq: list, span: slice, data: dict) -> None: def lover_clb(seq: list, span: slice, data: dict) -> None: print( - f'{seq[span][-1]["text"]} is a love interest of' - f'{seq[span.start]["text"]}.' + f"{seq[span][-1].text} is a love interest of " + f"{seq[span.start].text}." ) -clbs = {"gorilla people": gorilla_clb, "lover": lover_clb} +clbs = {"loves_gorilla": gorilla_clb, "loves_someone": lover_clb} grammar = """ Law: -- callback: "gorilla people" +- callback: "loves_gorilla" ( ((pos: "PROPN") or (pos: "PRON")) (lemma: "love") (lemma: "gorilla") ) Law: -- callback: "lover" +- callback: "loves_someone" ( (pos: "PROPN") -(text: "loves") +(lower: "loves") (pos: "PROPN") ) """ + def jsonify_span(span): - jsn = [] - for token in span: - jsn.append({ - 'lemma': token.lemma_, - 'pos': token.pos_, - 'lower': token.lower_, - }) - return jsn + return [ + {"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_} + for token in span + ] + from hmrb.core import SpacyCore -core = SpacyCore(callbacks=clbs, - map_doc=jsonify_span, - sort_length=True) + +core = SpacyCore(callbacks=clbs, map_doc=jsonify_span, sort_length=True) core.load(grammar) nlp.add_pipe(core) diff --git a/examples/spacy3.py b/examples/spacy3.py new file mode 100644 index 0000000..0b42122 --- /dev/null 
+++ b/examples/spacy3.py @@ -0,0 +1,70 @@ +import spacy + +nlp = spacy.load("en_core_web_sm") +sentences = "I love gorillas. Peter loves gorillas. Jane loves Tarzan." + + +def conj_be(subj: str) -> str: + if subj == "I": + return "am" + elif subj == "you": + return "are" + else: + return "is" + + +@spacy.registry.callbacks("gorilla_callback") +def gorilla_clb(seq: list, span: slice, data: dict) -> None: + subj = seq[span.start].text + be = conj_be(subj) + print(f"{subj} {be} a gorilla person.") + + +@spacy.registry.callbacks("lover_callback") +def lover_clb(seq: list, span: slice, data: dict) -> None: + print( + f"{seq[span][-1].text} is a love interest of " + f"{seq[span.start].text}." + ) + + +grammar = """ +Law: +- callback: "loves_gorilla" +( +((pos: "PROPN") or (pos: "PRON")) +(lemma: "love") +(lemma: "gorilla") +) +Law: +- callback: "loves_someone" +( +(pos: "PROPN") +(lower: "loves") +(pos: "PROPN") +) +""" + + +@spacy.registry.augmenters("jsonify_span") +def jsonify_span(span): + return [ + {"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_} + for token in span + ] + + +from hmrb.core import SpacyCore + +conf = { + "rules": grammar, + "callbacks": { + "loves_gorilla": "callbacks.gorilla_callback", + "loves_someone": "callbacks.lover_callback", + }, + "map_doc": "augmenters.jsonify_span", + "sort_length": True, +} + +nlp.add_pipe("hmrb", config=conf) +nlp(sentences) diff --git a/hmrb/core.py b/hmrb/core.py index d484ace..108047a 100644 --- a/hmrb/core.py +++ b/hmrb/core.py @@ -220,3 +220,39 @@ def __call__(self, doc: Any) -> Any: logging.info(f"call: {len(protobuf)} match(es)") super()._execute(protobuf, doc) return doc + + +try: + from spacy.language import Language + from spacy import registry + + def spacy_factory( + nlp: object, + name: str, + callbacks: dict, + sets: dict, + map_doc: str, + sort_length: bool, + rules: str, + ) -> SpacyCore: + map_fn = registry.get(*map_doc.split(".")) + callbacks = { + key: registry.get(*value.split(".")) 
for key, value in callbacks.items() + } + core = SpacyCore(callbacks, sets, map_fn, sort_length) + core.load(rules) + return core + + Language.factory( + "hmrb", + default_config={ + "callbacks": {}, + "sets": {}, + "map_doc": _default_map, + "sort_length": False, + "rules": "", + }, + func=spacy_factory, + ) +except (ImportError, AttributeError): + logging.debug("disabling support for spaCy 3.0+") diff --git a/noxfile.py b/noxfile.py index 375ddac..6e13592 100644 --- a/noxfile.py +++ b/noxfile.py @@ -62,6 +62,32 @@ def tests(session: Session) -> None: session.run("coverage", "xml") +@nox.session(python=["3.9"]) +def test_spacy2(session: Session) -> None: + session.install("pytest") + session.install("spacy<3.0.0") + session.run("spacy", "download", "en_core_web_sm") + session.install("-r", "requirements.txt") + session.install("-e", ".") + session.run( + "pytest", + "tests/test_spacy.py", + ) + + +@nox.session(python=["3.9"]) +def test_spacy3(session: Session) -> None: + session.install("pytest") + session.install("spacy>=3.0.0") + session.run("spacy", "download", "en_core_web_sm") + session.install("-r", "requirements.txt") + session.install("-e", ".") + session.run( + "pytest", + "tests/test_spacy.py", + ) + + @nox.session(python=["3.7"]) def changelog(session: Session) -> None: args = session.posargs or ["--unreleased"] diff --git a/tests/test_spacy.py b/tests/test_spacy.py index 29b7c57..17fb380 100644 --- a/tests/test_spacy.py +++ b/tests/test_spacy.py @@ -1,18 +1,19 @@ import pytest +spacy = pytest.importorskip("spacy") + + def jsonify_span(span): - jsn = [] - for token in span: - jsn.append({ - 'lemma': token.lemma_, - 'pos': token.pos_, - 'lower': token.lower_, - }) - return jsn + return [ + {"lemma": token.lemma_, "pos": token.pos_, "lower": token.lower_} + for token in span + ] + def dummy_callback(seq: list, span: slice, data: dict) -> None: print("OK") + TEXT = "I feel great today." TEXT2 = "I love icecream." 
GRAMMAR = """ @@ -24,22 +25,53 @@ def dummy_callback(seq: list, span: slice, data: dict) -> None: (lemma: "great") ) """ -CLBS = {"pytest": dummy_callback} + def test_spacyV2(capsys): - spacy = pytest.importorskip("spacy") - assert spacy.__version__ == "2.3.5" + if spacy.__version__ >= "3.0.0": + pytest.skip(f"Invalid spacy version {spacy.__version__}") nlp = spacy.load("en_core_web_sm") - from hmrb.core import SpacyCore - core = SpacyCore(callbacks=CLBS, - map_doc=jsonify_span, - sort_length=True) + + core = SpacyCore( + callbacks={"pytest": dummy_callback}, + map_doc=jsonify_span, + sort_length=True, + ) core.load(GRAMMAR) nlp.add_pipe(core) nlp(TEXT) captured = capsys.readouterr() - assert captured[0] == 'OK\n' + assert captured[0] == "OK\n" + nlp(TEXT2) + captured = capsys.readouterr() + assert captured[0] == "" + + +def test_spacyV3(capsys): + spacy = pytest.importorskip("spacy") + if spacy.__version__ < "3.0.0": + pytest.skip(f"Invalid spacy version {spacy.__version__}") + nlp = spacy.load("en_core_web_sm") + + @spacy.registry.augmenters("jsonify_span") + def jsonify_span_pointer(span): + return jsonify_span(span) + + @spacy.registry.callbacks("dummy_callback") + def dummy_callback_pointer(*args, **kwargs): + return dummy_callback(*args, **kwargs) + + conf = {} + conf["rules"] = GRAMMAR + conf["callbacks"] = {"pytest": "callbacks.dummy_callback"} + conf["map_doc"] = "augmenters.jsonify_span" + conf["sort_length"] = True + + nlp.add_pipe("hmrb", config=conf) + nlp(TEXT) + captured = capsys.readouterr() + assert captured[0] == "OK\n" nlp(TEXT2) captured = capsys.readouterr() - assert captured[0] == '' + assert captured[0] == ""