From ebeb2491fd75f3e296b163b1979394d24867bde1 Mon Sep 17 00:00:00 2001
From: Nick Budak
Date: Sat, 13 Jul 2024 12:14:03 -0700
Subject: [PATCH] Add match grouping by spans and set this as default output type

Closes #122
---
 dphon/cli.py             |  24 +++++---
 dphon/console.py         | 136 +++++++++++++++++++++------------------
 dphon/match.py           |   4 +-
 dphon/reuse.py           | 135 ++++++++++++++++++++++++++++++++++----
 tests/unit/test_reuse.py | 110 ++++++++++++++++++++++++--------
 5 files changed, 298 insertions(+), 111 deletions(-)

diff --git a/dphon/cli.py b/dphon/cli.py
index 50d89ee..0883b8a 100644
--- a/dphon/cli.py
+++ b/dphon/cli.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """dphon - a tool for old chinese phonetic analysis
-
+
 Usage:
     dphon -h | --help
     dphon --version
-    dphon [-v | -vv] [options] <path>...
-
+    dphon [-v | -vv] [options] <path>...
+
 Global Options:
     -h, --help
         Show this help text and exit.
@@ -38,7 +38,7 @@
 
     -l <NUM>, --len-limit <NUM> [default: 50]
         Compare at most NUM tokens when obtaining the similarity score. A
-        higher number will slow down execution time but return more matches.
+        higher number will slow down execution time but return more matches.
 
     -c <NUM>, --context <NUM> [default: 4]
         Add NUM tokens of context to each side of matches. Context displays with
@@ -56,14 +56,14 @@
         less than the value for "--ngram-order".
 
     --max <NUM> [default: 64]
-        Limit to matches with total number of tokens <= NUM. Must be equal to
+        Limit to matches with total number of tokens <= NUM. Must be equal to
        or greater than the value for "--ngram-order".
 
 Examples:
     dphon texts/*.txt --min 8 > matches.txt
     dphon file1.txt file2.txt --ngram-order 8 --threshold 0.8
     dphon docs.jsonl --input-format jsonl --output-format jsonl > matches.jsonl
-
+
 Help:
     For more information on using this tool, visit the Github repository:
     https://github.com/direct-phonology/dphon
@@ -155,8 +155,9 @@ def run() -> None:
     else:
         # use system pager by default; colorize if LESS=R
         with console.pager(styles=os.getenv("LESS", "") == "R"):
-            for match in results:
-                console.print(match)
+            for doc in graph.docs:
+                for group in doc._.groups:
+                    console.print(group, "\n")
 
 
 def setup(args: Dict) -> Language:
@@ -168,6 +169,8 @@ def setup(args: Dict) -> Language:
     # add Doc metadata
     if not Doc.has_extension("id"):
         Doc.set_extension("id", default="")
+    if not Doc.has_extension("groups"):
+        Doc.set_extension("groups", default=[])
 
     # setup spaCy model
     nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
@@ -192,7 +195,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     start = time.perf_counter()
     for doc, context in nlp.pipe(load_texts(args["<path>"]), as_tuples=True):
         doc._.id = context["id"]
-        graph.add_doc(context["id"], doc)
+        graph.add_doc(doc)
         logging.debug(f'indexed doc "{doc._.id}"')
     stop = time.perf_counter() - start
     logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
@@ -243,6 +246,9 @@
     # align all matches
     graph.align(SmithWatermanPhoneticAligner(gap_char=" "))
 
+    # group all matches
+    graph.group()
+
     # limit via min and max lengths if requested
     if args["--min"]:
         graph.filter(lambda m: len(m) >= int(args["--min"]))
diff --git a/dphon/console.py b/dphon/console.py
index 269c7cf..1605819 100644
--- a/dphon/console.py
+++ b/dphon/console.py
@@ -1,12 +1,12 @@
-from typing import Tuple, List
+from typing import List, Optional, Tuple
 
 from rich.console import Console
 from rich.highlighter import RegexHighlighter
 from rich.theme import Theme
+from spacy.tokens import Span
 
-from .match import Match
 from .g2p import GraphemesToPhonemes
-
+from .match import Match
 
 # Default color scheme for highlighting matches
 DEFAULT_THEME = Theme(
@@ -45,91 +45,103 @@ def format_match(self, match: Match) -> Tuple[str, str]:
         Adds markup for highlighting insertions, mismatches, etc. If context
         is set, also adds highlighted context to either end of the match.
         """
+        return (
+            self.format_span(match.utxt, match.vtxt, match.au, match.av),
+            self.format_span(match.vtxt, match.utxt, match.av, match.au),
+        )
 
-        su, sv = self._mark(match)
+    def transcribe_match(self, match: Match) -> Tuple[str, str]:
+        """Render a phonemic transcription for a Match."""
+        return (self.transcribe_span(match.utxt), self.transcribe_span(match.vtxt))
+
+    def format_span(
+        self,
+        span: Span,
+        other: Optional[Span] = None,
+        alignment: Optional[str] = None,
+        other_alignment: Optional[str] = None,
+    ) -> str:
+        """Return a Span as a Rich format string, with optional context.
+
+        Adds markup for highlighting insertions, mismatches, etc. if a second
+        reference Span is provided. If context is set, also adds highlighted
+        context to either end of the match.
+        """
+        formatted_span = self._mark_span(span, alignment, other, other_alignment)
         if self.context > 0:
-            cul, cur, cvl, cvr = self._add_context(match)
-            su = cul + su + cur
-            sv = cvl + sv + cvr
-        return su, sv
+            context_left, context_right = self._add_span_context(span)
+            formatted_span = context_left + formatted_span + context_right
+        return formatted_span
 
-    def _mark(self, match: Match) -> Tuple[str, str]:
-        """Mark up the match for colorization with a theme.
+    def transcribe_span(self, span: Span) -> str:
+        """Render a phonemic transcription for a Span."""
+        return "*" + " ".join(span._.syllables)
 
-        - Adds markup for insertions (tokens in one sequence but not another).
+    def _mark_span(
+        self,
+        span: Span,
+        alignment: Optional[str],
+        other: Optional[Span],
+        other_alignment: Optional[str],
+    ) -> str:
+        """Mark up a Span for colorization with a theme, in relation to another Span.
+
+        - Adds markup for insertions (tokens in one Span but not another).
         - Adds markup for mismatches (differing tokens in the same position).
         - Adds markup for graphic variants (mismatches with same phonemes).
         """
-
-        # if no alignment, just convert to strings because we can't highlight
-        if not match.au or not match.av:
-            return match.utxt.text, match.vtxt.text
+        # if no alignment, just return the text because we can't highlight
+        if not alignment or not other or not other_alignment:
+            return span.text
 
         # o(N) implementation: step through each sequence adding markup
-        # TODO convert to a DFA so there's less markup repetition
-        su: List[str] = []
-        sv: List[str] = []
-        u_ptr = 0
-        v_ptr = 0
-        for i in range(len(match)):
-
+        # TODO convert to a DFA so there's less markup repetition?
+        marked_span: List[str] = []
+        span_ptr = 0
+        other_ptr = 0
+        for i in range(len(alignment)):
             # gap in u: insertion in v (if not punctuation)
-            if match.au[i] == self.gap_char and match.av[i].isalnum():
-                su.append(match.au[i])
-                sv.append(f"[insertion]{match.av[i]}[/insertion]")
-                v_ptr += 1
+            if alignment[i] == self.gap_char and other_alignment[i].isalnum():
+                marked_span.append(alignment[i])
+                other_ptr += 1
                 continue
 
             # gap in v: insertion in u (if not punctuation)
-            if match.av[i] == self.gap_char and match.au[i].isalnum():
-                su.append(f"[insertion]{match.au[i]}[/insertion]")
-                sv.append(match.av[i])
-                u_ptr += 1
+            if other_alignment[i] == self.gap_char and alignment[i].isalnum():
+                marked_span.append(f"[insertion]{alignment[i]}[/insertion]")
+                span_ptr += 1
                 continue
 
             # variants (both u and v)
-            if self.g2p.are_graphic_variants(match.utxt[u_ptr], match.vtxt[v_ptr]):
-                su.append(f"[variant]{match.au[i]}[/variant]")
-                sv.append(f"[variant]{match.av[i]}[/variant]")
-                u_ptr += 1
-                v_ptr += 1
+            if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
+                marked_span.append(f"[variant]{alignment[i]}[/variant]")
+                span_ptr += 1
+                other_ptr += 1
                 continue
 
             # mismatch (both u and v) - only highlight if alphanumeric
-            if match.au[i] != match.av[i]:
-                if match.au[i].isalnum() and match.av[i].isalnum():
-                    su.append(f"[mismatch]{match.au[i]}[/mismatch]")
-                    sv.append(f"[mismatch]{match.av[i]}[/mismatch]")
-                    u_ptr += 1
-                    v_ptr += 1
+            if alignment[i] != other_alignment[i]:
+                if alignment[i].isalnum() and other_alignment[i].isalnum():
+                    marked_span.append(f"[mismatch]{alignment[i]}[/mismatch]")
+                    span_ptr += 1
+                    other_ptr += 1
                     continue
 
             # equality; nothing to highlight
-            su.append(match.au[i])
-            sv.append(match.av[i])
-            u_ptr += 1
-            v_ptr += 1
+            marked_span.append(alignment[i])
+            span_ptr += 1
+            other_ptr += 1
 
-        return "".join(su), "".join(sv)
+        return "".join(marked_span)
 
-    def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
-        """Add context to either side of the match sequences.
+    def _add_span_context(self, span: Span) -> Tuple[str, str]:
+        """Add context to either side of the Span.
 
         Context coloration can be changed by the default theme; a dim
         appearance is used in terminals.
""" - - utxt, vtxt = match.utxt, match.vtxt - u, v = utxt.doc, vtxt.doc - cul = f"[context]{u[utxt.start-self.context:utxt.start]}[/context]" - cur = f"[context]{u[utxt.end:utxt.end+self.context]}[/context]" - cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]" - cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]" - return (cul, cur, cvl, cvr) - - def transcription(self, match: Match) -> Tuple[str, str]: - """Get the phonemic transcription for the match for display.""" - return ( - "*" + " ".join(match.utxt._.syllables), - "*" + " ".join(match.vtxt._.syllables), + context_left = ( + f"[context]{span.doc[span.start-self.context:span.start]}[/context]" ) + context_right = f"[context]{span.doc[span.end:span.end+self.context]}[/context]" + return context_left, context_right diff --git a/dphon/match.py b/dphon/match.py index 2475cc5..d66fe7d 100644 --- a/dphon/match.py +++ b/dphon/match.py @@ -6,9 +6,9 @@ from typing import Dict, List, NamedTuple import Levenshtein as Lev -from rich.padding import Padding from rich.console import Console, ConsoleOptions, RenderResult -from spacy.tokens import Span +from rich.padding import Padding +from spacy.tokens import Doc, Span class Match(NamedTuple): diff --git a/dphon/reuse.py b/dphon/reuse.py index eed1be8..fbc8f21 100644 --- a/dphon/reuse.py +++ b/dphon/reuse.py @@ -2,21 +2,94 @@ # -*- coding: utf-8 -*- """Classes for analyzing text reuse.""" -from itertools import combinations -from typing import Callable, Iterable, Iterator, Tuple +from functools import cached_property +from itertools import combinations, groupby +from typing import Callable, Iterable, Iterator from networkx import MultiGraph, create_empty_copy -from rich.progress import Progress, BarColumn, SpinnerColumn -from spacy.tokens import Doc +from rich.console import Console, ConsoleOptions, RenderResult +from rich.progress import BarColumn, Progress, SpinnerColumn +from spacy.tokens import Doc, Span from .align import Aligner +from .console import err_console from .extend import Extender, extend_matches from .match import Match -from .console import err_console -class MatchGraph: +class MatchGroup: + """A group of matches with common bounds in a single document.""" + + def __init__( + self, doc: Doc, start: int, end: int, matches: Iterable[Match] + ) -> None: + self.doc = doc + self.start = start + self.end = end + self.matches = list(matches) + + def __len__(self) -> int: + return len(self.matches) + + def __rich_console__( + self, console: Console, options: ConsoleOptions + ) -> RenderResult: + """Format the group for display in console.""" + render_results = [] + + # render the "anchor" span first (i.e., the span that all matches share) + render_results += [ + f"[bold]{self.doc._.id}[/bold] ({self.start}–{self.end-1}):", + console.highlighter.format_span(self.anchor_span), + console.highlighter.transcribe_span(self.anchor_span), + ] + + # render the non-anchor spans from each match in the group + for i, match in enumerate(self.matches): + span = self.non_anchor_span(match) + alignment = self.non_anchor_alignment(match) + anchor_alignment = self.anchor_alignment(match) + render_results += [ + f"{i + 1}. 
{span.doc._.id} ({span.start}–{span.end-1}):", + console.highlighter.format_span( + span, self.anchor_span, alignment, anchor_alignment + ), + console.highlighter.transcribe_span(span), + ] + + return render_results + + @cached_property + def anchor_span(self) -> Span: + """Get the anchor span for the group.""" + return self.doc[self.start : self.end] + + def anchor_alignment(self, match: Match) -> str: + """Get the anchor alignment for a given match.""" + if match.u == self.doc._.id: + return match.au + if match.v == self.doc._.id: + return match.av + raise ValueError("Match does not belong to document.", match, self.doc) + + def non_anchor_span(self, match: Match) -> Span: + """Get the non-anchor span for a given match.""" + if match.u == self.doc._.id: + return match.vtxt + if match.v == self.doc._.id: + return match.utxt + raise ValueError("Match does not belong to document.", match, self.doc) + + def non_anchor_alignment(self, match: Match) -> str: + """Get the non-anchor alignment for a given match.""" + if match.u == self.doc._.id: + return match.av + if match.v == self.doc._.id: + return match.au + raise ValueError("Match does not belong to document.", match, self.doc) + +class MatchGraph: _G: MultiGraph def __init__(self) -> None: @@ -50,13 +123,17 @@ def number_of_docs(self) -> int: """Total number of documents in the graph.""" return self._G.number_of_nodes() - def add_doc(self, label: str, doc: Doc) -> None: + def add_doc(self, doc: Doc, label: str = None) -> None: """Add a single document to the graph.""" - self._G.add_node(label, doc=doc) + doc_id = label or doc._.id + if not doc_id: + raise ValueError("Document must have an identifier.", doc) + doc._.id = doc_id + self._G.add_node(doc_id, doc=doc) - def add_docs(self, docs: Iterable[Tuple[str, Doc]]) -> None: + def add_docs(self, docs: Iterable[Doc]) -> None: """Add a collection of documents to the graph.""" - self._G.add_nodes_from(((label, {"doc": doc}) for label, doc in docs)) + [self.add_doc(doc) for doc in docs] def add_match(self, match: Match) -> None: """Add a single match to the graph.""" @@ -64,7 +141,7 @@ def add_match(self, match: Match) -> None: def add_matches(self, matches: Iterable[Match]) -> None: """Add a collection of matches to the graph.""" - self._G.add_edges_from([(m.u, m.v, m._asdict()) for m in matches]) + [self.add_match(match) for match in matches] def extend(self, extender: Extender) -> None: """Extend all matches in the graph using a provided strategy.""" @@ -108,9 +185,41 @@ def align(self, align: Aligner) -> None: self._G = G self.progress.remove_task(task) + def group(self) -> None: + """Group all matches in the graph by their shared spans.""" + # track progress + task = self.progress.add_task( + "grouping", u="", v="", total=self.number_of_matches() + ) + + # iterate through each document and group all matches that target it + with self.progress: + for doc in self.docs: + self.progress.update(task, u=doc) + edges = self._G.edges(doc._.id, data=True) + matches = [Match(**data) for _u, _v, data in edges] + for span, group in groupby( + sorted(matches, key=_bounds_in(doc)), key=_bounds_in(doc) + ): + doc._.groups.append(MatchGroup(doc, span[0], span[1], group)) + self.progress.update(task, advance=len(edges)) + self.progress.remove_task(task) + def filter(self, predicate: Callable[[Match], bool]) -> None: """Filter all matches in the graph using a provided predicate.""" G = create_empty_copy(self._G) filtered = filter(predicate, self.matches) G.add_edges_from([(m.u, m.v, m._asdict()) for m in 
         self._G = G
+
+
+# helper for getting bounds of a match in a given document
+def _bounds_in(doc):
+    def _bounds(match):
+        if match.utxt.doc == doc:
+            return match.utxt.start, match.utxt.end
+        if match.vtxt.doc == doc:
+            return match.vtxt.start, match.vtxt.end
+        raise ValueError("Match does not belong to document.", match, doc)
+
+    return _bounds
diff --git a/tests/unit/test_reuse.py b/tests/unit/test_reuse.py
index ad29cfd..94f421b 100644
--- a/tests/unit/test_reuse.py
+++ b/tests/unit/test_reuse.py
@@ -4,10 +4,11 @@
 from unittest import TestCase, skip
 
 import spacy
-from dphon.reuse import MatchGraph
 from spacy.tokens import Doc
+
 from dphon.extend import LevenshteinExtender
 from dphon.match import Match
+from dphon.reuse import MatchGraph
 
 # disconnect logging for testing
 logging.captureWarnings(True)
@@ -22,38 +23,93 @@ class TestMatchGraph(TestCase):
     def setUp(self) -> None:
         """create a spaCy pipeline and match graph for testing"""
         self.nlp = spacy.blank(
-            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
-        self.G = MatchGraph()
+            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}}
+        )
         if not Doc.has_extension("id"):
             Doc.set_extension("id", default="")
-        # doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾必謂之學矣")
-        # doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾必謂之未也")
-        # doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾亦謂之學矣")
+        if not Doc.has_extension("groups"):
+            Doc.set_extension("groups", default=[])
 
     def test_extend(self) -> None:
         """extend should reduce graph to maximal matches only"""
         doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
         doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾")
         doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾")
-        self.G.add_docs([("論語·學而", doc1),
-                         ("藝文類聚·錢", doc2),
-                         ("顏氏家訓·勉學", doc3)])
-        self.G.add_matches([
-            Match("論語·學而", "藝文類聚·錢", doc1[0:4], doc2[0:4]),  # 與朋友交
-            Match("論語·學而", "藝文類聚·錢", doc1[4:8], doc2[4:8]),  # 言而有信
-            Match("論語·學而", "顏氏家訓·勉學", doc1[0:4], doc3[0:4]),  # 與朋友交
-            Match("論語·學而", "顏氏家訓·勉學", doc1[4:8], doc3[4:8]),  # 言而有信
-            Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[0:4], doc3[0:4]),  # 與朋友交
-            Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[4:8], doc3[4:8]),  # 言而有信
-        ])
+        doc1._.id = "論語·學而"
+        doc2._.id = "藝文類聚·錢"
+        doc3._.id = "顏氏家訓·勉學"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match("論語·學而", "藝文類聚·錢", doc1[0:4], doc2[0:4]),  # 與朋友交
+                Match("論語·學而", "藝文類聚·錢", doc1[4:8], doc2[4:8]),  # 言而有信
+                Match("論語·學而", "顏氏家訓·勉學", doc1[0:4], doc3[0:4]),  # 與朋友交
+                Match("論語·學而", "顏氏家訓·勉學", doc1[4:8], doc3[4:8]),  # 言而有信
+                Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[0:4], doc3[0:4]),  # 與朋友交
+                Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[4:8], doc3[4:8]),  # 言而有信
+            ]
+        )
         extender = LevenshteinExtender(threshold=0.8, len_limit=50)
-        self.G.extend(extender)
-        matches = [(m.u, m.v, m.utxt.text, m.vtxt.text)
-                   for m in self.G.matches]
-        self.assertEqual(len(matches), 3)
-        self.assertEqual(matches[0], ("論語·學而", "藝文類聚·錢",
-                                      "與朋友交言而有信雖曰未學吾", "與朋友交言而有信雖曰已學吾"))
-        self.assertEqual(matches[1], ("論語·學而", "顏氏家訓·勉學",
-                                      "與朋友交言而有信雖曰未學吾", "與朋友交言而有信雖未讀書吾"))
-        self.assertEqual(matches[2], ("藝文類聚·錢", "顏氏家訓·勉學",
-                                      "與朋友交言而有信雖", "與朋友交言而有信雖"))
+        G.extend(extender)
+        matches = [(m.u, m.v, m.utxt.text, m.vtxt.text) for m in G.matches]
+        self.assertEqual(len(matches), 3, "should have 3 matches")
+        self.assertEqual(
+            matches[0],
+            (
+                "論語·學而",
+                "藝文類聚·錢",
+                "與朋友交言而有信雖曰未學吾",
+                "與朋友交言而有信雖曰已學吾",
+            ),
+        )
+        self.assertEqual(
+            matches[1],
+            (
+                "論語·學而",
+                "顏氏家訓·勉學",
+                "與朋友交言而有信雖曰未學吾",
+                "與朋友交言而有信雖未讀書吾",
+            ),
+        )
+        self.assertEqual(
+            matches[2],
+            (
+                "藝文類聚·錢",
+                "顏氏家訓·勉學",
+                "與朋友交言而有信雖",
+                "與朋友交言而有信雖",
+            ),
+        )
+
+    def test_group(self) -> None:
+        """grouping should group matches by shared spans"""
+        doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
+        doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾")
+        doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾")
+        doc1._.id = "論語·學而"
+        doc2._.id = "藝文類聚·錢"
+        doc3._.id = "顏氏家訓·勉學"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match(
+                    "論語·學而", "藝文類聚·錢", doc1[0:8], doc2[0:8]
+                ),  # 與朋友交言而有信
+                Match(
+                    "論語·學而", "顏氏家訓·勉學", doc1[0:8], doc3[0:8]
+                ),  # 與朋友交言而有信
+                Match(
+                    "藝文類聚·錢", "顏氏家訓·勉學", doc2[0:8], doc3[0:8]
+                ),  # 與朋友交言而有信
+            ]
+        )
+        G.group()
+        self.assertEqual(len(doc1._.groups), 1)
+        self.assertEqual(len(doc2._.groups), 1)
+        self.assertEqual(len(doc3._.groups), 1)
+        group = doc1._.groups[0]
+        self.assertEqual(group.start, 0)
+        self.assertEqual(group.end, 8)
+        self.assertEqual(len(group), 2)
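
Usage note (editorial, not part of the patch): the grouping flow added above can be exercised end to end roughly as follows. This is a minimal sketch, assuming dphon with this patch applied and the blank spaCy "zh" pipeline used in the tests; the doc ids and texts are illustrative, and rich rendering of groups is skipped in favor of a plain print().

import spacy
from spacy.tokens import Doc

from dphon.match import Match
from dphon.reuse import MatchGraph

# register the Doc extensions that cli.setup() would normally add
nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
for name, default in (("id", ""), ("groups", [])):
    if not Doc.has_extension(name):
        Doc.set_extension(name, default=default)

doc1 = nlp.make_doc("與朋友交言而有信雖曰未學吾")
doc2 = nlp.make_doc("與朋友交言而有信雖曰已學吾")
doc1._.id, doc2._.id = "論語·學而", "藝文類聚·錢"

graph = MatchGraph()
graph.add_docs([doc1, doc2])  # doc ids are now read from doc._.id
graph.add_matches([Match("論語·學而", "藝文類聚·錢", doc1[0:8], doc2[0:8])])
graph.group()  # one MatchGroup per set of matches sharing bounds in a doc

# each doc now carries its groups in doc._.groups
for doc in graph.docs:
    for group in doc._.groups:
        print(doc._.id, group.start, group.end, len(group))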
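Design note: the core of MatchGraph.group() is just sort-then-groupby over a per-document bounds key (the _bounds_in closure), so matches whose spans share identical (start, end) in the anchor document collapse into one MatchGroup. A toy sketch of that primitive with stand-in tuples (hypothetical data, not from the patch):

from itertools import groupby

# (doc id, start, end) tuples stand in for Match objects
matches = [("論語·學而", 0, 8), ("論語·學而", 0, 8), ("論語·學而", 9, 12)]

def bounds(m):
    return (m[1], m[2])

for span_bounds, bucket in groupby(sorted(matches, key=bounds), key=bounds):
    print(span_bounds, len(list(bucket)))
# (0, 8) 2   -> one group holding two matches
# (9, 12) 1  -> one group holding one match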