Return source white space string (#95)

* fix: return actual white space string
* test: white_space returns source string
* fix: surface length rather than raw length
* Add a few more tests and use escape characters

  Escape characters make the source easier to read.
* Cache whitespace to avoid invalidation issues
* Remove commented code
* Add whitespace clobber test

---------

Co-authored-by: Paul O'Leary McCann <[email protected]>
polm · Nov 10, 2024 · bd3d34b · bd3d34b
1 parent 38f679b
commit bd3d34b
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 9 deletions.
diff --git a/fugashi/fugashi.pyx b/fugashi/fugashi.pyx
@@ -40,6 +40,7 @@ cdef class Node:
     feature string, which is an untokenized CSV string."""
     cdef const mecab_node_t* c_node
     cdef str _surface
+    cdef str _ws
     cdef object features
     cdef object wrapper
 
@@ -60,10 +61,6 @@ cdef class Node:
     @property
     def surface(self):
         if self._surface is None:
-            #self._surface = self.c_node.surface[:self.c_node.length].decode('utf-8')
-            #base = self._offset + (self.c_node.rlength - self.c_node.length)
-            #end = self._offset + self.c_node.rlength
-            #self._surface = self.__cstr[end - self.c_node.length:end].decode('utf-8')
             pass
         return self._surface
 
@@ -107,11 +104,13 @@ cdef class Node:
 
     @property
     def white_space(self):
-        # The half-width spaces before the token, if any.
-        if self.length == self.rlength:
+        if self._ws is None:
             return ''
-        else:
-            return ' ' * (self.rlength - self.length)
+        return self._ws
+
+    @white_space.setter
+    def white_space(self, ws):
+        self._ws = ws
 
     cdef list pad_none(self, list fields):
         try:
@@ -285,6 +284,15 @@ cdef class GenericTagger:
                 self._cache[shash] = sys.intern(surf.decode("utf-8"))
             nn.surface = self._cache[shash]
 
+            # Likewise cache and intern the whitespace that precedes this token
+            nodelen = node.rlength - node.length
+            pnode = node.prev
+            ws = pnode.surface[pnode.length : pnode.length + nodelen]
+            wshash = hash(ws)
+            if wshash not in self._cache:
+                self._cache[wshash] = sys.intern(ws.decode("utf-8"))
+            nn.white_space = self._cache[wshash]
+
             out.append(nn)
 
     def nbest(self, text, num=10):

diff --git a/fugashi/tests/test_basic.py b/fugashi/tests/test_basic.py
@@ -28,6 +28,16 @@
         ('稻村に行きました', ['0,2', '*', '0', '*', '*']),
         )
 
+# Each case: (text, expected white space, index of the token the white space precedes)
+WHITE_SPACE_TESTS = (
+        ("これは 半角スペースです", " ", 2),
+        ("これは\tタブ文字です", "\t", 2),
+        ("これは\n改行文字です", "\n", 2),
+        ("これは\n\t 複数種類の空白文字です", "\n\t ", 2),
+        ("これは\n\t 複数種類の空白文字です", "\n\t ", 2),
+        ("\tタブ文字で始まる文字列", "\t", 0),
+        )
+
 @pytest.mark.parametrize('text,wakati', WAKATI_TESTS)
 def test_wakati(text, wakati):
     tagger = Tagger('-Owakati')
@@ -82,7 +92,15 @@ def test_accent(text, accent):
 def test_clobber():
     # Check that memory isn't clobbered by repeated parse calls
     tagger = Tagger()
-    nodes1 = tagger("a b c d")
+    nodes1 = tagger("a\tb c d")
     nodes2 = tagger("x y z !")
 
     assert "a b c d".split() == [nn.surface for nn in nodes1]
+    assert ["", "\t", " ", " "] == [nn.white_space for nn in nodes1]
+
+@pytest.mark.parametrize("text,space,idx", WHITE_SPACE_TESTS)
+def test_white_space(text, space, idx):
+    tagger = Tagger()
+    nodes = tagger.parseToNodeList(text)
+
+    assert nodes[idx].white_space == space