diff --git a/.gitignore b/.gitignore index 24212224..ce4d214f 100644 --- a/.gitignore +++ b/.gitignore @@ -124,4 +124,6 @@ ENV/ # mypy .mypy_cache/ -.idea \ No newline at end of file +# ide +.idea +.vscode \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ca71574..1906ec98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.0.2 (2024-02-15) + +### Changed +- recognize 4+ spaces as a token, blocking annotations + ## 3.0.1 (2023-12-20) ### Fixed diff --git a/deduce/tokenizer.py b/deduce/tokenizer.py index dbe67df4..d0a41350 100644 --- a/deduce/tokenizer.py +++ b/deduce/tokenizer.py @@ -3,7 +3,7 @@ import docdeid as dd import regex -_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?"] maintainers = ["Vincent Menger "] diff --git a/tests/unit/test_tokenizer.py b/tests/unit/test_tokenizer.py index 6553f9a8..130c6cec 100644 --- a/tests/unit/test_tokenizer.py +++ b/tests/unit/test_tokenizer.py @@ -47,6 +47,22 @@ def test_split_nonalpha(self): assert tokenizer._split_text(text=text) == expected_tokens + def test_split_multiple_spaces(self): + tokenizer = DeduceTokenizer() + text = "Pieter van der Zee     Bergen Op  Zoom" + expected_tokens = [ + dd.Token(text="Pieter", start_char=0, end_char=6), + dd.Token(text="van", start_char=7, end_char=10), + dd.Token(text="der", start_char=11, end_char=14), + dd.Token(text="Zee", start_char=15, end_char=18), + dd.Token(text="     ", start_char=18, end_char=23), + dd.Token(text="Bergen", start_char=23, end_char=29), + dd.Token(text="Op", start_char=30, end_char=32), + dd.Token(text="Zoom", start_char=34, end_char=38), + ] + + assert tokenizer._split_text(text=text) == expected_tokens + def test_split_newline(self): tokenizer = DeduceTokenizer() text = "regel 1 \n 
gevolgd door regel 2"