From e46ad8e008fe814f2bda3b81813380a0ddb14eba Mon Sep 17 00:00:00 2001 From: heinrich5991 Date: Mon, 12 Aug 2019 01:15:18 +0200 Subject: [PATCH 1/2] Don't parse dates with more than 4 digits for the year MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regex was broken before, using `(?!…)` instead of `(?<=…)`. --- src/documents/parsers.py | 10 ++++----- src/paperless_tesseract/tests/test_date.py | 24 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 142ebba68..3c0bf19c3 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -21,11 +21,11 @@ # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits DATE_REGEX = re.compile( - r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + - r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' + r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 + r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 + r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 + r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + + r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' ) diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index ac2f9648f..37865df23 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -182,3 +182,27 @@ def test_crazy_date_past(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text() self.assertIsNone(document.get_date()) + + EXTRA = { + "123/04/2020/3423": None, + "-23/04/2020-foo": "2020 04 23", + "-23-04-2020-blurb": "2020 04 23", + # gets parsed as month: 23, day: 04, which is invalid + # "-2020-04-23-bar": "2020 04 23", + "12020-04-23-": None, + "-2020-04-234": None, + } + + @mock.patch(MOCK_SCRATCH, SCRATCH) + def test_date_format_bulk(self): + timezone = tz.gettz(settings.TIME_ZONE) + for input, expected in self.EXTRA.items(): + if expected is not None: + raw = [int(x) for x in expected.split()] + expected = datetime.datetime(*raw, tzinfo=timezone) + + input_file = os.path.join(self.SAMPLE_FILES, "") + document = RasterisedDocumentParser(input_file) + document._text = input + message = "Test case {!r}".format(input) + self.assertEqual(document.get_date(), expected, msg=message) From a2681995274d83668513318410b024f5af076ea2 Mon Sep 17 00:00:00 2001 From: heinrich5991 Date: Mon, 19 Aug 2019 21:56:07 +0200 Subject: [PATCH 2/2] Clean up date regex, don't use lookbehind/lookahead --- src/documents/parsers.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 3c0bf19c3..d44d7b0ac 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -21,11 +21,13 @@ # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits DATE_REGEX = re.compile( - r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + - r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' + r'(?:\b|[_-])(' + + r'[0-9]{1,2}[./-][0-9]{1,2}[./-](?:[0-9]{4}|[0-9]{2})|' + + r'(?:[0-9]{4}|[0-9]{2})[./-][0-9]{1,2}[./-][0-9]{1,2}|' + + r'[0-9]{1,2}[. ]+[^ ]{3,9} (?:[0-9]{4}|[0-9]{2})|' + + r'[^\W\d_]{3,9} [0-9]{1,2}, [0-9]{4}|' + + r'[^\W\d_]{3,9} [0-9]{4}' + + r')(?:\b|[_-])' ) @@ -104,7 +106,7 @@ def __parser(ds, date_order): if self.FILENAME_DATE_ORDER: self.log("info", "Checking document title for date") for m in re.finditer(DATE_REGEX, title): - date_string = m.group(0) + date_string = m.group(1) try: date = __parser(date_string, self.FILENAME_DATE_ORDER) @@ -130,7 +132,7 @@ def __parser(ds, date_order): # Iterate through all regex matches in text and try to parse the date for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) + date_string = m.group(1) try: date = __parser(date_string, self.DATE_ORDER)