diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 142ebba68..d44d7b0ac 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -21,11 +21,13 @@ # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits DATE_REGEX = re.compile( - r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 - r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + - r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' + r'(?:\b|[_-])(' + + r'[0-9]{1,2}[./-][0-9]{1,2}[./-](?:[0-9]{4}|[0-9]{2})|' + + r'(?:[0-9]{4}|[0-9]{2})[./-][0-9]{1,2}[./-][0-9]{1,2}|' + + r'[0-9]{1,2}[. ]+[^ ]{3,9} (?:[0-9]{4}|[0-9]{2})|' + + r'[^\W\d_]{3,9} [0-9]{1,2}, [0-9]{4}|' + + r'[^\W\d_]{3,9} [0-9]{4}' + + r')(?:\b|[_-])' ) @@ -104,7 +106,7 @@ def __parser(ds, date_order): if self.FILENAME_DATE_ORDER: self.log("info", "Checking document title for date") for m in re.finditer(DATE_REGEX, title): - date_string = m.group(0) + date_string = m.group(1) try: date = __parser(date_string, self.FILENAME_DATE_ORDER) @@ -130,7 +132,7 @@ def __parser(ds, date_order): # Iterate through all regex matches in text and try to parse the date for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) + date_string = m.group(1) try: date = __parser(date_string, self.DATE_ORDER) diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index ac2f9648f..37865df23 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -182,3 +182,27 @@ def test_crazy_date_past(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text() self.assertIsNone(document.get_date()) + + EXTRA = { + "123/04/2020/3423": None, + "-23/04/2020-foo": "2020 04 23", + "-23-04-2020-blurb": "2020 04 23", + # gets parsed as month: 23, day: 04, which is invalid + # "-2020-04-23-bar": "2020 04 23", + "12020-04-23-": None, + "-2020-04-234": None, + } + + @mock.patch(MOCK_SCRATCH, SCRATCH) + def test_date_format_bulk(self): + timezone = tz.gettz(settings.TIME_ZONE) + for input, expected in self.EXTRA.items(): + if expected is not None: + raw = [int(x) for x in expected.split()] + expected = datetime.datetime(*raw, tzinfo=timezone) + + input_file = os.path.join(self.SAMPLE_FILES, "") + document = RasterisedDocumentParser(input_file) + document._text = input + message = "Test case {!r}".format(input) + self.assertEqual(document.get_date(), expected, msg=message)