diff --git a/dateparser/search/search.py b/dateparser/search/search.py index aa71c7299..1a6ea79c8 100644 --- a/dateparser/search/search.py +++ b/dateparser/search/search.py @@ -86,10 +86,15 @@ def split_if_not_parsed(self, item, original): possible_splits.extend(self.split_by(item, original, splitter)) return possible_splits - def parse_item(self, parser, item, translated_item, parsed, need_relative_base): + def parse_item(self, parser, item, translated_item, parsed, need_relative_base, language=None): relative_base = None - item = item.replace('ngày', '') - item = item.replace('am', '') + + # Replacing language specific words with special meaning + if language == "de": + item = re.sub(r'am (?=\d)', '', item) + if language == "vi": + item = re.sub(r'ngày (?=\d)', '', item) + parsed_item = parser.get_date_data(item) is_relative = date_is_relative(translated_item) @@ -101,7 +106,7 @@ def parse_item(self, parser, item, translated_item, parsed, need_relative_base): parsed_item = parser.get_date_data(item) return parsed_item, is_relative - def parse_found_objects(self, parser, to_parse, original, translated, settings): + def parse_found_objects(self, parser, to_parse, original, translated, settings, language=None): parsed = [] substrings = [] need_relative_base = True @@ -111,7 +116,7 @@ def parse_found_objects(self, parser, to_parse, original, translated, settings): if len(item) <= 2: continue - parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base) + parsed_item, is_relative = self.parse_item(parser, item, translated[i], parsed, need_relative_base, language) if parsed_item['date_obj']: parsed.append((parsed_item, is_relative)) substrings.append(original[i].strip(" .,:()[]-'")) @@ -131,7 +136,7 @@ def parse_found_objects(self, parser, to_parse, original, translated, settings): if len(jtem) <= 2: continue parsed_jtem, is_relative_jtem = self.parse_item( - parser, jtem, split_translated[j], current_parsed, need_relative_base) + parser, jtem, split_translated[j], current_parsed, need_relative_base, language) current_parsed.append((parsed_jtem, is_relative_jtem)) current_substrings.append(split_original[j].strip(' .,:()[]-')) possible_parsed.append(current_parsed) @@ -152,10 +157,9 @@ def search_parse(self, shortname, text, settings): else: languages = [shortname] to_parse = original - parser = DateDataParser(languages=languages, settings=settings) parsed, substrings = self.parse_found_objects(parser=parser, to_parse=to_parse, - original=original, translated=translated, settings=settings) + original=original, translated=translated, settings=settings, language=shortname) parser._settings = Settings() return list(zip(substrings, [i[0]['date_obj'] for i in parsed])) diff --git a/tests/test_search.py b/tests/test_search.py index 067601569..876b77636 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -461,15 +461,16 @@ def test_search_and_parse(self, shortname, string, expected, settings=None): January UTC June 5 am utc June 23th 5 pm EST - May 31, 8am UTC""", + May 31 + 8am UTC""", [('May 2020', datetime.datetime(2020, 5, datetime.datetime.utcnow().day, 0, 0)), ('June 2020', datetime.datetime(2020, 6, datetime.datetime.utcnow().day, 0, 0)), ('2023', datetime.datetime(2023, 6, datetime.datetime.utcnow().day, 0, 0)), ('January UTC', datetime.datetime(2023, 1, datetime.datetime.utcnow().day, 0, 0, tzinfo=pytz.utc)), - ('June 5 am utc', datetime.datetime(2023, 6, 5, 0, 0, tzinfo=pytz.utc)), + ('June 5 am utc', datetime.datetime(2023, 6, datetime.datetime.utcnow().day, 5, 0, tzinfo=pytz.utc)), ('June 23th 5 pm EST', datetime.datetime(2023, 6, 23, 17, 0, tzinfo=pytz.timezone("EST"))), ('May 31', datetime.datetime(2023, 5, 31, 0, 0)), - ('8am UTC', datetime.datetime(2023, 8, 31, 0, 0, tzinfo=pytz.utc))]), + ('8am UTC', datetime.datetime(2023, 5, 31, 8, 0, tzinfo=pytz.utc))]), # Russian param('ru', '19 марта 2001 был хороший день. 20 марта тоже был хороший день. 21 марта был отличный день.',