From c814a2157665ce5b20c999213d5c7ba07be8889c Mon Sep 17 00:00:00 2001 From: Zeno Jiricek Date: Wed, 18 Sep 2024 21:32:41 +0930 Subject: [PATCH] fix: date fields are treated as strings (#44) There were many non-obvious issues in config file parsing, in matchers, and in the correlation between matcher config and the format you should be comparing against. So now, when extracting, use `ExtractorBase.parse_date`, as it will emit a date string in the format acceptable to beancount. From there on, your matchers can expect to work with dates as strings in that format, as you see them in your journals. Specifying a date format in your extractor classes or in your extractor config is merely there to convert the date string found in your source files (pdf, csv, etc.). --- beancount_importer_rules/extractor.py | 8 ++- beancount_importer_rules/includes.py | 40 +++++++++++- .../processor/matchers.py | 20 +++--- beancount_importer_rules/templates.py | 16 +++++ tests/fixtures/engine/imported/.gitignore | 1 - tests/test_engine.py | 4 ++ tests/test_match_dates.py | 13 +++- tests/test_match_regex.py | 63 +++++++++++++++++++ tests/test_templates.py | 18 ++++++ 9 files changed, 167 insertions(+), 16 deletions(-) delete mode 100644 tests/fixtures/engine/imported/.gitignore create mode 100644 tests/test_match_regex.py create mode 100644 tests/test_templates.py diff --git a/beancount_importer_rules/extractor.py b/beancount_importer_rules/extractor.py index ba793dd..1a22d0b 100644 --- a/beancount_importer_rules/extractor.py +++ b/beancount_importer_rules/extractor.py @@ -129,9 +129,13 @@ def process(self): """ name: str = "extractor" + """The name of the extractor. 
Will end up being available to matchers as `extractor`""" date_field: str = "Date" - date_format: str = "%Y-%m-%d" - datetime_format: str = "%Y-%m-%d %H:%M:%S" + """The field in the CSV file that contains the date""" + date_format: str = "YYYY-MM-DD" + """Arrow date format""" + datetime_format: str = "YYYY-MM-DD HH:mm:ss" + """Arrow datetime format""" def __init__( self, diff --git a/beancount_importer_rules/includes.py b/beancount_importer_rules/includes.py index e95f9fe..614dc2a 100644 --- a/beancount_importer_rules/includes.py +++ b/beancount_importer_rules/includes.py @@ -4,6 +4,7 @@ """ import pathlib +import sys import yaml from pydantic import TypeAdapter @@ -17,10 +18,45 @@ RuleListAdapter = TypeAdapter(list[ImportRule | IncludeRule]) +class NoDatesSafeLoader(yaml.SafeLoader): + @classmethod + def remove_implicit_resolver(cls, tag_to_remove): + """ + Remove implicit resolvers for a particular tag + + Takes care not to modify resolvers in super classes. + + We want to load datetimes as strings, not dates, because we + go on to serialise as json which doesn't have the advanced types + of yaml, and leads to incompatibilities down the track. 
+ """ + if "yaml_implicit_resolvers" not in cls.__dict__: + cls.yaml_implicit_resolvers = cls.yaml_implicit_resolvers.copy() + + for first_letter, mappings in cls.yaml_implicit_resolvers.items(): + cls.yaml_implicit_resolvers[first_letter] = [ + (tag, regexp) for tag, regexp in mappings if tag != tag_to_remove + ] + + +NoDatesSafeLoader.remove_implicit_resolver("tag:yaml.org,2002:timestamp") + + def load_includes(workdir_path: pathlib.Path, include_path: pathlib.Path) -> ImportList: with include_path.open("rt") as fo: - rules = yaml.safe_load(fo) - imported = RuleListAdapter.validate_python(rules) + rules = yaml.load(fo, Loader=NoDatesSafeLoader) + try: + imported = RuleListAdapter.validate_python(rules) + except Exception as e: + # pretty print the error + print( + f"Error loading include file: {include_path}.\n\n" + f"{e}\n\n" + f"Include file content:\n" + f"{yaml.dump(rules, indent=2)}" + ) + sys.exit(1) + return resolve_includes(workdir_path=workdir_path, rules=imported) diff --git a/beancount_importer_rules/processor/matchers.py b/beancount_importer_rules/processor/matchers.py index 3435d4d..e51320a 100644 --- a/beancount_importer_rules/processor/matchers.py +++ b/beancount_importer_rules/processor/matchers.py @@ -1,13 +1,14 @@ +import datetime import pathlib import re from beancount_importer_rules.data_types import ( SimpleTxnMatchRule, - StrExactMatch, StrMatch, StrRegexMatch, Transaction, TxnMatchVars, + stringify_value, ) @@ -26,23 +27,22 @@ def match_file(pattern: StrMatch, filepath: pathlib.Path | pathlib.PurePath) -> return pattern.test(str(filepath)) -def match_str(pattern: StrMatch, value: str | None) -> bool: +def match_str( + pattern: StrMatch, value: str | datetime.date | datetime.datetime | None +) -> bool: if value is None: return False - if pattern is None: - return True - - if pattern == value: - return True - + # Most patterns that are just strings are valid regexes. 
if isinstance(pattern, str) and is_valid_regex(pattern): pattern = StrRegexMatch(regex=pattern) + # if the pattern turns out to not be a regex, we can just compare the strings. if isinstance(pattern, str): - pattern = StrExactMatch(equals=pattern) + return stringify_value(value) == pattern - return pattern.test(value) + # otherwise we assume it's a complex matcher + return pattern.test(stringify_value(value)) def match_transaction( diff --git a/beancount_importer_rules/templates.py b/beancount_importer_rules/templates.py index c4b7f8d..4258af3 100644 --- a/beancount_importer_rules/templates.py +++ b/beancount_importer_rules/templates.py @@ -1,4 +1,5 @@ import pathlib +from datetime import date, datetime from jinja2.sandbox import SandboxedEnvironment @@ -7,7 +8,22 @@ def as_posix_path(path: pathlib.Path) -> str: return pathlib.Path(path).as_posix() +def as_datetime(value): + return datetime.strptime(value, "%Y-%m-%d") + + +def as_date(value) -> date: + return datetime.strptime(value, "%Y-%m-%d").date() + + +def datetime_format(value, format="%H:%M %d-%m-%y") -> str: + return datetime.strftime(value, format) + + def make_environment(): env = SandboxedEnvironment() + env.filters["as_date"] = as_date + env.filters["as_datetime"] = as_datetime + env.filters["datetime_format"] = datetime_format env.filters["as_posix_path"] = as_posix_path return env diff --git a/tests/fixtures/engine/imported/.gitignore b/tests/fixtures/engine/imported/.gitignore deleted file mode 100644 index 1639587..0000000 --- a/tests/fixtures/engine/imported/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.bean diff --git a/tests/test_engine.py b/tests/test_engine.py index 84d3b4e..21e6e84 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -23,6 +23,10 @@ def test_engine_run(): config_path = FIXTURE_FOLDER / "engine" / "import.yaml" beanfile_path = FIXTURE_FOLDER / "engine" / "books" / "main.bean" + # remove any existing output files + for f in (workdir / "books" / "imported").glob("*.bean"): 
+ f.unlink() + engine = ImportRuleEngine( workdir=str(workdir), config_path=str(config_path), diff --git a/tests/test_match_dates.py b/tests/test_match_dates.py index f49c84f..93b283e 100644 --- a/tests/test_match_dates.py +++ b/tests/test_match_dates.py @@ -107,6 +107,16 @@ now.format("YYYY-MM-DD"), True, ), + ( + now.format("YYYY-MM-DD"), + now.format("YYYY-MM-DD"), + True, + ), + ( + now.shift(days=1).format("YYYY-MM-DD"), + now.format("YYYY-MM-DD"), + False, + ), ], ) def test_match_dates( @@ -119,4 +129,5 @@ def test_match_dates( value: str | None, expected: bool, ): - assert match_str(pattern, value) == expected + outcome = match_str(pattern, value) == expected + assert outcome diff --git a/tests/test_match_regex.py b/tests/test_match_regex.py new file mode 100644 index 0000000..1613b03 --- /dev/null +++ b/tests/test_match_regex.py @@ -0,0 +1,63 @@ +import arrow +import pytest + +from beancount_importer_rules.data_types import StrRegexMatch +from beancount_importer_rules.processor.matchers import ( + match_str, +) + +now = arrow.utcnow() + + +@pytest.mark.parametrize( + "pattern, value, expected", + [ + ( + r"2021-01-01", + r"2021-01-01", + True, + ), + ( + r"2021-01-01", + r"2021-01-02", + False, + ), + ( + r"2021-01-01", + None, + False, + ), + ( + r"2021-01-01", + now.format("YYYY-MM-DD"), + False, + ), + ( + r"2021-01-01", + "2021-01-01", + True, + ), + ( + r"2021-01-01", + "2021-01-02", + False, + ), + ( + r"2021-01-.*", + "2021-01-02", + True, + ), + ( + "2021", + "2021-01-02", + True, + ), + ], +) +def test_match_regex( + pattern: str | StrRegexMatch, + value: str | None, + expected: bool, +): + outcome = match_str(pattern, value) == expected + assert outcome diff --git a/tests/test_templates.py b/tests/test_templates.py new file mode 100644 index 0000000..823e003 --- /dev/null +++ b/tests/test_templates.py @@ -0,0 +1,18 @@ +from beancount_importer_rules.templates import make_environment + + +def test_make_environment(): + env = make_environment() + 
assert env + assert env.filters["as_date"] + assert env.filters["as_datetime"] + assert env.filters["datetime_format"] + assert env.filters["as_posix_path"] + + +def test_format_datetime(): + env = make_environment() + template = "{{ date | as_date | datetime_format('%Y') }}" + result = env.from_string(template).render({"date": "2022-01-01"}) + + assert result == "2022"