From 7cc44408102154a9cfb45e81d09441c84eb55617 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Thu, 24 Aug 2023 09:30:09 +0200 Subject: [PATCH 1/2] Make datetime fieldtypes timezone aware (#78) - Record datetime fields are now offset-aware by default - Naive datetime fields are converted to UTC - Support for packing/unpacking aware datetimes Note that comparing datetime fields to naive datetime objects will now raise a TypeError, which is in line with default Python behaviour. To ensure uniform output, datetime fields are displayed in UTC by default. To use a different display timezone you can set the environment variable `FLOW_RECORD_TZ`. Examples: - `FLOW_RECORD_TZ=UTC` to display datetime fields in UTC, this is the default - `FLOW_RECORD_TZ=Europe/Amsterdam` to display datetime fields in local time of the Netherlands - `FLOW_RECORD_TZ=NONE` to disable the datetime display normalisation --------- Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- flow/record/adapter/elastic.py | 2 +- flow/record/base.py | 5 +- flow/record/fieldtypes/__init__.py | 94 +++++++++++++++++--------- flow/record/jsonpacker.py | 2 +- flow/record/packer.py | 15 +++-- flow/record/stream.py | 4 +- pyproject.toml | 2 + tests/_utils.py | 4 +- tests/test_fieldtypes.py | 102 ++++++++++++++++++++++------- tests/test_json_packer.py | 4 +- tests/test_multi_timestamp.py | 26 ++++---- tests/test_packer.py | 8 ++- tests/test_rdump.py | 78 ++++++++++++++++++++++ tests/test_record_adapter.py | 2 +- tests/test_regression.py | 4 +- tests/test_selector.py | 4 +- tox.ini | 2 +- 17 files changed, 265 insertions(+), 93 deletions(-) diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py index 82f3647..71fd6f2 100644 --- a/flow/record/adapter/elastic.py +++ b/flow/record/adapter/elastic.py @@ -99,7 +99,7 @@ def __init__( index: str = "records", http_compress: Union[str, bool] = True, selector: Union[None, Selector, CompiledSelector] = None, - **kwargs + **kwargs, ) -> None: self.index = 
index self.uri = uri diff --git a/flow/record/base.py b/flow/record/base.py index eeea6d8..b9bfd2e 100644 --- a/flow/record/base.py +++ b/flow/record/base.py @@ -12,7 +12,7 @@ import re import sys import warnings -from datetime import datetime +from datetime import datetime, timezone from itertools import zip_longest from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple from urllib.parse import parse_qsl, urlparse @@ -44,6 +44,7 @@ from .whitelist import WHITELIST, WHITELIST_TREE log = logging.getLogger(__package__) +_utcnow = functools.partial(datetime.now, timezone.utc) RECORD_VERSION = 1 RESERVED_FIELDS = OrderedDict( @@ -422,7 +423,7 @@ def _generate_record_class(name: str, fields: Tuple[Tuple[str, str]]) -> type: _globals = { "Record": Record, "RECORD_VERSION": RECORD_VERSION, - "_utcnow": datetime.utcnow, + "_utcnow": _utcnow, "_zip_longest": zip_longest, } for field in all_fields.values(): diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index ca7d0af..d4d4e1a 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -1,20 +1,19 @@ +from __future__ import annotations + import binascii import math import os import pathlib import re +import sys +import warnings from binascii import a2b_hex, b2a_hex from datetime import datetime as _dt from datetime import timezone from posixpath import basename, dirname -from typing import Any, Tuple - -try: - import urlparse -except ImportError: - import urllib.parse as urlparse - -import warnings +from typing import Any, Optional, Tuple +from urllib.parse import urlparse +from zoneinfo import ZoneInfo, ZoneInfoNotFoundError from flow.record.base import FieldType @@ -22,6 +21,12 @@ RE_STRIP_NANOSECS = re.compile(r"(\.\d{6})\d+") NATIVE_UNICODE = isinstance("", str) +UTC = timezone.utc +ISO_FORMAT = "%Y-%m-%dT%H:%M:%S%z" +ISO_FORMAT_WITH_MS = "%Y-%m-%dT%H:%M:%S.%f%z" + +PY_311 = sys.version_info >= (3, 11, 0) + PATH_POSIX = 0 
PATH_WINDOWS = 1 @@ -32,6 +37,31 @@ path_type = pathlib.PurePath +def flow_record_tz(*, default_tz: str = "UTC") -> Optional[ZoneInfo | UTC]: + """Return a ``ZoneInfo`` object based on the ``FLOW_RECORD_TZ`` environment variable. + + Args: + default_tz: Default timezone if ``FLOW_RECORD_TZ`` is not set (default: UTC). + + Returns: + None if ``FLOW_RECORD_TZ=NONE`` otherwise ``ZoneInfo(FLOW_RECORD_TZ)`` or ``UTC`` if ZoneInfo is not found. + """ + tz = os.environ.get("FLOW_RECORD_TZ", default_tz) + if tz.upper() == "NONE": + return None + try: + return ZoneInfo(tz) + except ZoneInfoNotFoundError as exc: + warnings.warn(f"{exc!r}, falling back to timezone.utc") + return UTC + + +# The environment variable ``FLOW_RECORD_TZ`` affects the display of datetime fields. +# +# The timezone to use when displaying datetime fields. By default this is UTC. +DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") + + def defang(value: str) -> str: """Defangs the value to make URLs or ip addresses unclickable""" value = re.sub("^http://", "hxxp://", value, flags=re.IGNORECASE) @@ -238,24 +268,24 @@ def __new__(cls, *args, **kwargs): # String constructor is used for example in JsonRecordAdapter # Note: ISO 8601 is fully implemented in fromisoformat() from Python 3.11 and onwards. # Until then, we need to manually detect timezone info and handle it. - if any(z in arg[19:] for z in ["Z", "+", "-"]): - if "." in arg[19:]: - try: - return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S.%f%z") - except ValueError: - # Sometimes nanoseconds need to be stripped - return cls.strptime(re.sub(RE_STRIP_NANOSECS, "\\1", arg), "%Y-%m-%dT%H:%M:%S.%f%z") - return cls.strptime(arg, "%Y-%m-%dT%H:%M:%S%z") + if not PY_311 and any(z in arg[19:] for z in ["Z", "+", "-"]): + spec = ISO_FORMAT_WITH_MS if "." 
in arg[19:] else ISO_FORMAT + try: + obj = cls.strptime(arg, spec) + except ValueError: + # Sometimes nanoseconds need to be stripped + obj = cls.strptime(re.sub(RE_STRIP_NANOSECS, "\\1", arg), spec) else: try: - return cls.fromisoformat(arg) + obj = cls.fromisoformat(arg) except ValueError: # Sometimes nanoseconds need to be stripped - return cls.fromisoformat(re.sub(RE_STRIP_NANOSECS, "\\1", arg)) + obj = cls.fromisoformat(re.sub(RE_STRIP_NANOSECS, "\\1", arg)) elif isinstance(arg, (int, float_type)): - return cls.utcfromtimestamp(arg) + obj = cls.fromtimestamp(arg, UTC) elif isinstance(arg, (_dt,)): - return _dt.__new__( + tzinfo = arg.tzinfo or UTC + obj = _dt.__new__( cls, arg.year, arg.month, @@ -264,24 +294,24 @@ def __new__(cls, *args, **kwargs): arg.minute, arg.second, arg.microsecond, - arg.tzinfo, + tzinfo, ) + else: + obj = _dt.__new__(cls, *args, **kwargs) - return _dt.__new__(cls, *args, **kwargs) - - def __eq__(self, other): - # Avoid TypeError: can't compare offset-naive and offset-aware datetimes - # naive datetimes are treated as UTC in flow.record instead of local time - ts1 = self.timestamp() if self.tzinfo else self.replace(tzinfo=timezone.utc).timestamp() - ts2 = other.timestamp() if other.tzinfo else other.replace(tzinfo=timezone.utc).timestamp() - return ts1 == ts2 + # Ensure we always return a timezone aware datetime. 
Treat naive datetimes as UTC + if obj.tzinfo is None: + obj = obj.replace(tzinfo=UTC) + return obj def _pack(self): return self + def __str__(self): + return self.astimezone(DISPLAY_TZINFO).isoformat(" ") if DISPLAY_TZINFO else self.isoformat(" ") + def __repr__(self): - result = str(self) - return result + return str(self) def __hash__(self): return _dt.__hash__(self) @@ -462,7 +492,7 @@ def _unpack(cls, data): class uri(string, FieldType): def __init__(self, value): - self._parsed = urlparse.urlparse(value) + self._parsed = urlparse(value) @staticmethod def normalize(path): diff --git a/flow/record/jsonpacker.py b/flow/record/jsonpacker.py index af8dec3..7808d24 100644 --- a/flow/record/jsonpacker.py +++ b/flow/record/jsonpacker.py @@ -58,7 +58,7 @@ def pack_obj(self, obj): } return serial if isinstance(obj, datetime): - serial = obj.strftime("%Y-%m-%dT%H:%M:%S.%f") + serial = obj.isoformat() return serial if isinstance(obj, fieldtypes.digest): return { diff --git a/flow/record/packer.py b/flow/record/packer.py index b8835b9..cc0c8ec 100644 --- a/flow/record/packer.py +++ b/flow/record/packer.py @@ -1,6 +1,6 @@ -import datetime import functools import warnings +from datetime import datetime, timezone import msgpack @@ -29,6 +29,8 @@ RECORD_PACK_TYPE_VARINT = 0x11 RECORD_PACK_TYPE_GROUPEDRECORD = 0x12 +UTC = timezone.utc + def identifier_to_str(identifier): if isinstance(identifier, tuple) and len(identifier) == 2: @@ -61,9 +63,11 @@ def register(self, desc, notify=False): def pack_obj(self, obj, unversioned=False): packed = None - if isinstance(obj, datetime.datetime): - t = obj.utctimetuple()[:6] + (obj.microsecond,) - packed = (RECORD_PACK_TYPE_DATETIME, t) + if isinstance(obj, datetime): + if obj.tzinfo is None or obj.tzinfo == UTC: + packed = (RECORD_PACK_TYPE_DATETIME, (*obj.timetuple()[:6], obj.microsecond)) + else: + packed = (RECORD_PACK_TYPE_DATETIME, (obj.isoformat(),)) elif isinstance(obj, int): neg = obj < 0 @@ -102,8 +106,7 @@ def unpack_obj(self, t, 
data): subtype, value = self.unpack(data) if subtype == RECORD_PACK_TYPE_DATETIME: - dt = fieldtypes.datetime(*value) - return dt + return fieldtypes.datetime(*value) if subtype == RECORD_PACK_TYPE_VARINT: neg, h = value diff --git a/flow/record/stream.py b/flow/record/stream.py index d7f71dd..76835ef 100644 --- a/flow/record/stream.py +++ b/flow/record/stream.py @@ -191,7 +191,7 @@ def __init__(self, path_template=None, name=None): def rotate_existing_file(self, path): if os.path.exists(path): - now = datetime.datetime.utcnow() + now = datetime.datetime.now(datetime.timezone.utc) src = os.path.realpath(path) src_dir = os.path.dirname(src) @@ -226,7 +226,7 @@ def record_stream_for_path(self, path): return self.writer def write(self, record): - ts = record._generated or datetime.datetime.utcnow() + ts = record._generated or datetime.datetime.now(datetime.timezone.utc) path = self.path_template.format(name=self.name, record=record, ts=ts) rs = self.record_stream_for_path(path) rs.write(record) diff --git a/pyproject.toml b/pyproject.toml index da3d157..e0c5b7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ classifiers = [ ] dependencies = [ "msgpack>=0.5.2", + "backports.zoneinfo[tzdata]; python_version<'3.9'", + "tzdata; platform_system=='Windows'", ] dynamic = ["version"] diff --git a/tests/_utils.py b/tests/_utils.py index 6cf1584..fcaf4d1 100644 --- a/tests/_utils.py +++ b/tests/_utils.py @@ -19,7 +19,7 @@ def generate_records(count=100): ) for i in range(count): - embedded = TestRecordEmbedded(datetime.datetime.utcnow()) + embedded = TestRecordEmbedded(datetime.datetime.now(datetime.timezone.utc)) yield TestRecord(number=i, record=embedded) @@ -33,4 +33,4 @@ def generate_plain_records(count=100): ) for i in range(count): - yield TestRecord(number=i, dt=datetime.datetime.utcnow()) + yield TestRecord(number=i, dt=datetime.datetime.now(datetime.timezone.utc)) diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py index f1500b6..3f6e831 
100644 --- a/tests/test_fieldtypes.py +++ b/tests/test_fieldtypes.py @@ -1,9 +1,9 @@ # coding: utf-8 -import datetime import hashlib import os import pathlib +from datetime import datetime, timedelta, timezone import pytest @@ -18,6 +18,8 @@ from flow.record.fieldtypes import datetime as dt from flow.record.fieldtypes import fieldtype_for_value, net, uri +UTC = timezone.utc + INT64_MAX = (1 << 63) - 1 INT32_MAX = (1 << 31) - 1 INT16_MAX = (1 << 15) - 1 @@ -398,29 +400,29 @@ def test_datetime(): ], ) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) r = TestRecord(now) assert r.ts == now r = TestRecord("2018-03-22T15:15:23") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.000000") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.123456") - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) - dt = datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + dt = datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) dt_str = dt.isoformat() r = TestRecord(dt_str) assert r.ts == dt r = TestRecord(1521731723) - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, tzinfo=UTC) r = TestRecord(1521731723.123456) - assert r.ts == datetime.datetime(2018, 3, 22, 15, 15, 23, 123456) + assert r.ts == datetime(2018, 3, 22, 15, 15, 23, 123456, tzinfo=UTC) r = TestRecord("2018-03-22T15:15:23.123456") test = {r.ts: "Success"} @@ -430,18 +432,18 @@ def test_datetime(): @pytest.mark.parametrize( "value,expected_dt", [ - ("2023-12-31T13:37:01.123456Z", datetime.datetime(2023, 12, 31, 13, 37, 1, 123456)), - ("2023-01-10T16:12:01+00:00", datetime.datetime(2023, 1, 10, 16, 12, 1)), - ("2023-01-10T16:12:01", datetime.datetime(2023, 
1, 10, 16, 12, 1)), - ("2023-01-10T16:12:01Z", datetime.datetime(2023, 1, 10, 16, 12, 1)), - ("2022-12-01T13:00:23.499460Z", datetime.datetime(2022, 12, 1, 13, 0, 23, 499460)), - ("2019-09-26T07:58:30.996+0200", datetime.datetime(2019, 9, 26, 5, 58, 30, 996000)), - ("2011-11-04T00:05:23+04:00", datetime.datetime(2011, 11, 3, 20, 5, 23)), - ("2023-01-01T12:00:00+01:00", datetime.datetime(2023, 1, 1, 11, 0, 0, tzinfo=datetime.timezone.utc)), - ("2006-11-10T14:29:55.5851926", datetime.datetime(2006, 11, 10, 14, 29, 55, 585192)), - ("2006-11-10T14:29:55.585192699999999", datetime.datetime(2006, 11, 10, 14, 29, 55, 585192)), - (datetime.datetime(2023, 1, 1, tzinfo=datetime.timezone.utc), datetime.datetime(2023, 1, 1)), - (0, datetime.datetime(1970, 1, 1, 0, 0)), + ("2023-12-31T13:37:01.123456Z", datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=UTC)), + ("2023-01-10T16:12:01+00:00", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2023-01-10T16:12:01", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2023-01-10T16:12:01Z", datetime(2023, 1, 10, 16, 12, 1, tzinfo=UTC)), + ("2022-12-01T13:00:23.499460Z", datetime(2022, 12, 1, 13, 0, 23, 499460, tzinfo=UTC)), + ("2019-09-26T07:58:30.996+0200", datetime(2019, 9, 26, 5, 58, 30, 996000, tzinfo=UTC)), + ("2011-11-04T00:05:23+04:00", datetime(2011, 11, 3, 20, 5, 23, tzinfo=UTC)), + ("2023-01-01T12:00:00+01:00", datetime(2023, 1, 1, 11, 0, 0, tzinfo=UTC)), + ("2006-11-10T14:29:55.5851926", datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC)), + ("2006-11-10T14:29:55.585192699999999", datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC)), + (datetime(2023, 1, 1, tzinfo=UTC), datetime(2023, 1, 1, tzinfo=UTC)), + (0, datetime(1970, 1, 1, 0, 0, tzinfo=UTC)), ], ) def test_datetime_formats(tmp_path, value, expected_dt): @@ -740,7 +742,7 @@ def test_fieldtype_for_value(): assert fieldtype_for_value(1.337) == "float" assert fieldtype_for_value(b"\r\n") == "bytes" assert fieldtype_for_value("hello world") == "string" - assert 
fieldtype_for_value(datetime.datetime.now()) == "datetime" + assert fieldtype_for_value(datetime.now()) == "datetime" assert fieldtype_for_value([1, 2, 3, 4, 5]) == "string" assert fieldtype_for_value([1, 2, 3, 4, 5], None) is None assert fieldtype_for_value(object(), None) is None @@ -775,7 +777,7 @@ def test_dynamic(): assert r.value == [1, 2, 3] assert isinstance(r.value, flow.record.fieldtypes.stringlist) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) r = TestRecord(now) assert r.value == now assert isinstance(r.value, flow.record.fieldtypes.datetime) @@ -899,11 +901,63 @@ def test_datetime_handle_nanoseconds_without_timezone(): d2 = dt("2006-11-10T14:29:55") assert isinstance(d1, dt) assert isinstance(d2, dt) - assert d1 == datetime.datetime(2006, 11, 10, 14, 29, 55, 585192) + assert d1 == datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC) assert d1.microsecond == 585192 - assert d2 == datetime.datetime(2006, 11, 10, 14, 29, 55) + assert d2 == datetime(2006, 11, 10, 14, 29, 55, tzinfo=UTC) assert d2.microsecond == 0 +@pytest.mark.parametrize( + "record_filename", + [ + "out.records.gz", + "out.records", + "out.json", + "out.jsonl", + ], +) +def test_datetime_timezone_aware(tmp_path, record_filename): + TestRecord = RecordDescriptor( + "test/tz", + [ + ("datetime", "ts"), + ], + ) + tz = timezone(timedelta(hours=1)) + stamp = datetime.now(tz) + + with RecordWriter(tmp_path / record_filename) as writer: + record = TestRecord(stamp) + writer.write(record) + assert record.ts == stamp + assert record.ts.utcoffset() == timedelta(hours=1) + assert record._generated.tzinfo == UTC + + with RecordReader(tmp_path / record_filename) as reader: + for record in reader: + assert record.ts == stamp + assert record.ts.utcoffset() == timedelta(hours=1) + assert record._generated.tzinfo == UTC + + +def test_datetime_comparisions(): + with pytest.raises(TypeError, match=".* compare .*naive"): + assert dt("2023-01-01") > datetime(2022, 1, 1) + + with 
pytest.raises(TypeError, match=".* compare .*naive"): + assert datetime(2022, 1, 1) < dt("2023-01-01") + + assert dt("2023-01-01") > datetime(2022, 1, 1, tzinfo=UTC) + assert dt("2023-01-01") == datetime(2023, 1, 1, tzinfo=UTC) + assert dt("2023-01-01") == datetime(2023, 1, 1, tzinfo=UTC) + assert dt("2023-01-01T13:36") <= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") <= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") >= datetime(2023, 1, 1, 13, 36, tzinfo=UTC) + assert dt("2023-01-01T13:37") >= datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:36") < datetime(2023, 1, 1, 13, 37, tzinfo=UTC) + assert dt("2023-01-01T13:37") > datetime(2023, 1, 1, 13, 36, tzinfo=UTC) + assert dt("2023-01-02") != datetime(2023, 3, 4, tzinfo=UTC) + + if __name__ == "__main__": __import__("standalone_test").main(globals()) diff --git a/tests/test_json_packer.py b/tests/test_json_packer.py index ccbea02..8b6119d 100644 --- a/tests/test_json_packer.py +++ b/tests/test_json_packer.py @@ -1,5 +1,5 @@ import json -from datetime import datetime +from datetime import datetime, timezone import pytest @@ -9,7 +9,7 @@ def test_record_in_record(): packer = JsonRecordPacker() - dt = datetime.utcnow() + dt = datetime.now(timezone.utc) RecordA = RecordDescriptor( "test/record_a", diff --git a/tests/test_multi_timestamp.py b/tests/test_multi_timestamp.py index e6143d0..8d0acc2 100644 --- a/tests/test_multi_timestamp.py +++ b/tests/test_multi_timestamp.py @@ -1,8 +1,10 @@ -import datetime +from datetime import datetime, timedelta, timezone from flow.record import RecordDescriptor, iter_timestamped_records from flow.record.base import merge_record_descriptors +UTC = timezone.utc + def test_multi_timestamp(): TestRecord = RecordDescriptor( @@ -15,22 +17,22 @@ def test_multi_timestamp(): ) test_record = TestRecord( - ctime=datetime.datetime(2020, 1, 1, 1, 1, 1), - atime=datetime.datetime(2022, 11, 22, 13, 37, 37), + ctime=datetime(2020, 
1, 1, 1, 1, 1), + atime=datetime(2022, 11, 22, 13, 37, 37), data="test", ) ts_records = list(iter_timestamped_records(test_record)) for rec in ts_records: - assert rec.ctime == datetime.datetime(2020, 1, 1, 1, 1, 1) - assert rec.atime == datetime.datetime(2022, 11, 22, 13, 37, 37) + assert rec.ctime == datetime(2020, 1, 1, 1, 1, 1, tzinfo=UTC) + assert rec.atime == datetime(2022, 11, 22, 13, 37, 37, tzinfo=UTC) assert rec.data == "test" - assert ts_records[0].ts == datetime.datetime(2020, 1, 1, 1, 1, 1) + assert ts_records[0].ts == datetime(2020, 1, 1, 1, 1, 1, tzinfo=UTC) assert ts_records[0].ts_description == "ctime" - assert ts_records[1].ts == datetime.datetime(2022, 11, 22, 13, 37, 37) + assert ts_records[1].ts == datetime(2022, 11, 22, 13, 37, 37, tzinfo=UTC) assert ts_records[1].ts_description == "atime" @@ -58,7 +60,7 @@ def test_multi_timestamp_single_datetime(): ) test_record = TestRecord( - ctime=datetime.datetime(2020, 1, 1, 1, 1, 1), + ctime=datetime(2020, 1, 1, 1, 1, 1), data="test", ) ts_records = list(iter_timestamped_records(test_record)) @@ -77,7 +79,7 @@ def test_multi_timestamp_ts_fieldname(): ) test_record = TestRecord( - ts=datetime.datetime(2020, 1, 1, 1, 1, 1), + ts=datetime(2020, 1, 1, 1, 1, 1), data="test", ) ts_records = list(iter_timestamped_records(test_record)) @@ -95,7 +97,7 @@ def test_multi_timestamp_timezone(): ], ) - correct_ts = datetime.datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=datetime.timezone.utc) + correct_ts = datetime(2023, 12, 31, 13, 37, 1, 123456, tzinfo=UTC) ts_notations = [ correct_ts, @@ -127,8 +129,8 @@ def test_multi_timestamp_descriptor_cache(): merge_record_descriptors.cache_clear() for i in range(10): test_record = TestRecord( - ctime=datetime.datetime.utcnow() + datetime.timedelta(hours=69), - atime=datetime.datetime.utcnow() + datetime.timedelta(hours=420), + ctime=datetime.now(UTC) + timedelta(hours=69), + atime=datetime.now(UTC) + timedelta(hours=420), count=i, data=f"test {i}", ) diff --git 
a/tests/test_packer.py b/tests/test_packer.py index 5ef017a..8ee012c 100644 --- a/tests/test_packer.py +++ b/tests/test_packer.py @@ -1,4 +1,4 @@ -import datetime +from datetime import datetime, timezone import pytest @@ -7,6 +7,8 @@ from flow.record.fieldtypes import uri from flow.record.packer import RECORD_PACK_EXT_TYPE +UTC = timezone.utc + def test_uri_packing(): packer = RecordPacker() @@ -151,7 +153,7 @@ def test_dynamic_packer(): assert r.value == [1, True, b"b", "u"] assert isinstance(r.value, fieldtypes.stringlist) - now = datetime.datetime.utcnow() + now = datetime.now(UTC) t = TestRecord(now) data = packer.pack(t) r = packer.unpack(data) @@ -195,7 +197,7 @@ def test_pack_digest(): def test_record_in_record(): packer = RecordPacker() - dt = datetime.datetime.utcnow() + dt = datetime.now(UTC) RecordA = RecordDescriptor( "test/record_a", diff --git a/tests/test_rdump.py b/tests/test_rdump.py index 51664d0..035a3a0 100644 --- a/tests/test_rdump.py +++ b/tests/test_rdump.py @@ -4,10 +4,14 @@ import os import platform import subprocess +from datetime import timezone +from unittest import mock import pytest +import flow.record.fieldtypes from flow.record import RecordDescriptor, RecordReader, RecordWriter +from flow.record.fieldtypes import flow_record_tz from flow.record.tools import rdump @@ -509,3 +513,77 @@ def test_rdump_count_and_skip(tmp_path, capsysbinary, total_records, count, skip with RecordReader(subset_path) as reader: numbers = [rec.number for rec in reader] assert numbers == expected_numbers + + +@pytest.mark.parametrize( + "date_str,tz,expected_date_str", + [ + ("2023-08-02T22:28:06.12345+01:00", None, "2023-08-02 21:28:06.123450+00:00"), + ("2023-08-02T22:28:06.12345+01:00", "NONE", "2023-08-02 22:28:06.123450+01:00"), + ("2023-08-02T22:28:06.12345-08:00", "NONE", "2023-08-02 22:28:06.123450-08:00"), + ("2023-08-02T20:51:32.123456+00:00", "Europe/Amsterdam", "2023-08-02 22:51:32.123456+02:00"), + ("2023-08-02T20:51:32.123456+00:00", 
"America/New_York", "2023-08-02 16:51:32.123456-04:00"), + ], +) +@pytest.mark.parametrize( + "rdump_params", + [ + [], + ["--mode=csv"], + ["--mode=line"], + ], +) +def test_flow_record_tz_output(tmp_path, capsys, date_str, tz, expected_date_str, rdump_params): + TestRecord = RecordDescriptor( + "test/flow_record_tz", + [ + ("datetime", "stamp"), + ], + ) + with RecordWriter(tmp_path / "test.records") as writer: + writer.write(TestRecord(stamp=date_str)) + + env_dict = {} + if tz is not None: + env_dict["FLOW_RECORD_TZ"] = tz + + with mock.patch.dict(os.environ, env_dict, clear=True): + # Reconfigure DISPLAY_TZINFO + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") + + rdump.main([str(tmp_path / "test.records")] + rdump_params) + captured = capsys.readouterr() + assert captured.err == "" + assert expected_date_str in captured.out + + # restore DISPLAY_TZINFO just in case + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") + + +def test_flow_record_invalid_tz(tmp_path, capsys): + TestRecord = RecordDescriptor( + "test/flow_record_tz", + [ + ("datetime", "stamp"), + ], + ) + with RecordWriter(tmp_path / "test.records") as writer: + writer.write(TestRecord(stamp="2023-08-16T17:46:55.390691+02:00")) + + env_dict = { + "FLOW_RECORD_TZ": "invalid", + } + + with mock.patch.dict(os.environ, env_dict, clear=True): + # Reconfigure DISPLAY_TZINFO + with pytest.warns(UserWarning, match=".* falling back to timezone.utc"): + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz() + + rdump.main([str(tmp_path / "test.records")]) + captured = capsys.readouterr() + assert captured.err == "" + assert "2023-08-16 15:46:55.390691+00:00" in captured.out + assert flow.record.fieldtypes.DISPLAY_TZINFO == timezone.utc + + # restore DISPLAY_TZINFO just in case + flow.record.fieldtypes.DISPLAY_TZINFO = flow_record_tz(default_tz="UTC") diff --git a/tests/test_record_adapter.py b/tests/test_record_adapter.py index 7a51a28..39b87f9 100644 --- 
a/tests/test_record_adapter.py +++ b/tests/test_record_adapter.py @@ -203,7 +203,7 @@ def test_record_writer_stdout(): def test_record_adapter_archive(tmpdir): # archive some records, using "testing" as name writer = RecordWriter("archive://{}?name=testing".format(tmpdir)) - dt = datetime.datetime.utcnow() + dt = datetime.datetime.now(datetime.timezone.utc) count = 0 for rec in generate_records(): writer.write(rec) diff --git a/tests/test_regression.py b/tests/test_regression.py index c55d127..e48610c 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,10 +1,10 @@ import codecs -import datetime import json import os import pathlib import subprocess import sys +from datetime import datetime, timezone from unittest.mock import mock_open, patch import msgpack @@ -32,7 +32,7 @@ def test_datetime_serialization(): packer = RecordPacker() - now = datetime.datetime.utcnow() + now = datetime.now(timezone.utc) for tz in ["UTC", "Europe/Amsterdam"]: os.environ["TZ"] = tz diff --git a/tests/test_selector.py b/tests/test_selector.py index b393db7..ad5dde6 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone import pytest @@ -449,7 +449,7 @@ def test_record_in_records(): ) test_str = "this is a test" - dt = datetime.utcnow() + dt = datetime.now(timezone.utc) record_a = RecordA(some_dt=dt, field=test_str) record_b = RecordB(record=record_a, some_dt=dt) diff --git a/tox.ini b/tox.ini index a036145..a026ea9 100644 --- a/tox.ini +++ b/tox.ini @@ -49,7 +49,7 @@ deps = vermin commands = flake8 flow tests - vermin -t=3.7- --no-tips --lint flow tests + vermin -t=3.7- --no-tips --lint --exclude zoneinfo flow tests [flake8] max-line-length = 120 From ddea907e9d26f59446d8788a2c2a0653b0c1b597 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Thu, 24 Aug 2023 12:47:50 +0200 Subject: [PATCH 2/2] Fix backports.zoneinfo import for Python<3.9 (#80) --- 
flow/record/fieldtypes/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index d4d4e1a..3dcc402 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -13,7 +13,11 @@ from posixpath import basename, dirname from typing import Any, Optional, Tuple from urllib.parse import urlparse -from zoneinfo import ZoneInfo, ZoneInfoNotFoundError + +try: + from zoneinfo import ZoneInfo, ZoneInfoNotFoundError +except ImportError: + from backports.zoneinfo import ZoneInfo, ZoneInfoNotFoundError from flow.record.base import FieldType