Skip to content

Commit

Permalink
Speed up datetime parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
yunzheng committed Sep 4, 2023
1 parent 6358ba3 commit 14f71d6
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 20 deletions.
61 changes: 41 additions & 20 deletions flow/record/fieldtypes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,9 @@
from flow.record.base import FieldType

# Matches any run of one or more consecutive forward slashes and/or backslashes.
RE_NORMALIZE_PATH = re.compile(r"[\\/]+")
# Matches a fractional-seconds part longer than six digits; group 1 holds the dot plus the first six digits.
RE_STRIP_NANOSECS = re.compile(r"(\.\d{6})\d+")
# True when a plain string literal is an instance of ``str``.
NATIVE_UNICODE = isinstance("", str)

# Short alias for ``timezone.utc``.
UTC = timezone.utc
# strptime()-style format for an ISO 8601 timestamp with a UTC offset and no fractional seconds.
ISO_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
# Same as ISO_FORMAT, but with a fractional-seconds field (``%f``) after the seconds.
ISO_FORMAT_WITH_MS = "%Y-%m-%dT%H:%M:%S.%f%z"

# True when the running interpreter is Python 3.11 or newer.
PY_311 = sys.version_info >= (3, 11, 0)

Expand Down Expand Up @@ -268,23 +265,47 @@ def __new__(cls, *args, **kwargs):
if isinstance(arg, bytes_type):
arg = arg.decode("utf-8")
if isinstance(arg, string_type):
# I expect ISO 8601 format e.g. datetime.isoformat()
# String constructor is used for example in JsonRecordAdapter
# Note: ISO 8601 is fully implemented in fromisoformat() from Python 3.11 and onwards.
# Until then, we need to manually detect timezone info and handle it.
if not PY_311 and any(z in arg[19:] for z in ["Z", "+", "-"]):
spec = ISO_FORMAT_WITH_MS if "." in arg[19:] else ISO_FORMAT
try:
obj = cls.strptime(arg, spec)
except ValueError:
# Sometimes nanoseconds need to be stripped
obj = cls.strptime(re.sub(RE_STRIP_NANOSECS, "\\1", arg), spec)
else:
try:
obj = cls.fromisoformat(arg)
except ValueError:
# Sometimes nanoseconds need to be stripped
obj = cls.fromisoformat(re.sub(RE_STRIP_NANOSECS, "\\1", arg))
# If we are on Python 3.11 or newer, we can use fromisoformat() to parse the string (fast path)
#
# Else we need to do some manual parsing to fix some issues with the string format:
# - Python 3.10 and older do not support nanoseconds in fromisoformat()
# - Python 3.10 and older do not support Z as timezone info in fromisoformat()
# - Python 3.10 and older do not support +0200 as timezone info in fromisoformat()
# - Python 3.10 and older require "T" between date and time in fromisoformat()
#
# There are other incompatibilities, but we don't care about those for now.
if not PY_311:
# Convert Z to +00:00 so that fromisoformat() works correctly on Python 3.10 and older
if arg.endswith("Z"):
arg = arg[:-1] + "+00:00"

# Find timezone info after the date part. The date part has two possible formats, so we skip the length of the longest one (10 characters):
#
# YYYYmmdd length: 8
# YYYY-mm-dd length: 10
tstr = arg
tzstr = ""
if tzpos := arg[10:].find("+") + 1 or arg[10:].find("-") + 1:
tzstr = arg[10 + tzpos-1 :]
tstr = arg[:10 + tzpos-1]

# Convert +0200 to +02:00 so that fromisoformat() works correctly on Python 3.10 and older
if len(tzstr) == 5 and tzstr[3] != ":":
tzstr = tzstr[:3] + ":" + tzstr[3:]

# Python 3.10 and older do not support nanoseconds in fromisoformat()
if microsecond_pos := arg.rfind(".") + 1:
microseconds = arg[microsecond_pos:]
tstr = arg[:microsecond_pos - 1]
if tzpos := (microseconds.find("+") + 1 or microseconds.find("-") + 1):
microseconds = microseconds[: tzpos - 1]
# Pad microseconds to 6 digits, truncate if longer
microseconds = microseconds.ljust(6, "0")[:6]
arg = tstr + "." + microseconds + tzstr
else:
arg = tstr + tzstr

obj = cls.fromisoformat(arg)
elif isinstance(arg, (int, float_type)):
obj = cls.fromtimestamp(arg, UTC)
elif isinstance(arg, (_dt,)):
Expand Down
2 changes: 2 additions & 0 deletions tests/test_fieldtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,8 @@ def test_datetime():
("2006-11-10T14:29:55.585192699999999", datetime(2006, 11, 10, 14, 29, 55, 585192, tzinfo=UTC)),
(datetime(2023, 1, 1, tzinfo=UTC), datetime(2023, 1, 1, tzinfo=UTC)),
(0, datetime(1970, 1, 1, 0, 0, tzinfo=UTC)),
("2023-09-01 13:37:12.345678+09:00", datetime(2023, 9, 1, 4, 37, 12, 345678, tzinfo=UTC)),
("2006-11-10T14:29:55.585192699999999-07:00", datetime(2006, 11, 10, 21, 29, 55, 585192, tzinfo=UTC)),
],
)
def test_datetime_formats(tmp_path, value, expected_dt):
Expand Down

0 comments on commit 14f71d6

Please sign in to comment.