From a10df97a7d9dc59782f00e2b72fc9844feb0a6df Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Sat, 26 Aug 2023 15:06:56 -0500 Subject: [PATCH] Relaxed datetime/time parsing Previously we strictly followed the RFC3339 format when parsing datetime and time objects from strings. We now support a few common ISO8601 compatible relaxations: - A `:` isn't required as part of the timezone component in both datetime and time strings (`2022-01-02T03:04:05.678+0102` and `2022-01-02T03:04:05.678+01:02` are treated the same). - A ` ` may be used instead of `T`/`t` as a separator between date and time components when parsing datetime strings. When encoding datetime/time objects we still strictly follow RFC3339. This eases integrating msgspec with other systems that don't strictly follow RFC3339. --- msgspec/_core.c | 23 +++++++++++++---------- tests/test_common.py | 19 +++++++++++++++++++ tests/test_json.py | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/msgspec/_core.c b/msgspec/_core.c index 67591fbc..b426b12b 100644 --- a/msgspec/_core.c +++ b/msgspec/_core.c @@ -10078,11 +10078,12 @@ ms_decode_time(const char *buf, Py_ssize_t size, TypeNode *type, PathNode *path) goto invalid; } - /* Explicit offset requires exactly 5 bytes left */ - if (buf_end - buf != 5) goto invalid; - + if (buf_end - buf < 3) goto invalid; if ((buf = ms_read_fixint(buf, 2, &offset_hour)) == NULL) goto invalid; - if (*buf++ != ':') goto invalid; + /* RFC3339 requires a ':' separator, ISO8601 doesn't. We support + * either */ + if (*buf == ':') buf++; + if (buf_end - buf != 2) goto invalid; if ((buf = ms_read_fixint(buf, 2, &offset_min)) == NULL) goto invalid; if (offset_hour > 23 || offset_min > 59) goto invalid; offset *= (offset_hour * 60 + offset_min); @@ -10178,9 +10179,10 @@ ms_decode_datetime_from_str( if (*buf++ != '-') goto invalid; if ((buf = ms_read_fixint(buf, 2, &day)) == NULL) goto invalid; - /* Date/time separator can be T or t */ + /* RFC3339 date/time separator can be T or t. We also support ' ', which is + * ISO8601 compatible. */ c = *buf++; - if (!(c == 'T' || c == 't')) goto invalid; + if (!(c == 'T' || c == 't' || c == ' ')) goto invalid; /* Parse time */ if ((buf = ms_read_fixint(buf, 2, &hour)) == NULL) goto invalid; @@ -10251,11 +10253,12 @@ ms_decode_datetime_from_str( goto invalid; } - /* Explicit offset requires exactly 5 bytes left */ - if (buf_end - buf != 5) goto invalid; - + if (buf_end - buf < 3) goto invalid; if ((buf = ms_read_fixint(buf, 2, &offset_hour)) == NULL) goto invalid; - if (*buf++ != ':') goto invalid; + /* RFC3339 requires a ':' separator, ISO8601 doesn't. We support + * either */ + if (*buf == ':') buf++; + if (buf_end - buf != 2) goto invalid; if ((buf = ms_read_fixint(buf, 2, &offset_min)) == NULL) goto invalid; if (offset_hour > 23 || offset_min > 59) goto invalid; offset *= (offset_hour * 60 + offset_min); diff --git a/tests/test_common.py b/tests/test_common.py index 903b0df3..0d71a380 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -3211,6 +3211,20 @@ def test_decode_time_not_case_sensitive(self, proto, z): res = proto.decode(proto.encode(f"04:05:06.000007{z}"), type=datetime.time) assert res == sol + @pytest.mark.parametrize( + "lax, strict", + [ + ("03:04:05+0102", "03:04:05+01:02"), + ("03:04:05-0102", "03:04:05-01:02"), + ], + ) + def test_decode_time_rfc3339_relaxed(self, lax, strict, proto): + """msgspec supports a few relaxations of the RFC3339 format.""" + sol = datetime.time.fromisoformat(strict) + msg = proto.encode(lax) + res = proto.decode(msg, type=datetime.time) + assert res == sol + @pytest.mark.parametrize( "t, sol", [ @@ -3258,6 +3272,8 @@ def test_decode_time_nanos(self, proto, t, sol): "01:02:3.0000004Z", "01:02:03.0000004+5:06", "01:02:03.0000004+05:6", + "01:02:03.0000004+056", + "01:02:03.0000004+05600", # Trailing data "01:02:030", "01:02:03a", @@ -3265,6 +3281,7 @@ def test_decode_time_nanos(self, proto, t, sol): "01:02:03.0a", "01:02:03.0000004a", "01:02:03.0000004+00:000", + "01:02:03.0000004+00000", "01:02:03.0000004Z0", # Truncated "01:02:3", @@ -3280,6 +3297,8 @@ def test_decode_time_nanos(self, proto, t, sol): "01:02:03.00a+05:06", "01:02:03.004+0a:06", "01:02:03.004+05:0a", + "01:02:03.004+0a06", + "01:02:03.004+050a", # Hour out of range "24:02:03.004", # Minute out of range diff --git a/tests/test_json.py b/tests/test_json.py index 8ab56c89..ad27ec2b 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -984,6 +984,21 @@ def test_decode_datetime_nanos(self, msg, sol): res = msgspec.json.decode(msg, type=datetime.datetime) assert res == sol + @pytest.mark.parametrize( + "lax, strict", + [ + ("2022-01-02T03:04:05+0102", "2022-01-02T03:04:05+01:02"), + ("2022-01-02T03:04:05-0102", "2022-01-02T03:04:05-01:02"), + ("2022-01-02 03:04:05", "2022-01-02T03:04:05"), + ], + ) + def test_decode_datetime_rfc3339_relaxed(self, lax, strict): + """msgspec supports a few relaxations of the RFC3339 format.""" + sol = datetime.datetime.fromisoformat(strict) + msg = msgspec.json.encode(lax) + res = msgspec.json.decode(msg, type=datetime.datetime) + assert res == sol + @pytest.mark.parametrize( "s", [ @@ -996,8 +1011,10 @@ def test_decode_datetime_nanos(self, msg, sol): b'"0001-02-03T04:05:6.000007Z"', b'"0001-02-03T04:05:06.000007+0:00"', b'"0001-02-03T04:05:06.000007+00:0"', + b'"0001-02-03T04:05:06.000007+000"', # Trailing data b'"0001-02-03T04:05:06.000007+00:000"', + b'"0001-02-03T04:05:06.000007+00000"', b'"0001-02-03T04:05:06.000007Z0"', b'"0001-02-03T04:05:06a"', b'"0001-02-03T04:05:06.000007a"', @@ -1019,6 +1036,8 @@ def test_decode_datetime_nanos(self, msg, sol): b'"0001-02-03T04:05:06.000007a"', b'"0001-02-03T04:05:06.000007+0a:00"', b'"0001-02-03T04:05:06.000007+00:0a"', + b'"0001-02-03T04:05:06.000007+0a00"', + b'"0001-02-03T04:05:06.000007+000a"', # Year out of range b'"0000-02-03T04:05:06.000007Z"', # Month out of range