Infer naming convention when converting objects to structs

Struct types support renaming of fields for encoding/decoding. A common use of this is to enforce a camelCase convention in the serialized format: ```python import msgspec class Example(msgspec.Struct, rename="camel"): field_one: int field_two: int x = Example(1, 2) print(msgspec.json.encode(x)) #> b'{"fieldOne":1,"fieldTwo":2} ``` Previously when converting an object to a struct we'd always use the renamed field names rather than the original names. This was true whether the input was a `dict`, a non-dict mapping, mapping, or an arbitrary object via attributes if `from_attributes=True`. The latter two inputs will rarely/never occur when coming from a serialization framework, but are more commonly used with database/ORM-like objects. In this case it's more likely that the *original* attribute names are more useful, as both the database and struct object representations are internal to the application (unlike the serialized names which may have to match some external convention like camelCase). We now infer the intended naming schem when a non-dict mapping or object is passed to `msgspec.convert` to convert to a `msgspec.Struct` type. The inference process is as follows: - The attribute names are tried first - If an attribute name is present in the input AND the attribute name doesn't match the renamed name, then attribute names are used exclusively for the remainder of the conversion process. - If an attribute name isn't present AND the attribute name doesn't match the renamed name, then the renamed name is tried. If the renamed name is present, then renamed names are used exclusively for the remainder of the conversion process. A key point here is that inputs may not mix attribute and renamed names together - the inference process will decide to use either only one or the other depending on what names are present. Using `Example` above: - An input with `field_one` and `field_two` would be valid - An input with `fieldOne` and `fieldTwo` would be valid - An input with `field_one` and `fieldTwo` would error saying `field_two` is missing. - An input with `fieldOne` and `field_two` would error saying `fieldTwo` is missing. The overhead of this inference process is low - at worst only one excess `getattr` call is made to determine whether to use the original or renamed names. To reiterate, this change only affects object (non-dict mapping or arbitrary object) inputs to `msgspec.convert` when converting to a `Struct` type. Inputs of other types like `dict` are still assumed to have come from a serialization protocol and will always use the renamed names.
jcrist · Jan 22, 2024 · de1a87b · de1a87b
1 parent b08046b
commit de1a87b
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 17 deletions.
diff --git a/msgspec/_core.c b/msgspec/_core.c
@@ -21263,11 +21263,51 @@ convert_object_to_struct(
     bool is_gc = MS_TYPE_IS_GC(struct_type);
     bool should_untrack = is_gc;
 
+    /* If no fields are renamed we only have one fields tuple to choose from */
+    PyObject *fields = NULL;
+    if (struct_type->struct_fields == struct_type->struct_encode_fields) {
+        fields = struct_type->struct_fields;
+    }
+
     for (Py_ssize_t i = 0; i < nfields; i++) {
-        PyObject *field = PyTuple_GET_ITEM(struct_type->struct_encode_fields, i);
-        PyObject *attr = getter(obj, field);
-        PyObject *val;
-        if (attr == NULL) {
+        PyObject *field, *attr, *val;
+
+        if (MS_LIKELY(fields != NULL)) {
+            /* fields tuple already determined, just get the next field name */
+            field = PyTuple_GET_ITEM(fields, i);
+            attr = getter(obj, field);
+        }
+        else {
+            /* fields tuple undetermined. Try the attribute name first */
+            PyObject *encode_field;
+            field = PyTuple_GET_ITEM(struct_type->struct_fields, i);
+            encode_field = PyTuple_GET_ITEM(struct_type->struct_encode_fields, i);
+            attr = getter(obj, field);
+            if (field != encode_field) {
+                /* The field _was_ renamed */
+                if (attr != NULL) {
+                    /* Got a match, lock-in using attribute names */
+                    fields = struct_type->struct_fields;
+                }
+                else {
+                    /* No match. Try using the renamed name */
+                    PyErr_Clear();
+                    attr = getter(obj, encode_field);
+                    if (attr != NULL) {
+                        /* Got a match, lock-in using renamed names */
+                        field = encode_field;
+                        fields = struct_type->struct_encode_fields;
+                    }
+                }
+            }
+        }
+
+        if (attr != NULL) {
+            PathNode field_path = {path, PATH_STR, field};
+            val = convert(self, attr, info->types[i], &field_path);
+            Py_DECREF(attr);
+        }
+        else {
             PyErr_Clear();
             PyObject *default_val = NULL;
             if (MS_LIKELY(i >= (nfields - ndefaults))) {
@@ -21284,11 +21324,6 @@ convert_object_to_struct(
             }
             val = get_default(default_val);
         }
-        else {
-            PathNode field_path = {path, PATH_STR, field};
-            val = convert(self, attr, info->types[i], &field_path);
-            Py_DECREF(attr);
-        }
         if (val == NULL) goto error;
         Struct_set_index(out, i, val);
         if (should_untrack) {

diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -1701,16 +1701,16 @@ class Ex2(Struct):
 
 
 class TestStruct:
-    class Account(Struct):
+    class Account(Struct, kw_only=True):
         first: str
         last: str
-        age: int
         verified: bool = False
+        age: int
 
     @mapcls_and_from_attributes
     def test_struct(self, mapcls, from_attributes):
         msg = mapcls(first="alice", last="munro", age=91, verified=True)
-        sol = self.Account("alice", "munro", 91, True)
+        sol = self.Account(first="alice", last="munro", verified=True, age=91)
         res = convert(msg, self.Account, from_attributes=from_attributes)
         assert res == sol
 
@@ -1754,12 +1754,58 @@ class Ex(Struct):
 
         assert convert(msg, Ex, from_attributes=True) == Ex(1)
 
-    def test_from_attributes_option_uses_renamed_fields(self):
+    @pytest.mark.parametrize("mapcls", [GetAttrObj, GetItemObj])
+    def test_object_to_struct_with_renamed_fields(self, mapcls):
         class Ex(Struct, rename="camel"):
-            field_one: int
+            fa: int
+            f_b: int
+            fc: int
+            f_d: int
+
+        sol = Ex(1, 2, 3, 4)
+
+        # Use attribute names
+        msg = mapcls(fa=1, f_b=2, fc=3, f_d=4)
+        assert convert(msg, Ex, from_attributes=True) == sol
+
+        # Use renamed names
+        msg = mapcls(fa=1, fB=2, fc=3, fD=4)
+        assert convert(msg, Ex, from_attributes=True) == sol
 
-        msg = GetAttrObj(fieldOne=2)
-        assert convert(msg, Ex, from_attributes=True) == Ex(2)
+        # Priority to attribute names
+        msg = mapcls(fa=1, f_b=2, fB=100, fc=3, f_d=4, fD=100)
+        assert convert(msg, Ex, from_attributes=True) == sol
+
+        # Don't allow renamed names if determined to be attributes
+        msg = mapcls(fa=1, f_b=2, fc=3, fD=4)
+        with pytest.raises(ValidationError, match="missing required field `f_d`"):
+            convert(msg, Ex, from_attributes=True)
+
+        # Don't allow attributes if determined to be renamed names
+        msg = mapcls(fa=1, fB=2, fc=3, f_d=4)
+        with pytest.raises(ValidationError, match="missing required field `fD`"):
+            convert(msg, Ex, from_attributes=True)
+
+        # Errors use attribute name if using attributes
+        msg = mapcls(fa=1, f_b=2, fc=3, f_d="bad")
+        with pytest.raises(
+            ValidationError, match=r"Expected `int`, got `str` - at `\$.f_d`"
+        ):
+            convert(msg, Ex, from_attributes=True)
+
+        # Errors use renamed name if using renamed names
+        msg = mapcls(fa=1, fB=2, fc=3, fD="bad")
+        with pytest.raises(
+            ValidationError, match=r"Expected `int`, got `str` - at `\$.fD`"
+        ):
+            convert(msg, Ex, from_attributes=True)
+
+        # Errors use attribute name if undecided
+        msg = mapcls(fa="bad")
+        with pytest.raises(
+            ValidationError, match=r"Expected `int`, got `str` - at `\$.fa`"
+        ):
+            convert(msg, Ex, from_attributes=True)
 
     @pytest.mark.parametrize("forbid_unknown_fields", [False, True])
     @mapcls_and_from_attributes
@@ -1780,7 +1826,7 @@ class Ex(Struct, forbid_unknown_fields=forbid_unknown_fields):
     def test_struct_defaults_missing_fields(self, mapcls, from_attributes):
         msg = mapcls(first="alice", last="munro", age=91)
         res = convert(msg, self.Account, from_attributes=from_attributes)
-        assert res == self.Account("alice", "munro", 91)
+        assert res == self.Account(first="alice", last="munro", age=91)
 
     @mapcls_from_attributes_and_array_like
     def test_struct_gc_maybe_untracked_on_decode(