From 9e597db068989e5d35dc58e3a4877838f505b52f Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Fri, 11 Oct 2024 12:23:05 +0200 Subject: [PATCH] hash get_object_state, use dir for native types, fix __slots__ (#208) Fix #207 See #207 for some discussion. Note that this slightly changes the behavior, but only for cases which were basically broken before. E.g., for `functools.partial`, before this change, none of the relevant args (`func`, `args`, `keywords`) were ever part of the hash. So this change should not really break anything. The same is also true for cases where there was both a `__dict__` and `__slots__`. With Python <=3.10, it would have used only the `__dict__`, and very likely, that was just empty, and does not contain any of the relevant things (which are very likely all in the `__slots__`), so it was totally broken and basically unusable. With Python >=3.11, there is a default `__getstate__` now, which would already do the correct thing, at least for "normal" (non-native) types, i.e. merge `__dict__` and `__slots__`, i.e. our existing `get_object_state` was correct for those cases. With this change, Python <=3.10 is also correct now. In any case, we check through `dir` now, and we check for member descriptors, which should also cover such native types (like `functools.partial`). --- sisyphus/hash.py | 73 +++++++++++++++++++++++++++++++++++++----- tests/hash_unittest.py | 21 ++++++++++++ 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/sisyphus/hash.py b/sisyphus/hash.py index dd5893d..752963d 100644 --- a/sisyphus/hash.py +++ b/sisyphus/hash.py @@ -1,6 +1,7 @@ import enum import hashlib -from inspect import isclass, isfunction +import pathlib +from inspect import isclass, isfunction, ismemberdescriptor def md5(obj): @@ -43,6 +44,13 @@ def get_object_state(obj): Comment: Maybe obj.__reduce__() is a better idea? is it stable for hashing? """ + if isinstance(obj, pathlib.PurePath): + # pathlib paths have a somewhat technical internal state + # ('_drv', '_root', '_parts', '_str', '_hash', '_pparts', '_cached_cparts'), + # so we don't want to rely on this, but instead just use the string representation as state. + # https://github.com/rwth-i6/sisyphus/pull/208#issuecomment-2405560718 + return str(obj) + if hasattr(obj, "__getnewargs_ex__"): args = obj.__getnewargs_ex__() elif hasattr(obj, "__getnewargs__"): @@ -52,15 +60,19 @@ def get_object_state(obj): if hasattr(obj, "__sis_state__"): state = obj.__sis_state__() - elif hasattr(obj, "__getstate__"): + # Note: Since Python 3.11, there is a default object.__getstate__. + # However, this default object.__getstate__ is not correct for some native types, e.g. _functools.partial. + # https://github.com/rwth-i6/sisyphus/issues/207 + # https://github.com/python/cpython/issues/125094 + # Thus, only use __getstate__ if it is not the default object.__getstate__. + elif hasattr(obj, "__getstate__") and obj.__class__.__getstate__ is not getattr(object, "__getstate__", None): state = obj.__getstate__() - elif hasattr(obj, "__dict__"): - state = obj.__dict__ - elif hasattr(obj, "__slots__"): - state = {k: getattr(obj, k) for k in obj.__slots__ if hasattr(obj, k)} else: - assert args is not None, "Failed to get object state of: %s" % repr(obj) - state = None + state = _getmembers(obj) + if not state and not hasattr(obj, "__dict__") and not hasattr(obj, "__slots__"): + # Keep compatibility with old behavior. + assert args is not None, "Failed to get object state of: %s" % repr(obj) + state = None if isinstance(obj, enum.Enum): assert isinstance(state, dict) @@ -129,3 +141,48 @@ def _obj_type_qualname(obj) -> bytes: # In Python >=3.11, keep hash same as in Python <=3.10, https://github.com/rwth-i6/sisyphus/issues/188 return b"EnumMeta" return type(obj).__qualname__.encode() + + +def _getmembers(obj): + res = {} + if hasattr(obj, "__dict__"): + res.update(obj.__dict__) + if hasattr(obj, "__slots__"): + for key in obj.__slots__: + try: + res[key] = getattr(obj, key) + except AttributeError: + pass + # Note, there are cases where `__dict__` or `__slots__` don't contain all attributes, + # e.g. for some native types, e.g. _functools.partial. + # (https://github.com/rwth-i6/sisyphus/issues/207) + # `dir()` usually still lists those attributes. + # However, to keep the behavior as before, we only want to return the object attributes here, + # not the class attributes. + cls_dict = {} + for cls in reversed(type(obj).__mro__): + if getattr(cls, "__dict__", None): + cls_dict.update(cls.__dict__) + for key in dir(obj): + if key.startswith("__"): + continue + if key in res: + continue + # Get class attribute first, to maybe skip descriptors. + if key in cls_dict: + cls_value = cls_dict[key] + if hasattr(cls_value, "__get__"): # descriptor + # descriptor are e.g. properties, bound methods, etc. We don't want to have those. + # But member descriptors are usually for slots (even for native types without __slots__), + # so that is why we keep them. + if not ismemberdescriptor(cls_value): + continue + try: + value = getattr(obj, key) + except AttributeError: + # dir might not be reliable. just skip this + continue + if key in cls_dict and cls_dict[key] is value: + continue # this is a class attribute + res[key] = value + return res diff --git a/tests/hash_unittest.py b/tests/hash_unittest.py index a653e6c..ad804f2 100644 --- a/tests/hash_unittest.py +++ b/tests/hash_unittest.py @@ -32,6 +32,27 @@ def test_enum(self): + b" (tuple, (str, '_name_'), (str, 'Entry1')), (tuple, (str, '_value_'), (int, 1))))", ) + def test_functools_partial(self): + from functools import partial + + obj = partial(int, 42) + self.assertEqual( + sis_hash_helper(obj), + ( + b"(partial, (dict," + b" (tuple, (str, 'args'), (tuple, (int, 42)))," + b" (tuple, (str, 'func'), (type," + b" (tuple, (str, 'builtins'), (str, 'int'))))," + b" (tuple, (str, 'keywords'), (dict))))" + ), + ) + + def test_pathlib_Path(self): + from pathlib import Path + + obj = Path("/etc/passwd") + self.assertEqual(sis_hash_helper(obj), b"(PosixPath, (str, '/etc/passwd'))") + if __name__ == "__main__": unittest.main()