diff --git a/pytools/__init__.py b/pytools/__init__.py index 0292680ba9591093f6a2c56815c50ce56df999de..f1f1ebf31a3fee12fc99e36b3b104b1be326127a 100644 --- a/pytools/__init__.py +++ b/pytools/__init__.py @@ -169,6 +169,11 @@ Backports of newer Python functionality .. autofunction:: resolve_name +Hashing +------- + +.. autofunction:: unordered_hash + Type Variables Used ------------------- @@ -2616,6 +2621,51 @@ def resolve_name(name): # }}} +# {{{ unordered_hash + +def unordered_hash(hash_instance, iterable, hash_constructor=None): + """Using a hash algorithm given by the parameter-less constructor + *hash_constructor*, return a hash object whose internal state + depends on the entries of *iterable*, but not their order. If *hash* + is the instance returned by evaluating ``hash_constructor()``, then + each entry *i* of the iterable must permit ``hash.update(i)`` to + succeed. An example of *hash_constructor* is ``hashlib.sha256`` + from :mod:`hashlib`. ``hash.digest_size`` must also be defined. + If *hash_constructor* is not provided, ``hash_instance.name`` is + used to deduce it. + + :returns: the updated *hash_instance*. + + .. warning:: + + The construction used in this function is likely not cryptographically + secure. Do not use this function in a security-relevant context. + + .. versionadded:: 2021.2 + """ + + if hash_constructor is None: + from functools import partial + import hashlib + hash_constructor = partial(hashlib.new, hash_instance.name) + + h_int = 0 + for i in iterable: + h_i = hash_constructor() + h_i.update(i) + # Using sys.byteorder (for efficiency) here technically makes the + # hash system-dependent (which it should not be), however the + # effect of this is undone by the to_bytes conversion below, while + # left invariant by the intervening XOR operations (which do not + # mix adjacent bits). 
+ h_int = h_int ^ int.from_bytes(h_i.digest(), sys.byteorder) + + hash_instance.update(h_int.to_bytes(hash_instance.digest_size, sys.byteorder)) + return hash_instance + +# }}} + + def _test(): import doctest doctest.testmod() diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index c33f2404ebff71a9cc825f38dca0bef678f0227f..c3a7b1da76cd0d7973715ba093ff227a9d895e40 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -173,7 +173,37 @@ class ItemDirManager(CleanupBase): # {{{ key generation class KeyBuilder: + """A (stateless) object that computes hashes of objects fed to it. Subclassing + this class permits customizing the computation of hash keys. + + .. automethod:: __call__ + .. automethod:: rec + .. staticmethod:: new_hash() + + Return a new hash instance following the protocol of the ones + from :mod:`hashlib`. This will permit switching to different + hash algorithms in the future. Subclasses are expected to use + this to create new hashes. Not doing so is deprecated and + may stop working as early as 2022. + + .. versionadded:: 2021.2 + """ + + # this exists so that we can (conceivably) switch algorithms at some point + # down the road + new_hash = hashlib.sha256 + def rec(self, key_hash, key): + """ + :arg key_hash: the hash object to be updated with the hash of *key*. + :arg key: the (immutable) Python object to be hashed. + :returns: the updated *key_hash* + + .. versionchanged:: 2021.2 + + Now returns the updated *key_hash*. 
+ """ + digest = None try: @@ -187,7 +217,7 @@ class KeyBuilder: except AttributeError: pass else: - inner_key_hash = hashlib.sha256() + inner_key_hash = self.new_hash() method(inner_key_hash, self) digest = inner_key_hash.digest() @@ -205,7 +235,7 @@ class KeyBuilder: method = self.update_for_specific_dtype if method is not None: - inner_key_hash = hashlib.sha256() + inner_key_hash = self.new_hash() method(inner_key_hash, key) digest = inner_key_hash.digest() @@ -222,9 +252,10 @@ class KeyBuilder: pass key_hash.update(digest) + return key_hash def __call__(self, key): - key_hash = hashlib.sha256() + key_hash = self.new_hash() self.rec(key_hash, key) return key_hash.hexdigest() @@ -232,14 +263,21 @@ class KeyBuilder: @staticmethod def update_for_int(key_hash, key): - key_hash.update(str(key).encode("utf8")) + sz = 8 + while True: + try: + key_hash.update(key.to_bytes(sz, byteorder="little", signed=True)) + return + except OverflowError: + sz *= 2 - update_for_long = update_for_int - update_for_bool = update_for_int + @staticmethod + def update_for_bool(key_hash, key): + key_hash.update(str(key).encode("utf8")) @staticmethod def update_for_float(key_hash, key): - key_hash.update(repr(key).encode("utf8")) + key_hash.update(key.hex().encode("utf8")) @staticmethod def update_for_str(key_hash, key): @@ -254,8 +292,11 @@ class KeyBuilder: self.rec(key_hash, obj_i) def update_for_frozenset(self, key_hash, key): - for set_key in sorted(key): - self.rec(key_hash, set_key) + from pytools import unordered_hash + + unordered_hash( + key_hash, + (self.rec(self.new_hash(), key_i).digest() for key_i in key)) @staticmethod def update_for_NoneType(key_hash, key): # noqa @@ -426,7 +467,7 @@ class _PersistentDictBase: import appdirs container_dir = join( appdirs.user_cache_dir("pytools", "pytools"), - "pdict-v3-{}-py{}".format( + "pdict-v4-{}-py{}".format( identifier, ".".join(str(i) for i in sys.version_info))) diff --git a/pytools/version.py b/pytools/version.py index 
8b36d4f9e9fce9d4f7b324fbb69d561f854d7a36..b431cafddb6c25c285c9046eaf4bbf749911478e 100644 --- a/pytools/version.py +++ b/pytools/version.py @@ -1,3 +1,3 @@ -VERSION = (2021, 1, 2) +VERSION = (2021, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/test/test_persistent_dict.py b/test/test_persistent_dict.py index bf1fd5433bb88524fdc75bdbf5c62497c3868563..050d9d951092d5abb0f136e5241427d780e3a909 100644 --- a/test/test_persistent_dict.py +++ b/test/test_persistent_dict.py @@ -58,7 +58,8 @@ def test_persistent_dict_storage_and_lookup(): for i in range(n)) keys = [ - (randrange(2000), rand_str(), None, SomeTag(rand_str())) + (randrange(2000)-1000, rand_str(), None, SomeTag(rand_str()), + frozenset({"abc", 123})) for i in range(20)] values = [randrange(2000) for i in range(20)] diff --git a/test/test_pytools.py b/test/test_pytools.py index 87e62a13a83c486e380b29466367a3a37153c7b8..356e47c34e4bb12016c9b262b6d575181dbaca32 100644 --- a/test/test_pytools.py +++ b/test/test_pytools.py @@ -1,3 +1,26 @@ +__copyright__ = "Copyright (C) 2009-2021 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + import sys import pytest @@ -412,6 +435,28 @@ def test_tag(): t4.without_tags(red_ribbon) +def test_unordered_hash(): + import random + import hashlib + + # FIXME: Use randbytes once >=3.9 is OK + lst = [bytes([random.randrange(256) for _ in range(20)]) + for _ in range(200)] + lorig = lst[:] + random.shuffle(lst) + + from pytools import unordered_hash + assert (unordered_hash(hashlib.sha256(), lorig).digest() + == unordered_hash(hashlib.sha256(), lst).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + == unordered_hash(hashlib.sha256(), lorig).digest()) + assert (unordered_hash(hashlib.sha256(), lorig).digest() + != unordered_hash(hashlib.sha256(), lorig[:-1]).digest()) + lst[0] = b"aksdjfla;sdfjafd" + assert (unordered_hash(hashlib.sha256(), lorig).digest() + != unordered_hash(hashlib.sha256(), lst).digest()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])