From a2c5c53f3cd902d0a77bca7f7376e1c470d4e6ac Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Wed, 3 Jul 2024 15:08:11 -0500 Subject: [PATCH] Use siphash for persistent hashing --- .pylintrc-local.yml | 1 + pyproject.toml | 1 + pytools/persistent_dict.py | 11 +++++--- pytools/test/test_persistent_dict.py | 38 +++++++++++----------------- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml index 8cb3aa7..e045e85 100644 --- a/.pylintrc-local.yml +++ b/.pylintrc-local.yml @@ -4,3 +4,4 @@ - arg: ignored-modules val: - matplotlib + - siphash24 diff --git a/pyproject.toml b/pyproject.toml index adf94b1..d0ce066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ classifiers = [ dependencies = [ "platformdirs>=2.2", "typing-extensions>=4; python_version<'3.11'", + "siphash24>=1.6", ] [project.optional-dependencies] diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py index c35476c..a4756d6 100644 --- a/pytools/persistent_dict.py +++ b/pytools/persistent_dict.py @@ -30,7 +30,6 @@ THE SOFTWARE. """ -import hashlib import logging import os import pickle @@ -52,6 +51,8 @@ from typing import ( cast, ) +from siphash24 import siphash13 + if TYPE_CHECKING: from _typeshed import ReadableBuffer @@ -160,7 +161,7 @@ class KeyBuilder: # this exists so that we can (conceivably) switch algorithms at some point # down the road - new_hash: Callable[..., Hash] = hashlib.sha256 + new_hash: Callable[..., Hash] = siphash13 def rec(self, key_hash: Hash, key: Any) -> Hash: """ @@ -301,7 +302,8 @@ class KeyBuilder: unordered_hash( key_hash, - (self.rec(self.new_hash(), key_i).digest() for key_i in key)) + (self.rec(self.new_hash(), key_i).digest() for key_i in key), + hash_constructor=self.new_hash) update_for_FrozenOrderedSet = update_for_frozenset # noqa: N815 @@ -351,7 +353,8 @@ class KeyBuilder: unordered_hash( key_hash, - (self.rec(self.new_hash(), (k, v)).digest() for k, v in key.items())) + (self.rec(self.new_hash(), (k, v)).digest() for k, v in key.items()), + hash_constructor=self.new_hash) update_for_immutabledict = update_for_frozendict update_for_constantdict = update_for_frozendict diff --git a/pytools/test/test_persistent_dict.py b/pytools/test/test_persistent_dict.py index c4d3639..5239884 100644 --- a/pytools/test/test_persistent_dict.py +++ b/pytools/test/test_persistent_dict.py @@ -548,17 +548,14 @@ def test_class_hashing() -> None: assert keyb(TagClass) != keyb(TagClass2) assert keyb(TagClass()) != keyb(TagClass2()) - assert keyb(TagClass()) == \ - "f5697a96dde0083e31a290b54ee7a5640b2bb8eb6d18e9c7ee89228b015a6131" - assert keyb(TagClass2) == \ - "0833645765e32e7fb4a586614d0e345878eba50199ed2d8e963b28f797fd6e29" + assert keyb(TagClass()) == "7b3e4e66503438f6" + assert keyb(TagClass2) == "690b86bbf51aad83" @tag_dataclass class TagClass3(Tag): s: str - assert (keyb(TagClass3("foo")) # type: ignore[call-arg] - == "c6521f4157ed530d04e956b7046db85e038c120b047cd1b848340d81f9fd8b4a") + assert (keyb(TagClass3("foo")) == "cf1a33652cc75b9c") # type: ignore[call-arg] def test_dataclass_hashing() -> None: @@ -569,8 +566,7 @@ def test_dataclass_hashing() -> None: name: str value: int - assert keyb(MyDC("hi", 1)) == \ - "2ba6363c3b98f1cc2209bd57388368b3efe3074e3764eee30fbcf15946efb802" + assert keyb(MyDC("hi", 1)) == "d1a1079f1c10aa4f" assert keyb(MyDC("hi", 1)) == keyb(MyDC("hi", 1)) assert keyb(MyDC("hi", 1)) != keyb(MyDC("hi", 2)) @@ -594,8 +590,7 @@ def test_attrs_hashing() -> None: name: str value: int - assert (keyb(MyAttrs("hi", 1)) # type: ignore[call-arg] - == "17f272d114d22c1dc0117354777f2d506b303d90e10840d39fb0eef007252f68") + assert (keyb(MyAttrs("hi", 1)) == "5b6c5da60eb2bd0f") # type: ignore[call-arg] assert keyb(MyAttrs("hi", 1)) == keyb(MyAttrs("hi", 1)) # type: ignore[call-arg] assert keyb(MyAttrs("hi", 1)) != keyb(MyAttrs("hi", 2)) # type: ignore[call-arg] @@ -626,7 +621,7 @@ def test_datetime_hashing() -> None: # No timezone info; date is always naive assert (keyb(datetime.date(2020, 1, 1)) == keyb(datetime.date(2020, 1, 1)) - == "9fb97d7faabc3603f3e334ca5eb1eb0fe0c92665e5611cb1b5aa77fa0f70f5e3") + == "1c866ff10ff0d997") assert keyb(datetime.date(2020, 1, 1)) != keyb(datetime.date(2020, 1, 2)) # }}} @@ -640,7 +635,7 @@ def test_datetime_hashing() -> None: == keyb(datetime.time(12, 0)) == keyb(datetime.time(12, 0, 0)) == keyb(datetime.time(12, 0, 0, 0)) - == "288ec82f6a00ac15968d4d257d4aca1089b863c61ef2ee200e64351238397705") + == "e523be74ebc6b227") assert keyb(datetime.time(12, 0)) != keyb(datetime.time(12, 1)) # Aware time @@ -653,7 +648,7 @@ def test_datetime_hashing() -> None: assert t1 == t2 assert (keyb(t1) == keyb(t2) - == "3587427ca9d581779d532b397df206ddeadfcf4e38b1ee69c19174e8e1268cc4") + == "2041e7cd5b17b8eb") assert t1 != t3 assert keyb(t1) != keyb(t3) @@ -672,7 +667,7 @@ def test_datetime_hashing() -> None: assert dt1 == dt2 assert (keyb(dt1) == keyb(dt2) - == "cd35722af47e42cb3bc81c389b87eb2e78ee8e20298bb1d8a193b30940d1c142") + == "8be96b9e739c7d8c") dt3 = datetime.datetime(2020, 1, 1, 7, tzinfo=datetime.timezone(datetime.timedelta(hours=-4))) @@ -688,7 +683,7 @@ def test_datetime_hashing() -> None: assert (keyb(datetime.datetime(2020, 1, 1)) == keyb(datetime.datetime(2020, 1, 1)) == keyb(datetime.datetime(2020, 1, 1, 0, 0, 0, 0)) - == "8f3b843d7b9176afd8e2ce97ebc19789098a1c7774c4ec00d4054ec954ce2b88" + == "215dbe82add7a55c" ) assert keyb(datetime.datetime(2020, 1, 1)) != keyb(datetime.datetime(2020, 1, 2)) assert (keyb(datetime.datetime(2020, 1, 1)) @@ -711,7 +706,7 @@ def test_datetime_hashing() -> None: assert tz2 == tz3 assert (keyb(tz2) == keyb(tz3) - == "89bd615f32c1f209b0853b1fc7d06ddb6fda7f367a00a8621d60337d52cb8d10") + == "5e1d46ab778c7ccf") # }}} @@ -771,7 +766,7 @@ def test_size(): size = pdict.nbytes() print("sqlite size: ", size/1024/1024, " MByte") - assert 1*1024*1024 < size < 2*1024*1024 + assert 1024*1024//2 < size < 2*1024*1024 finally: shutil.rmtree(tmpdir) @@ -841,8 +836,7 @@ def test_hash_function() -> None: # {{{ global functions - assert keyb(global_fun) == keyb(global_fun) == \ - "51b5980dd3a8aa13f6e83869e4a04c22973d7aaf96cb22899abdfdc55e15c9b2" + assert keyb(global_fun) == keyb(global_fun) == "79efd03f9a38ed77" assert keyb(global_fun) != keyb(global_fun2) # }}} @@ -882,8 +876,7 @@ def test_hash_function() -> None: def local_fun2(): pass - assert keyb(local_fun) == keyb(local_fun) == \ - "fc58f5b0130df821913c848749eb03f5dcd4da7a568c6130f1c0cfb96ed0d12d" + assert keyb(local_fun) == keyb(local_fun) == "adc92e690b62dc2b" assert keyb(local_fun) != keyb(local_fun2) # }}} @@ -898,8 +891,7 @@ def test_hash_function() -> None: def method(self): pass - assert keyb(C1.method) == keyb(C1.method) == \ - "3013eb424dac133a57bd70cb6084d2a2f349a247714efc508fe3b10b99b6f717" + assert keyb(C1.method) == keyb(C1.method) == "af19e056ad7749c4" assert keyb(C1.method) != keyb(C2.method) # }}} -- GitLab