From 5facd423d57885fdfeb231df22d621c1fbc5680c Mon Sep 17 00:00:00 2001
From: Matthias Diener <mdiener@illinois.edu>
Date: Tue, 4 Jun 2024 17:25:18 -0600
Subject: [PATCH] PersistentDict: allow setting sync (#229)

Co-authored-by: Andreas Kloeckner <inform@tiker.net>
---
 pytools/persistent_dict.py           | 38 ++++++++++++++++------
 pytools/test/test_persistent_dict.py | 47 +++++++++++++++++-----------
 2 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/pytools/persistent_dict.py b/pytools/persistent_dict.py
index 3e6ca3e..27860dd 100644
--- a/pytools/persistent_dict.py
+++ b/pytools/persistent_dict.py
@@ -431,7 +431,8 @@ class _PersistentDictBase(Mapping[K, V]):
     def __init__(self, identifier: str,
                  key_builder: Optional[KeyBuilder] = None,
                  container_dir: Optional[str] = None,
-                 enable_wal: bool = False) -> None:
+                 enable_wal: bool = False,
+                 safe_sync: Optional[bool] = None) -> None:
         self.identifier = identifier
         self.conn = None
 
@@ -471,17 +472,30 @@ class _PersistentDictBase(Mapping[K, V]):
         if enable_wal:
             self.conn.execute("PRAGMA journal_mode = 'WAL'")
 
-        # Note: the following configuration values were taken from litedict:
+        # Note: the following configuration values were taken mostly from litedict:
         # https://github.com/litements/litedict/blob/377603fa597453ffd9997186a493ed4fd23e5399/litedict.py#L67-L70
-        # They result in fast operations while maintaining database integrity
-        # even in the face of concurrent accesses and power loss.
 
-        # temp_store=2: use in-memory temp store
+        # Use in-memory temp store
         # https://www.sqlite.org/pragma.html#pragma_temp_store
-        self.conn.execute("PRAGMA temp_store = 2")
+        self.conn.execute("PRAGMA temp_store = 'MEMORY'")
 
+        # fsync() can be extremely slow on some systems.
+        # See https://github.com/inducer/pytools/issues/227 for context.
         # https://www.sqlite.org/pragma.html#pragma_synchronous
-        self.conn.execute("PRAGMA synchronous = NORMAL")
+        if safe_sync is None or safe_sync:
+            if safe_sync is None:
+                from warnings import warn
+                warn(f"pytools.persistent_dict '{identifier}': "
+                     "enabling safe_sync as default. "
+                     "This provides strong protection against data loss, "
+                     "but can be unnecessarily expensive for use cases such as "
+                     "caches."
+                     "Pass 'safe_sync=False' if occasional data loss is tolerable. "
+                     "Pass 'safe_sync=True' to suppress this warning.",
+                     stacklevel=3)
+            self.conn.execute("PRAGMA synchronous = 'NORMAL'")
+        else:
+            self.conn.execute("PRAGMA synchronous = 'OFF'")
 
         # 64 MByte of cache
         # https://www.sqlite.org/pragma.html#pragma_cache_size
@@ -595,7 +609,9 @@ class WriteOncePersistentDict(_PersistentDictBase[K, V]):
     def __init__(self, identifier: str,
                  key_builder: Optional[KeyBuilder] = None,
                  container_dir: Optional[str] = None,
+                 *,
                  enable_wal: bool = False,
+                 safe_sync: Optional[bool] = None,
                  in_mem_cache_size: int = 256) -> None:
         """
         :arg identifier: a filename-compatible string identifying this
@@ -611,7 +627,7 @@ class WriteOncePersistentDict(_PersistentDictBase[K, V]):
             *in_mem_cache_size* items (with an LRU replacement policy)
         """
         _PersistentDictBase.__init__(self, identifier, key_builder,
-                                     container_dir, enable_wal)
+                                     container_dir, enable_wal, safe_sync)
         from functools import lru_cache
         self._fetch = lru_cache(maxsize=in_mem_cache_size)(self._fetch)
 
@@ -683,7 +699,9 @@ class PersistentDict(_PersistentDictBase[K, V]):
                  identifier: str,
                  key_builder: Optional[KeyBuilder] = None,
                  container_dir: Optional[str] = None,
-                 enable_wal: bool = False) -> None:
+                 *,
+                 enable_wal: bool = False,
+                 safe_sync: Optional[bool] = None) -> None:
         """
         :arg identifier: a filename-compatible string identifying this
             dictionary
@@ -696,7 +714,7 @@ class PersistentDict(_PersistentDictBase[K, V]):
             not compatible with network filesystems.
         """
         _PersistentDictBase.__init__(self, identifier, key_builder,
-                                     container_dir, enable_wal)
+                                     container_dir, enable_wal, safe_sync)
 
     def store(self, key: K, value: V, _skip_if_present: bool = False) -> None:
         keyhash = self.key_builder(key)
diff --git a/pytools/test/test_persistent_dict.py b/pytools/test/test_persistent_dict.py
index 5e8dfd4..bc67e66 100644
--- a/pytools/test/test_persistent_dict.py
+++ b/pytools/test/test_persistent_dict.py
@@ -64,7 +64,8 @@ def test_persistent_dict_storage_and_lookup() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: PersistentDict[Any, int] = PersistentDict("pytools-test",
-                                                         container_dir=tmpdir)
+                                                         container_dir=tmpdir,
+                                                         safe_sync=False)
 
         from random import randrange
 
@@ -163,7 +164,8 @@ def test_persistent_dict_deletion() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: PersistentDict[int, int] = PersistentDict("pytools-test",
-                                                         container_dir=tmpdir)
+                                                         container_dir=tmpdir,
+                                                         safe_sync=False)
 
         pdict[0] = 0
         del pdict[0]
@@ -185,9 +187,11 @@ def test_persistent_dict_synchronization() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict1: PersistentDict[int, int] = PersistentDict("pytools-test",
-                                                          container_dir=tmpdir)
+                                                          container_dir=tmpdir,
+                                                          safe_sync=False)
         pdict2: PersistentDict[int, int] = PersistentDict("pytools-test",
-                                                          container_dir=tmpdir)
+                                                          container_dir=tmpdir,
+                                                          safe_sync=False)
 
         # check lookup
         pdict1[0] = 1
@@ -210,7 +214,7 @@ def test_persistent_dict_cache_collisions() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: PersistentDict[PDictTestingKeyOrValue, int] = \
-            PersistentDict("pytools-test", container_dir=tmpdir)
+            PersistentDict("pytools-test", container_dir=tmpdir, safe_sync=False)
 
         key1 = PDictTestingKeyOrValue(1, hash_key=0)
         key2 = PDictTestingKeyOrValue(2, hash_key=0)
@@ -242,7 +246,8 @@ def test_persistent_dict_clear() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: PersistentDict[int, int] = PersistentDict("pytools-test",
-                                                         container_dir=tmpdir)
+                                                         container_dir=tmpdir,
+                                                         safe_sync=False)
 
         pdict[0] = 1
         pdict.fetch(0)
@@ -261,7 +266,7 @@ def test_write_once_persistent_dict_storage_and_lookup(in_mem_cache_size) -> Non
         tmpdir = tempfile.mkdtemp()
         pdict: WriteOncePersistentDict[int, int] = WriteOncePersistentDict(
                 "pytools-test", container_dir=tmpdir,
-                in_mem_cache_size=in_mem_cache_size)
+                in_mem_cache_size=in_mem_cache_size, safe_sync=False)
 
         # check lookup
         pdict[0] = 1
@@ -291,7 +296,8 @@ def test_write_once_persistent_dict_lru_policy() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: WriteOncePersistentDict[Any, Any] = WriteOncePersistentDict(
-                "pytools-test", container_dir=tmpdir, in_mem_cache_size=3)
+                "pytools-test", container_dir=tmpdir, in_mem_cache_size=3,
+                safe_sync=False)
 
         pdict[1] = PDictTestingKeyOrValue(1)
         pdict[2] = PDictTestingKeyOrValue(2)
@@ -331,9 +337,11 @@ def test_write_once_persistent_dict_synchronization() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict1: WriteOncePersistentDict[int, int] = \
-            WriteOncePersistentDict("pytools-test", container_dir=tmpdir)
+            WriteOncePersistentDict("pytools-test", container_dir=tmpdir,
+                                    safe_sync=False)
         pdict2: WriteOncePersistentDict[int, int] = \
-            WriteOncePersistentDict("pytools-test", container_dir=tmpdir)
+            WriteOncePersistentDict("pytools-test", container_dir=tmpdir,
+                                    safe_sync=False)
 
         # check lookup
         pdict1[1] = 0
@@ -351,7 +359,8 @@ def test_write_once_persistent_dict_cache_collisions() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: WriteOncePersistentDict[Any, int] = \
-            WriteOncePersistentDict("pytools-test", container_dir=tmpdir)
+            WriteOncePersistentDict("pytools-test", container_dir=tmpdir,
+                                    safe_sync=False)
 
         key1 = PDictTestingKeyOrValue(1, hash_key=0)
         key2 = PDictTestingKeyOrValue(2, hash_key=0)
@@ -378,7 +387,8 @@ def test_write_once_persistent_dict_clear() -> None:
     try:
         tmpdir = tempfile.mkdtemp()
         pdict: WriteOncePersistentDict[int, int] = \
-            WriteOncePersistentDict("pytools-test", container_dir=tmpdir)
+            WriteOncePersistentDict("pytools-test", container_dir=tmpdir,
+                                    safe_sync=False)
 
         pdict[0] = 1
         pdict.fetch(0)
@@ -710,7 +720,7 @@ def test_xdg_cache_home() -> None:
     try:
         os.environ["XDG_CACHE_HOME"] = xdg_dir
 
-        PersistentDict("pytools-test")
+        PersistentDict("pytools-test", safe_sync=False)
 
         assert os.path.exists(xdg_dir)
     finally:
@@ -726,7 +736,8 @@ def test_speed():
     import time
 
     tmpdir = tempfile.mkdtemp()
-    pdict = WriteOncePersistentDict("pytools-test", container_dir=tmpdir)
+    pdict = WriteOncePersistentDict("pytools-test", container_dir=tmpdir,
+                                    safe_sync=False)
 
     start = time.time()
     for i in range(10000):
@@ -747,7 +758,7 @@ def test_speed():
 def test_size():
     try:
         tmpdir = tempfile.mkdtemp()
-        pdict = PersistentDict("pytools-test", container_dir=tmpdir)
+        pdict = PersistentDict("pytools-test", container_dir=tmpdir, safe_sync=False)
 
         for i in range(10000):
             pdict[f"foobarbazfoobbb{i}"] = i
@@ -762,7 +773,7 @@ def test_size():
 def test_len():
     try:
         tmpdir = tempfile.mkdtemp()
-        pdict = PersistentDict("pytools-test", container_dir=tmpdir)
+        pdict = PersistentDict("pytools-test", container_dir=tmpdir, safe_sync=False)
 
         assert len(pdict) == 0
 
@@ -781,7 +792,7 @@ def test_len():
 def test_repr():
     try:
         tmpdir = tempfile.mkdtemp()
-        pdict = PersistentDict("pytools-test", container_dir=tmpdir)
+        pdict = PersistentDict("pytools-test", container_dir=tmpdir, safe_sync=False)
 
         assert repr(pdict)[:15] == "PersistentDict("
     finally:
@@ -791,7 +802,7 @@ def test_repr():
 def test_keys_values_items():
     try:
         tmpdir = tempfile.mkdtemp()
-        pdict = PersistentDict("pytools-test", container_dir=tmpdir)
+        pdict = PersistentDict("pytools-test", container_dir=tmpdir, safe_sync=False)
 
         for i in range(10000):
             pdict[i] = i
-- 
GitLab