diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 7787041c4ba8135c1f1c5e3f54e5dab0007dbc21..e94af0d2de4325263dada0c8ce5375c7e74deb27 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -1065,6 +1065,10 @@ class GenericScanKernel(_GenericScanKernelBase):
                 dev.local_mem_size
                 for dev in self.devices)
 
+        if "CUDA" in self.devices[0].platform.name:
+            # unclear why, but roughly 0x400 bytes of local memory seem unavailable.
+            avail_local_mem -= 0x400
+
         is_cpu = self.devices[0].type & cl.device_type.CPU
         is_gpu = self.devices[0].type & cl.device_type.GPU
 
@@ -1091,7 +1095,7 @@ class GenericScanKernel(_GenericScanKernelBase):
                 k_group_size = 2**k_exp
                 lmem_use = self.get_local_mem_use(wg_size, k_group_size,
                         use_bank_conflict_avoidance)
-                if lmem_use + 256 <= avail_local_mem:
+                if lmem_use <= avail_local_mem:
                     solutions.append((wg_size*k_group_size, k_group_size, wg_size))
 
         if is_gpu: