diff --git a/pyopencl/array.py b/pyopencl/array.py
index bf58c965c69d69820a405de7a90eb7cd1e74bc87..f3b84c635a73a02d1c88e46f6a297ba9d37426fd 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1279,10 +1279,15 @@ class Array:
         if not self.size:
             return
 
-        if (
-                queue._get_cl_version() >= (1, 2)
-                and cl.get_cl_header_version() >= (1, 2)):
-
+        cl_version_gtr_1_2 = (
+            queue._get_cl_version() >= (1, 2)
+            and cl.get_cl_header_version() >= (1, 2)
+        )
+        on_nvidia = queue.device.vendor.startswith("NVIDIA")
+
+        # circumvent bug with large buffers on NVIDIA
+        # https://github.com/inducer/pyopencl/issues/395
+        if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31):
             self.add_event(
                     cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0),
                         self.offset, self.nbytes, wait_for=wait_for))
diff --git a/test/test_array.py b/test/test_array.py
index 613d2ff72cf06b17e33b6a2b15faf72bc711c3df..b5e99273cf2d1313cd3c1506fa6d8a5912253a17 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -205,6 +205,23 @@ def test_vector_fill(ctx_factory):
     a_gpu = cl_array.zeros(queue, 100, dtype=cltypes.float4)
 
 
+def test_zeros_large_array(ctx_factory):
+    """Zeroing a >= 2**31-byte array must not hang or fail (issue #395)."""
+    import pytest
+    queue = cl.CommandQueue(ctx_factory())
+    dev = queue.device
+    size = 2**28 + 1  # float64 elements: just over 2**31 bytes in total
+    if dev.address_bits != 64 or dev.max_mem_alloc_size < 8 * size:
+        pytest.skip("device cannot allocate a buffer of 2**31+ bytes")
+
+    # see https://github.com/inducer/pyopencl/issues/395
+    a_gpu = cl_array.zeros(queue, (size,), dtype="float64")
+    # run a couple kernels to ensure no propagated runtime errors
+    a_gpu[...] = 1.
+    a_gpu = 2 * a_gpu - 3
+    queue.finish()  # enqueues are asynchronous; sync so failures surface here
+
+
 def test_absrealimag(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)