Patch summary (free text before the first "diff --git" header).

Files touched: pyopencl/array.py and test/test_array.py.

pyopencl/array.py (hunk inside class Array):
  The cl.enqueue_fill_buffer call is now guarded by two conditions:
    * queue._get_cl_version() and cl.get_cl_header_version() are both
      >= (1, 2), and
    * it is NOT the case that queue.device.vendor starts with "NVIDIA"
      while self.nbytes >= 2**31.
  Whatever runs when the guard is false lies outside this hunk.
  Motivation, per the in-code comment: a bug with large buffers on NVIDIA,
  https://github.com/inducer/pyopencl/issues/395

test/test_array.py:
  Adds test_zeros_large_array. On a device with address_bits == 64 and
  max_mem_alloc_size >= 8 * size (size = 2**28 + 1, i.e. just over 2**31
  bytes of float64) it zero-allocates the array and runs two follow-up
  array operations. On any other device the test body does nothing.

NOTE(review): cl_version_gtr_1_2 is computed with ">=", so the name reads
  as "greater than" while the check is "at least 1.2".
NOTE(review): the "else: pass" branch makes the test pass silently on
  devices that cannot hold the buffer; consider reporting it as skipped.
NOTE(review): this patch was recovered from a copy whose newlines and
  leading whitespace had been collapsed. Line counts match the hunk
  headers, but indentation and the wrap point of the two trailing
  enqueue_fill_buffer context lines are reconstructed. Confirm against the
  target tree before applying.

diff --git a/pyopencl/array.py b/pyopencl/array.py
index bf58c965c69d69820a405de7a90eb7cd1e74bc87..f3b84c635a73a02d1c88e46f6a297ba9d37426fd 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1279,10 +1279,15 @@ class Array:
         if not self.size:
             return
 
-        if (
-                queue._get_cl_version() >= (1, 2)
-                and cl.get_cl_header_version() >= (1, 2)):
-
+        cl_version_gtr_1_2 = (
+            queue._get_cl_version() >= (1, 2)
+            and cl.get_cl_header_version() >= (1, 2)
+        )
+        on_nvidia = queue.device.vendor.startswith("NVIDIA")
+
+        # circumvent bug with large buffers on NVIDIA
+        # https://github.com/inducer/pyopencl/issues/395
+        if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31):
             self.add_event(
                     cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0),
                         self.offset, self.nbytes, wait_for=wait_for))
diff --git a/test/test_array.py b/test/test_array.py
index 613d2ff72cf06b17e33b6a2b15faf72bc711c3df..b5e99273cf2d1313cd3c1506fa6d8a5912253a17 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -205,6 +205,23 @@ def test_vector_fill(ctx_factory):
     a_gpu = cl_array.zeros(queue, 100, dtype=cltypes.float4)
 
 
+def test_zeros_large_array(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+    dev = queue.device
+
+    size = 2**28 + 1
+    if dev.address_bits == 64 and dev.max_mem_alloc_size >= 8 * size:
+        # this shouldn't hang/cause errors
+        # see https://github.com/inducer/pyopencl/issues/395
+        a_gpu = cl_array.zeros(queue, (size,), dtype="float64")
+        # run a couple kernels to ensure no propagated runtime errors
+        a_gpu[...] = 1.
+        a_gpu = 2 * a_gpu - 3
+    else:
+        pass
+
+
 def test_absrealimag(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)