diff --git a/pyopencl/array.py b/pyopencl/array.py index 90e6aa57b0502d4e25ab552a70825e3e730a8a42..a1cbb4249644f331bbc2414077f709853749896a 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -624,7 +624,7 @@ class Array(object): event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async_) - if not async_: # not already waited for + if not async_: # not already waited for self.add_event(event1) def get(self, queue=None, ary=None, async_=None, **kwargs): @@ -1293,7 +1293,8 @@ class Array(object): krnl = get_any_kernel(self.context, self.dtype) if wait_for is None: wait_for = [] - result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) result.add_event(event1) return result @@ -1302,7 +1303,8 @@ class Array(object): krnl = get_all_kernel(self.context, self.dtype) if wait_for is None: wait_for = [] - result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) result.add_event(event1) return result @@ -1686,7 +1688,7 @@ class Array(object): if flags is None: flags = cl.map_flags.READ | cl.map_flags.WRITE if wait_for is None: - wait_for=[] + wait_for = [] ary, evt = cl.enqueue_map_buffer( queue or self.queue, self.base_data, flags, self.offset, @@ -2274,7 +2276,6 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if wait_for is None: wait_for = [] wait_for = wait_for + dest_indices.events - vec_count = len(arrays) @@ -2535,7 +2536,8 @@ def sum(a, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) - result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, return_event=True) + result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, + return_event=True) result.add_event(event1) return result @@ -2546,7 +2548,8 @@ def dot(a, b, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) - result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2559,7 +2562,8 @@ def vdot(a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) - result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2572,7 +2576,7 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) result, event1 = krnl(subset, a, b, queue=queue, slice=slice, - wait_for=subset.events + a.events + b.events, return_event=True) + wait_for=subset.events + a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2581,7 +2585,8 @@ def _make_minmax_kernel(what): def f(a, queue=None): from pyopencl.reduction import get_minmax_kernel krnl = get_minmax_kernel(a.context, what, a.dtype) - result, event1 = krnl(a, queue=queue, wait_for=a.events, return_event=True) + result, event1 = krnl(a, queue=queue, wait_for=a.events, + return_event=True) result.add_event(event1) return result @@ -2604,7 +2609,7 @@ def _make_subset_minmax_kernel(what): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) result, event1 = krnl(subset, a, queue=queue, slice=slice, - wait_for=a.events + subset.events, return_event=True) + wait_for=a.events + subset.events, return_event=True) result.add_event(event1) return result return f diff --git a/test/test_array.py b/test/test_array.py index c9771ac0b5bd15153080bd06a5dbdb658009b4ed..05008c169ae782a49b5b985c7a79780e337c5770 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1212,39 +1212,45 @@ def test_multi_put(ctx_factory): assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) + def test_outoforderqueue_get(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) b_gpu = a_gpu + a_gpu**5 + 1 - b1 = b_gpu.get() # testing that this waits for events + b1 = b_gpu.get() # testing that this waits for events b = a + a**5 + 1 assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_copy(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) c_gpu = a_gpu**2 - 7 - b_gpu = c_gpu.copy() # testing that this waits for and creates events + b_gpu = c_gpu.copy() # testing that this waits for and creates events b_gpu *= 10 queue.finish() b1 = b_gpu.get() b = 10 * (a**2 - 7) assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_indexing(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) @@ -1262,20 +1268,24 @@ def test_outoforderqueue_indexing(ctx_factory): b[i + 10000] = c - 10 assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_reductions(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) # 0/1 values to avoid accumulated rounding error - a[800000] = 10 # all<5 looks true until near the end + # 0/1 values to avoid accumulated rounding error + a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) + a[800000] = 10 # all<5 looks true until near the end a_gpu = cl_array.to_device(queue, a) b1 = cl_array.sum(a_gpu).get() b2 = cl_array.dot(a_gpu, 3 - a_gpu).get() b3 = (a_gpu < 5).all().get() assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. diff --git a/test/test_clmath.py b/test/test_clmath.py index f4a559367539f7df23e8124c6722c63a34b20d27..beebc2a8c0ad717e7139c340bff14a07fb77b60c 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -446,15 +446,18 @@ def test_hankel_01_complex(ctx_factory, ref_src): pt.loglog(np.abs(z), rel_err_h1) pt.show() + def test_outoforderqueue_clmath(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) - b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) # testing that clmath functions wait for and create events + # testing that clmath functions wait for and create events + b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) queue.finish() b1 = b_gpu.get() b = np.abs(np.sin(a * 5))