diff --git a/pyopencl/array.py b/pyopencl/array.py
index 2d03207962b5fb1c6f5c9c4edb48ad1d3478e231..d7ef138b021bdf21162cade3a70b59136b616b3b 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -621,9 +621,11 @@ class Array(object):
                     stacklevel=2)
 
         if self.size:
-            cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
                     device_offset=self.offset,
                     is_blocking=not async_)
+            if async_:  # blocking copies are done; track only pending ones
+                self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
         """Transfer the contents of *self* into *ary* or a newly allocated
@@ -687,7 +689,7 @@ class Array(object):
         if self.size:
             cl.enqueue_copy(queue, ary, self.base_data,
                     device_offset=self.offset,
-                    is_blocking=not async_)
+                    wait_for=self.events, is_blocking=not async_)
 
         return ary
 
@@ -712,9 +714,11 @@ class Array(object):
             result = result.with_queue(queue)
 
         if self.nbytes:
-            cl.enqueue_copy(queue or self.queue,
+            event1 = cl.enqueue_copy(queue or self.queue,
                     result.base_data, self.base_data,
-                    src_offset=self.offset, byte_count=self.nbytes)
+                    src_offset=self.offset, byte_count=self.nbytes,
+                    wait_for=self.events)
+            result.add_event(event1)
 
         return result
 
diff --git a/test/test_array.py b/test/test_array.py
index bca78f5ccbeebf1d86f9ec03a25ded6050bc4097..fdfcfce3a7ce857125c38b832a4db084c4fd86c5 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1212,6 +1212,34 @@ def test_multi_put(ctx_factory):
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
 
+def test_outoforderqueue_get(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b_gpu = a_gpu + a_gpu**5 + 1
+    b1 = b_gpu.get()  # testing that this waits for events
+    b = a + a**5 + 1
+    assert np.abs(b1 - b).mean() < 1e-5
+
+def test_outoforderqueue_copy(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    c_gpu = a_gpu**2 - 7
+    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
+    b_gpu *= 10
+    queue.finish()
+    b1 = b_gpu.get()
+    b = 10 * (a**2 - 7)
+    assert np.abs(b1 - b).mean() < 1e-5
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the