From 7eaf1041c4d7b053e3ebdcb80d2a4dce7dffa756 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Sat, 4 Aug 2018 10:14:02 +0100
Subject: [PATCH] Make Array.get/set/copy wait for / append to self.events

---
 pyopencl/array.py  | 12 ++++++++----
 test/test_array.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 2d032079..d7ef138b 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -621,9 +621,11 @@ class Array(object):
                     stacklevel=2)
 
         if self.size:
-            cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
                     device_offset=self.offset,
                     is_blocking=not async_)
+            if async_:  # a blocking copy has already completed
+                self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
         """Transfer the contents of *self* into *ary* or a newly allocated
@@ -687,7 +689,7 @@ class Array(object):
         if self.size:
             cl.enqueue_copy(queue, ary, self.base_data,
                     device_offset=self.offset,
-                    is_blocking=not async_)
+                    wait_for=self.events, is_blocking=not async_)
 
         return ary
 
@@ -712,9 +714,11 @@ class Array(object):
             result = result.with_queue(queue)
 
         if self.nbytes:
-            cl.enqueue_copy(queue or self.queue,
+            event1 = cl.enqueue_copy(queue or self.queue,
                     result.base_data, self.base_data,
-                    src_offset=self.offset, byte_count=self.nbytes)
+                    src_offset=self.offset, byte_count=self.nbytes,
+                    wait_for=self.events)
+            result.add_event(event1)  # record the copy's event on result
 
         return result
 
diff --git a/test/test_array.py b/test/test_array.py
index bca78f5c..fdfcfce3 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1212,6 +1212,34 @@ def test_multi_put(ctx_factory):
 
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
+def test_outoforderqueue_get(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b_gpu = a_gpu + a_gpu**5 + 1
+    b1 = b_gpu.get()  # get() must wait for the events pending on b_gpu
+    b = a + a**5 + 1  # same expression evaluated on the host with NumPy
+    assert np.abs(b1 - b).mean() < 1e-5
+
+def test_outoforderqueue_copy(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    c_gpu = a_gpu**2 - 7
+    b_gpu = c_gpu.copy()  # copy() must wait for c_gpu's events and record its own
+    b_gpu *= 10
+    queue.finish()
+    b1 = b_gpu.get()
+    b = 10 * (a**2 - 7)  # host-side reference computed with NumPy
+    assert np.abs(b1 - b).mean() < 1e-5
 
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
-- 
GitLab