From 7eaf1041c4d7b053e3ebdcb80d2a4dce7dffa756 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Sat, 4 Aug 2018 10:14:02 +0100
Subject: [PATCH 1/8] Make Array.get/set/copy wait for / append to self.events

---
 pyopencl/array.py  | 12 ++++++++----
 test/test_array.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 2d032079..d7ef138b 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -621,9 +621,11 @@ class Array(object):
                     stacklevel=2)
 
         if self.size:
-            cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
                     device_offset=self.offset,
                     is_blocking=not async_)
+            if not async_: # not already waited for
+                self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
         """Transfer the contents of *self* into *ary* or a newly allocated
@@ -687,7 +689,7 @@ class Array(object):
         if self.size:
             cl.enqueue_copy(queue, ary, self.base_data,
                     device_offset=self.offset,
-                    is_blocking=not async_)
+                    wait_for=self.events, is_blocking=not async_)
 
         return ary
 
@@ -712,9 +714,11 @@ class Array(object):
             result = result.with_queue(queue)
 
         if self.nbytes:
-            cl.enqueue_copy(queue or self.queue,
+            event1 = cl.enqueue_copy(queue or self.queue,
                     result.base_data, self.base_data,
-                    src_offset=self.offset, byte_count=self.nbytes)
+                    src_offset=self.offset, byte_count=self.nbytes,
+                    wait_for=self.events)
+            result.add_event(event1)
 
         return result
 
diff --git a/test/test_array.py b/test/test_array.py
index bca78f5c..fdfcfce3 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1212,6 +1212,34 @@ def test_multi_put(ctx_factory):
 
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
+def test_outoforderqueue_get(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b_gpu = a_gpu + a_gpu**5 + 1
+    b1 = b_gpu.get() # testing that this waits for events
+    b = a + a**5 + 1
+    assert np.abs(b1 - b).mean() < 1e-5
+
+def test_outoforderqueue_copy(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    c_gpu = a_gpu**2 - 7
+    b_gpu = c_gpu.copy() # testing that this waits for and creates events
+    b_gpu *= 10
+    queue.finish()
+    b1 = b_gpu.get()
+    b = 10 * (a**2 - 7)
+    assert np.abs(b1 - b).mean() < 1e-5
 
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
-- 
GitLab


From da8cc309c973ef9572f8ac8b4660720f2faf205b Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Sat, 4 Aug 2018 10:16:27 +0100
Subject: [PATCH 2/8] Make clmath functions append their Event to result.events

---
 pyopencl/clmath.py  | 27 +++++++++++++++++----------
 test/test_clmath.py | 14 ++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py
index 73d390cf..6d45fe44 100644
--- a/pyopencl/clmath.py
+++ b/pyopencl/clmath.py
@@ -41,7 +41,8 @@ def _make_unary_array_func(name):
 
     def f(array, queue=None):
         result = array._new_like_me(queue=queue)
-        knl_runner(result, array, queue=queue)
+        event1 = knl_runner(result, array, queue=queue)
+        result.add_event(event1)
         return result
 
     return f
@@ -78,7 +79,7 @@ def atan2(y, x, queue=None):
     """
     queue = queue or y.queue
     result = y._new_like_me(_get_common_dtype(y, x, queue))
-    _atan2(result, y, x, queue=queue)
+    result.add_event(_atan2(result, y, x, queue=queue))
     return result
 
 
@@ -92,7 +93,7 @@ def atan2pi(y, x, queue=None):
     """
     queue = queue or y.queue
     result = y._new_like_me(_get_common_dtype(y, x, queue))
-    _atan2pi(result, y, x, queue=queue)
+    result.add_event(_atan2pi(result, y, x, queue=queue))
     return result
 
 
@@ -130,7 +131,7 @@ def fmod(arg, mod, queue=None):
     for each element in `arg` and `mod`."""
     queue = (queue or arg.queue) or mod.queue
     result = arg._new_like_me(_get_common_dtype(arg, mod, queue))
-    _fmod(result, arg, mod, queue=queue)
+    result.add_event(_fmod(result, arg, mod, queue=queue))
     return result
 
 # TODO: fract
@@ -148,7 +149,9 @@ def frexp(arg, queue=None):
     """
     sig = arg._new_like_me(queue=queue)
     expt = arg._new_like_me(queue=queue, dtype=np.int32)
-    _frexp(sig, expt, arg, queue=queue)
+    event1 = _frexp(sig, expt, arg, queue=queue)
+    sig.add_event(event1)
+    expt.add_event(event1)
     return sig, expt
 
 # TODO: hypot
@@ -169,7 +172,7 @@ def ldexp(significand, exponent, queue=None):
     `result = significand * 2**exponent`.
     """
     result = significand._new_like_me(queue=queue)
-    _ldexp(result, significand, exponent)
+    result.add_event(_ldexp(result, significand, exponent))
     return result
 
 
@@ -199,7 +202,9 @@ def modf(arg, queue=None):
     """
     intpart = arg._new_like_me(queue=queue)
     fracpart = arg._new_like_me(queue=queue)
-    _modf(intpart, fracpart, arg, queue=queue)
+    event1 = _modf(intpart, fracpart, arg, queue=queue)
+    fracpart.add_event(event1)
+    intpart.add_event(event1)
     return fracpart, intpart
 
 
@@ -254,18 +259,20 @@ def _hankel_01(h0, h1, x):
 
 def bessel_jn(n, x, queue=None):
     result = x._new_like_me(queue=queue)
-    _bessel_jn(result, n, x, queue=queue)
+    result.add_event(_bessel_jn(result, n, x, queue=queue))
     return result
 
 
 def bessel_yn(n, x, queue=None):
     result = x._new_like_me(queue=queue)
-    _bessel_yn(result, n, x, queue=queue)
+    result.add_event(_bessel_yn(result, n, x, queue=queue))
     return result
 
 
 def hankel_01(x, queue=None):
     h0 = x._new_like_me(queue=queue)
     h1 = x._new_like_me(queue=queue)
-    _hankel_01(h0, h1, x, queue=queue)
+    event1 = _hankel_01(h0, h1, x, queue=queue)
+    h0.add_event(event1)
+    h1.add_event(event1)
     return h0, h1
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 553ed7a6..f4a55936 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -446,6 +446,20 @@ def test_hankel_01_complex(ctx_factory, ref_src):
         pt.loglog(np.abs(z), rel_err_h1)
         pt.show()
 
+def test_outoforderqueue_clmath(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    a_gpu = cl_array.to_device(queue, a)
+    b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) # testing that clmath functions wait for and create events
+    queue.finish()
+    b1 = b_gpu.get()
+    b = np.abs(np.sin(a * 5))
+    assert np.abs(b1 - b).mean() < 1e-5
+
 
 if __name__ == "__main__":
     import sys
-- 
GitLab


From e723b2e7260b7133673bca85abe7e64e3f0a3478 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Sun, 5 Aug 2018 14:32:09 +0100
Subject: [PATCH 3/8] Make various Array functions wait for / append to
 self.events

(setitem, diff, if_positive, cumsum, all the reductions (sum/all/etc))
---
 pyopencl/array.py  | 86 +++++++++++++++++++++++++++++++++++-----------
 test/test_array.py | 35 +++++++++++++++++++
 2 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index d7ef138b..b904c7de 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1291,12 +1291,20 @@ class Array(object):
     def any(self, queue=None, wait_for=None):
         from pyopencl.reduction import get_any_kernel
         krnl = get_any_kernel(self.context, self.dtype)
-        return krnl(self, queue=queue, wait_for=wait_for)
+        if wait_for is None:
+            wait_for = []
+        result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True)
+        result.add_event(event1)
+        return result
 
     def all(self, queue=None, wait_for=None):
         from pyopencl.reduction import get_all_kernel
         krnl = get_all_kernel(self.context, self.dtype)
-        return krnl(self, queue=queue, wait_for=wait_for)
+        if wait_for is None:
+            wait_for = []
+        result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True)
+        result.add_event(event1)
+        return result
 
     @staticmethod
     @elwise_kernel_runner
@@ -1677,11 +1685,13 @@ class Array(object):
 
         if flags is None:
             flags = cl.map_flags.READ | cl.map_flags.WRITE
+        if wait_for is None:
+            wait_for=[]
 
         ary, evt = cl.enqueue_map_buffer(
                 queue or self.queue, self.base_data, flags, self.offset,
-                self.shape, self.dtype, strides=self.strides, wait_for=wait_for,
-                is_blocking=is_blocking)
+                self.shape, self.dtype, strides=self.strides,
+                wait_for=wait_for + self.events, is_blocking=is_blocking)
 
         if is_blocking:
             return ary
@@ -1800,6 +1810,9 @@ class Array(object):
         """
 
         queue = queue or self.queue or value.queue
+        if wait_for is None:
+            wait_for = []
+        wait_for = wait_for + self.events
 
         if isinstance(subscript, Array):
             if subscript.dtype.kind != "i":
@@ -2149,11 +2162,16 @@ def multi_take(arrays, indices, out=None, queue=None):
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
 
-        knl(queue, gs, ls,
+        wait_for_this = (indices.events
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
+        evt = knl(queue, gs, ls,
                 indices.data,
                 *([o.data for o in out[chunk_slice]]
                     + [i.data for i in arrays[chunk_slice]]
-                    + [indices.size]))
+                    + [indices.size]), wait_for=wait_for_this)
+        for o in out[chunk_slice]:
+            o.add_event(evt)
 
     return out
 
@@ -2223,7 +2241,10 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                     queue.device))
 
         from pytools import flatten
-        knl(queue, gs, ls,
+        wait_for_this = (dest_indices.events + src_indices.events
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
+        evt = knl(queue, gs, ls,
                 *([o.data for o in out[chunk_slice]]
                     + [dest_indices.base_data,
                         dest_indices.offset,
@@ -2234,6 +2255,8 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                         for i in arrays[chunk_slice]))
                     + src_offsets_list[chunk_slice]
                     + [src_indices.size]))
+        for o in out[chunk_slice]:
+            o.add_event(evt)
 
     return out
 
@@ -2248,6 +2271,10 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
     a_allocator = arrays[0].allocator
     context = dest_indices.context
     queue = queue or dest_indices.queue
+    if wait_for is None:
+        wait_for = []
+    wait_for = wait_for + dest_indices.events
+    
 
     vec_count = len(arrays)
 
@@ -2299,6 +2326,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     queue.device))
 
         from pytools import flatten
+        wait_for_this = (wait_for
+            + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
+            + _builtin_sum((o.events for o in out[chunk_slice]), []))
         evt = knl(queue, gs, ls,
                 *(
                     list(flatten(
@@ -2311,9 +2341,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + [use_fill_cla.base_data, use_fill_cla.offset]
                     + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
-                **dict(wait_for=wait_for))
-
-        # FIXME should wait on incoming events
+                **dict(wait_for=wait_for_this))
 
         for o in out[chunk_slice]:
             o.add_event(evt)
@@ -2391,7 +2419,8 @@ def diff(array, queue=None, allocator=None):
     allocator = allocator or array.allocator
 
     result = empty(queue, (n-1,), array.dtype, allocator=allocator)
-    _diff(result, array, queue=queue)
+    event1 = _diff(result, array, queue=queue)
+    result.add_event(event1)
     return result
 
 
@@ -2472,7 +2501,8 @@ def if_positive(criterion, then_, else_, out=None, queue=None):
 
     if out is None:
         out = empty_like(then_)
-    _if_positive(out, criterion, then_, else_, queue=queue)
+    event1 = _if_positive(out, criterion, then_, else_, queue=queue)
+    out.add_event(event1)
     return out
 
 
@@ -2505,7 +2535,9 @@ def sum(a, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_sum_kernel
     krnl = get_sum_kernel(a.context, dtype, a.dtype)
-    return krnl(a, queue=queue, slice=slice)
+    result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def dot(a, b, dtype=None, queue=None, slice=None):
@@ -2514,7 +2546,9 @@ def dot(a, b, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype)
-    return krnl(a, b, queue=queue, slice=slice)
+    result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def vdot(a, b, dtype=None, queue=None, slice=None):
@@ -2525,7 +2559,9 @@ def vdot(a, b, dtype=None, queue=None, slice=None):
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype,
             conjugate_first=True)
-    return krnl(a, b, queue=queue, slice=slice)
+    result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
@@ -2535,14 +2571,19 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
     from pyopencl.reduction import get_subset_dot_kernel
     krnl = get_subset_dot_kernel(
             a.context, dtype, subset.dtype, a.dtype, b.dtype)
-    return krnl(subset, a, b, queue=queue, slice=slice)
+    result, event1 = krnl(subset, a, b, queue=queue, slice=slice,
+        wait_for=subset.events + a.events + b.events, return_event=True)
+    result.add_event(event1)
+    return result
 
 
 def _make_minmax_kernel(what):
     def f(a, queue=None):
         from pyopencl.reduction import get_minmax_kernel
         krnl = get_minmax_kernel(a.context, what, a.dtype)
-        return krnl(a,  queue=queue)
+        result, event1 = krnl(a,  queue=queue, wait_for=a.events, return_event=True)
+        result.add_event(event1)
+        return result
 
     return f
 
@@ -2562,8 +2603,10 @@ def _make_subset_minmax_kernel(what):
     def f(subset, a, queue=None, slice=None):
         from pyopencl.reduction import get_subset_minmax_kernel
         krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype)
-        return krnl(subset, a,  queue=queue, slice=slice)
-
+        result, event1 = krnl(subset, a,  queue=queue, slice=slice,
+            wait_for=a.events + subset.events, return_event=True)
+        result.add_event(event1)
+        return result
     return f
 
 
@@ -2587,12 +2630,15 @@ def cumsum(a, output_dtype=None, queue=None,
 
     if output_dtype is None:
         output_dtype = a.dtype
+    if wait_for is None:
+        wait_for = []
 
     result = a._new_like_me(output_dtype)
 
     from pyopencl.scan import get_cumsum_kernel
     krnl = get_cumsum_kernel(a.context, a.dtype, output_dtype)
-    evt = krnl(a, result, queue=queue, wait_for=wait_for)
+    evt = krnl(a, result, queue=queue, wait_for=wait_for + a.events)
+    result.add_event(evt)
 
     if return_event:
         return evt, result
diff --git a/test/test_array.py b/test/test_array.py
index fdfcfce3..c9771ac0 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1241,6 +1241,41 @@ def test_outoforderqueue_copy(ctx_factory):
     b = 10 * (a**2 - 7)
     assert np.abs(b1 - b).mean() < 1e-5
 
+def test_outoforderqueue_indexing(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = np.random.rand(10**6).astype(np.dtype('float32'))
+    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32'))
+    a_gpu = cl_array.to_device(queue, a)
+    i_gpu = cl_array.to_device(queue, i)
+    c_gpu = (a_gpu**2)[i_gpu - 10000]
+    b_gpu = 10 - a_gpu
+    b_gpu[:] = 8 * a_gpu
+    b_gpu[i_gpu + 10000] = c_gpu - 10
+    queue.finish()
+    b1 = b_gpu.get()
+    c = (a**2)[i - 10000]
+    b = 8 * a
+    b[i + 10000] = c - 10
+    assert np.abs(b1 - b).mean() < 1e-5
+
+def test_outoforderqueue_reductions(ctx_factory):
+    context = ctx_factory()
+    try:
+        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    except Exception:
+        pytest.skip("out-of-order queue not available")
+    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) # 0/1 values to avoid accumulated rounding error
+    a[800000] = 10 # all<5 looks true until near the end
+    a_gpu = cl_array.to_device(queue, a)
+    b1 = cl_array.sum(a_gpu).get()
+    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
+    b3 = (a_gpu < 5).all().get()
+    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
-- 
GitLab


From b5e1dc63308183e1462762c471f53a4467369c67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Sun, 5 Aug 2018 19:29:55 -0400
Subject: [PATCH 4/8] Force use of gcc for Conda Apple CI

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e37c40b5..0047755f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -166,6 +166,7 @@ Python 2.7 Apple:
 Python 3 Conda Apple:
   script:
   - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - export CC=gcc
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
-- 
GitLab


From faff864aaf0d7ecd6de62350443c066b71040c64 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Mon, 6 Aug 2018 21:45:00 +0100
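Subject: [PATCH 5/8] actually wait for wait_for_this

In multi_take_put, wait_for_this was computed but never passed to the
kernel call, so the kernel launch did not wait on the collected events.
Pass it as wait_for.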
---
 pyopencl/array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index b904c7de..90e6aa57 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2254,7 +2254,7 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
                     + src_offsets_list[chunk_slice]
-                    + [src_indices.size]))
+                    + [src_indices.size]), wait_for=wait_for_this)
         for o in out[chunk_slice]:
             o.add_event(evt)
 
-- 
GitLab


From e247a566931d0ead98a44196b40403704aee2a0c Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Mon, 6 Aug 2018 22:00:55 +0100
Subject: [PATCH 6/8] flake8 style cleanup

---
 pyopencl/array.py   | 27 ++++++++++++++++-----------
 test/test_array.py  | 26 ++++++++++++++++++--------
 test/test_clmath.py |  7 +++++--
 3 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 90e6aa57..a1cbb424 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -624,7 +624,7 @@ class Array(object):
             event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
                     device_offset=self.offset,
                     is_blocking=not async_)
-            if not async_: # not already waited for
+            if not async_:  # not already waited for
                 self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
@@ -1293,7 +1293,8 @@ class Array(object):
         krnl = get_any_kernel(self.context, self.dtype)
         if wait_for is None:
             wait_for = []
-        result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True)
+        result, event1 = krnl(self, queue=queue,
+               wait_for=wait_for + self.events, return_event=True)
         result.add_event(event1)
         return result
 
@@ -1302,7 +1303,8 @@ class Array(object):
         krnl = get_all_kernel(self.context, self.dtype)
         if wait_for is None:
             wait_for = []
-        result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True)
+        result, event1 = krnl(self, queue=queue,
+               wait_for=wait_for + self.events, return_event=True)
         result.add_event(event1)
         return result
 
@@ -1686,7 +1688,7 @@ class Array(object):
         if flags is None:
             flags = cl.map_flags.READ | cl.map_flags.WRITE
         if wait_for is None:
-            wait_for=[]
+            wait_for = []
 
         ary, evt = cl.enqueue_map_buffer(
                 queue or self.queue, self.base_data, flags, self.offset,
@@ -2274,7 +2276,6 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
     if wait_for is None:
         wait_for = []
     wait_for = wait_for + dest_indices.events
-    
 
     vec_count = len(arrays)
 
@@ -2535,7 +2536,8 @@ def sum(a, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_sum_kernel
     krnl = get_sum_kernel(a.context, dtype, a.dtype)
-    result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, return_event=True)
+    result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events,
+            return_event=True)
     result.add_event(event1)
     return result
 
@@ -2546,7 +2548,8 @@ def dot(a, b, dtype=None, queue=None, slice=None):
     """
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype)
-    result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True)
+    result, event1 = krnl(a, b, queue=queue, slice=slice,
+            wait_for=a.events + b.events, return_event=True)
     result.add_event(event1)
     return result
 
@@ -2559,7 +2562,8 @@ def vdot(a, b, dtype=None, queue=None, slice=None):
     from pyopencl.reduction import get_dot_kernel
     krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype,
             conjugate_first=True)
-    result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True)
+    result, event1 = krnl(a, b, queue=queue, slice=slice,
+            wait_for=a.events + b.events, return_event=True)
     result.add_event(event1)
     return result
 
@@ -2572,7 +2576,7 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None):
     krnl = get_subset_dot_kernel(
             a.context, dtype, subset.dtype, a.dtype, b.dtype)
     result, event1 = krnl(subset, a, b, queue=queue, slice=slice,
-        wait_for=subset.events + a.events + b.events, return_event=True)
+            wait_for=subset.events + a.events + b.events, return_event=True)
     result.add_event(event1)
     return result
 
@@ -2581,7 +2585,8 @@ def _make_minmax_kernel(what):
     def f(a, queue=None):
         from pyopencl.reduction import get_minmax_kernel
         krnl = get_minmax_kernel(a.context, what, a.dtype)
-        result, event1 = krnl(a,  queue=queue, wait_for=a.events, return_event=True)
+        result, event1 = krnl(a, queue=queue, wait_for=a.events,
+                return_event=True)
         result.add_event(event1)
         return result
 
@@ -2604,7 +2609,7 @@ def _make_subset_minmax_kernel(what):
         from pyopencl.reduction import get_subset_minmax_kernel
         krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype)
         result, event1 = krnl(subset, a,  queue=queue, slice=slice,
-            wait_for=a.events + subset.events, return_event=True)
+                wait_for=a.events + subset.events, return_event=True)
         result.add_event(event1)
         return result
     return f
diff --git a/test/test_array.py b/test/test_array.py
index c9771ac0..05008c16 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1212,39 +1212,45 @@ def test_multi_put(ctx_factory):
 
     assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
+
 def test_outoforderqueue_get(ctx_factory):
     context = ctx_factory()
     try:
-        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
     a = np.random.rand(10**6).astype(np.dtype('float32'))
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = a_gpu + a_gpu**5 + 1
-    b1 = b_gpu.get() # testing that this waits for events
+    b1 = b_gpu.get()  # testing that this waits for events
     b = a + a**5 + 1
     assert np.abs(b1 - b).mean() < 1e-5
 
+
 def test_outoforderqueue_copy(ctx_factory):
     context = ctx_factory()
     try:
-        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
     a = np.random.rand(10**6).astype(np.dtype('float32'))
     a_gpu = cl_array.to_device(queue, a)
     c_gpu = a_gpu**2 - 7
-    b_gpu = c_gpu.copy() # testing that this waits for and creates events
+    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
     b_gpu *= 10
     queue.finish()
     b1 = b_gpu.get()
     b = 10 * (a**2 - 7)
     assert np.abs(b1 - b).mean() < 1e-5
 
+
 def test_outoforderqueue_indexing(ctx_factory):
     context = ctx_factory()
     try:
-        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
     a = np.random.rand(10**6).astype(np.dtype('float32'))
@@ -1262,20 +1268,24 @@ def test_outoforderqueue_indexing(ctx_factory):
     b[i + 10000] = c - 10
     assert np.abs(b1 - b).mean() < 1e-5
 
+
 def test_outoforderqueue_reductions(ctx_factory):
     context = ctx_factory()
     try:
-        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
-    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) # 0/1 values to avoid accumulated rounding error
-    a[800000] = 10 # all<5 looks true until near the end
+    # 0/1 values to avoid accumulated rounding error
+    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
+    a[800000] = 10  # all<5 looks true until near the end
     a_gpu = cl_array.to_device(queue, a)
     b1 = cl_array.sum(a_gpu).get()
     b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
     b3 = (a_gpu < 5).all().get()
     assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
 
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
diff --git a/test/test_clmath.py b/test/test_clmath.py
index f4a55936..beebc2a8 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -446,15 +446,18 @@ def test_hankel_01_complex(ctx_factory, ref_src):
         pt.loglog(np.abs(z), rel_err_h1)
         pt.show()
 
+
 def test_outoforderqueue_clmath(ctx_factory):
     context = ctx_factory()
     try:
-        queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
+        queue = cl.CommandQueue(context,
+               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
     except Exception:
         pytest.skip("out-of-order queue not available")
     a = np.random.rand(10**6).astype(np.dtype('float32'))
     a_gpu = cl_array.to_device(queue, a)
-    b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) # testing that clmath functions wait for and create events
+    # testing that clmath functions wait for and create events
+    b_gpu = clmath.fabs(clmath.sin(a_gpu * 5))
     queue.finish()
     b1 = b_gpu.get()
     b = np.abs(np.sin(a * 5))
-- 
GitLab


From 0882d910e3ae9996f96f946afb28d90efefef817 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Tue, 7 Aug 2018 07:56:43 +0100
Subject: [PATCH 7/8] tests: work around pocl not having clEnqueueWaitForEvents

---
 test/test_algorithm.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index b7b296ce..5264767c 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -961,8 +961,14 @@ def test_bitonic_sort(ctx_factory, size, dtype):
     from pyopencl.bitonic_sort import BitonicSort
 
     s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
+    sgs = s.copy()
+    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
+    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
+    if (dev.platform.name == "Portable Computing Language"
+            and cl.get_cl_header_version() < (1, 2)):
+        sgs.finish()
     sorter = BitonicSort(ctx)
-    sgs, evt = sorter(s.copy(), axis=1)
+    sgs, evt = sorter(sgs, axis=1)
     assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
 
 
@@ -1014,7 +1020,14 @@ def test_bitonic_argsort(ctx_factory, size, dtype):
 
     sorterm = BitonicSort(ctx)
 
-    ms, evt = sorterm(m.copy(), idx=index, axis=0)
+    ms = m.copy()
+    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
+    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
+    if (dev.platform.name == "Portable Computing Language"
+            and cl.get_cl_header_version() < (1, 2)):
+        ms.finish()
+        index.finish()
+    ms, evt = sorterm(ms, idx=index, axis=0)
 
     assert np.array_equal(np.sort(m.get()), ms.get())
 
-- 
GitLab


From 064f9be6103ab285e807cd1b774ce0da9aeca9e3 Mon Sep 17 00:00:00 2001
From: "Rebecca N. Palmer" <rebecca_palmer@zoho.com>
Date: Tue, 7 Aug 2018 22:20:14 +0100
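Subject: [PATCH 8/8] fix bug in async Array.set

The event returned by enqueue_copy was only added to self.events when
async_ was false, i.e. only for blocking transfers. Add it
unconditionally so that an asynchronous set() is tracked in self.events
as well.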
---
 pyopencl/array.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index a1cbb424..704c495b 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -624,8 +624,7 @@ class Array(object):
             event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
                     device_offset=self.offset,
                     is_blocking=not async_)
-            if not async_:  # not already waited for
-                self.add_event(event1)
+            self.add_event(event1)
 
     def get(self, queue=None, ary=None, async_=None, **kwargs):
         """Transfer the contents of *self* into *ary* or a newly allocated
-- 
GitLab