From 7eaf1041c4d7b053e3ebdcb80d2a4dce7dffa756 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sat, 4 Aug 2018 10:14:02 +0100 Subject: [PATCH 1/8] Make Array.get/set/copy wait for / append to self.events --- pyopencl/array.py | 12 ++++++++---- test/test_array.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 2d032079..d7ef138b 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -621,9 +621,11 @@ class Array(object): stacklevel=2) if self.size: - cl.enqueue_copy(queue or self.queue, self.base_data, ary, + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async_) + if not async_: # not already waited for + self.add_event(event1) def get(self, queue=None, ary=None, async_=None, **kwargs): """Transfer the contents of *self* into *ary* or a newly allocated @@ -687,7 +689,7 @@ class Array(object): if self.size: cl.enqueue_copy(queue, ary, self.base_data, device_offset=self.offset, - is_blocking=not async_) + wait_for=self.events, is_blocking=not async_) return ary @@ -712,9 +714,11 @@ class Array(object): result = result.with_queue(queue) if self.nbytes: - cl.enqueue_copy(queue or self.queue, + event1 = cl.enqueue_copy(queue or self.queue, result.base_data, self.base_data, - src_offset=self.offset, byte_count=self.nbytes) + src_offset=self.offset, byte_count=self.nbytes, + wait_for=self.events) + result.add_event(event1) return result diff --git a/test/test_array.py b/test/test_array.py index bca78f5c..fdfcfce3 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1212,6 +1212,34 @@ def test_multi_put(ctx_factory): assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) +def test_outoforderqueue_get(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + b_gpu = a_gpu + a_gpu**5 + 1 + b1 = b_gpu.get() # testing that this waits for events + b = a + a**5 + 1 + assert np.abs(b1 - b).mean() < 1e-5 + +def test_outoforderqueue_copy(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + c_gpu = a_gpu**2 - 7 + b_gpu = c_gpu.copy() # testing that this waits for and creates events + b_gpu *= 10 + queue.finish() + b1 = b_gpu.get() + b = 10 * (a**2 - 7) + assert np.abs(b1 - b).mean() < 1e-5 if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the -- GitLab From da8cc309c973ef9572f8ac8b4660720f2faf205b Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sat, 4 Aug 2018 10:16:27 +0100 Subject: [PATCH 2/8] Make clmath functions append their Event to result.events --- pyopencl/clmath.py | 27 +++++++++++++++++---------- test/test_clmath.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py index 73d390cf..6d45fe44 100644 --- a/pyopencl/clmath.py +++ b/pyopencl/clmath.py @@ -41,7 +41,8 @@ def _make_unary_array_func(name): def f(array, queue=None): result = array._new_like_me(queue=queue) - knl_runner(result, array, queue=queue) + event1 = knl_runner(result, array, queue=queue) + result.add_event(event1) return result return f @@ -78,7 +79,7 @@ def atan2(y, x, queue=None): """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) - _atan2(result, y, x, queue=queue) + result.add_event(_atan2(result, y, x, queue=queue)) return result @@ -92,7 +93,7 @@ def atan2pi(y, x, queue=None): """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) - _atan2pi(result, y, x, queue=queue) + result.add_event(_atan2pi(result, y, x, queue=queue)) return result @@ -130,7 +131,7 @@ def fmod(arg, mod, queue=None): for each element in `arg` and `mod`.""" queue = (queue or arg.queue) or mod.queue result = arg._new_like_me(_get_common_dtype(arg, mod, queue)) - _fmod(result, arg, mod, queue=queue) + result.add_event(_fmod(result, arg, mod, queue=queue)) return result # TODO: fract @@ -148,7 +149,9 @@ def frexp(arg, queue=None): """ sig = arg._new_like_me(queue=queue) expt = arg._new_like_me(queue=queue, dtype=np.int32) - _frexp(sig, expt, arg, queue=queue) + event1 = _frexp(sig, expt, arg, queue=queue) + sig.add_event(event1) + expt.add_event(event1) return sig, expt # TODO: hypot @@ -169,7 +172,7 @@ def ldexp(significand, exponent, queue=None): `result = significand * 2**exponent`. """ result = significand._new_like_me(queue=queue) - _ldexp(result, significand, exponent) + result.add_event(_ldexp(result, significand, exponent)) return result @@ -199,7 +202,9 @@ def modf(arg, queue=None): """ intpart = arg._new_like_me(queue=queue) fracpart = arg._new_like_me(queue=queue) - _modf(intpart, fracpart, arg, queue=queue) + event1 = _modf(intpart, fracpart, arg, queue=queue) + fracpart.add_event(event1) + intpart.add_event(event1) return fracpart, intpart @@ -254,18 +259,20 @@ def _hankel_01(h0, h1, x): def bessel_jn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_jn(result, n, x, queue=queue) + result.add_event(_bessel_jn(result, n, x, queue=queue)) return result def bessel_yn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_yn(result, n, x, queue=queue) + result.add_event(_bessel_yn(result, n, x, queue=queue)) return result def hankel_01(x, queue=None): h0 = x._new_like_me(queue=queue) h1 = x._new_like_me(queue=queue) - _hankel_01(h0, h1, x, queue=queue) + event1 = _hankel_01(h0, h1, x, queue=queue) + h0.add_event(event1) + h1.add_event(event1) return h0, h1 diff --git a/test/test_clmath.py b/test/test_clmath.py index 553ed7a6..f4a55936 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -446,6 +446,20 @@ def test_hankel_01_complex(ctx_factory, ref_src): pt.loglog(np.abs(z), rel_err_h1) pt.show() +def test_outoforderqueue_clmath(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) # testing that clmath functions wait for and create events + queue.finish() + b1 = b_gpu.get() + b = np.abs(np.sin(a * 5)) + assert np.abs(b1 - b).mean() < 1e-5 + if __name__ == "__main__": import sys -- GitLab From e723b2e7260b7133673bca85abe7e64e3f0a3478 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Sun, 5 Aug 2018 14:32:09 +0100 Subject: [PATCH 3/8] Make various Array functions wait for / append to self.events (setitem, diff, if_positive, cumsum, all the reductions (sum/all/etc)) --- pyopencl/array.py | 86 +++++++++++++++++++++++++++++++++++----------- test/test_array.py | 35 +++++++++++++++++++ 2 files changed, 101 insertions(+), 20 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index d7ef138b..b904c7de 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -1291,12 +1291,20 @@ class Array(object): def any(self, queue=None, wait_for=None): from pyopencl.reduction import get_any_kernel krnl = get_any_kernel(self.context, self.dtype) - return krnl(self, queue=queue, wait_for=wait_for) + if wait_for is None: + wait_for = [] + result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result.add_event(event1) + return result def all(self, queue=None, wait_for=None): from pyopencl.reduction import get_all_kernel krnl = get_all_kernel(self.context, self.dtype) - return krnl(self, queue=queue, wait_for=wait_for) + if wait_for is None: + wait_for = [] + result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result.add_event(event1) + return result @staticmethod @elwise_kernel_runner @@ -1677,11 +1685,13 @@ class Array(object): if flags is None: flags = cl.map_flags.READ | cl.map_flags.WRITE + if wait_for is None: + wait_for=[] ary, evt = cl.enqueue_map_buffer( queue or self.queue, self.base_data, flags, self.offset, - self.shape, self.dtype, strides=self.strides, wait_for=wait_for, - is_blocking=is_blocking) + self.shape, self.dtype, strides=self.strides, + wait_for=wait_for + self.events, is_blocking=is_blocking) if is_blocking: return ary @@ -1800,6 +1810,9 @@ class Array(object): """ queue = queue or self.queue or value.queue + if wait_for is None: + wait_for = [] + wait_for = wait_for + self.events if isinstance(subscript, Array): if subscript.dtype.kind != "i": @@ -2149,11 +2162,16 @@ def multi_take(arrays, indices, out=None, queue=None): cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - knl(queue, gs, ls, + wait_for_this = (indices.events + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) + evt = knl(queue, gs, ls, indices.data, *([o.data for o in out[chunk_slice]] + [i.data for i in arrays[chunk_slice]] - + [indices.size])) + + [indices.size]), wait_for=wait_for_this) + for o in out[chunk_slice]: + o.add_event(evt) return out @@ -2223,7 +2241,10 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, queue.device)) from pytools import flatten - knl(queue, gs, ls, + wait_for_this = (dest_indices.events + src_indices.events + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) + evt = knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] + [dest_indices.base_data, dest_indices.offset, @@ -2234,6 +2255,8 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, for i in arrays[chunk_slice])) + src_offsets_list[chunk_slice] + [src_indices.size])) + for o in out[chunk_slice]: + o.add_event(evt) return out @@ -2248,6 +2271,10 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, a_allocator = arrays[0].allocator context = dest_indices.context queue = queue or dest_indices.queue + if wait_for is None: + wait_for = [] + wait_for = wait_for + dest_indices.events + vec_count = len(arrays) @@ -2299,6 +2326,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, queue.device)) from pytools import flatten + wait_for_this = (wait_for + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( list(flatten( @@ -2311,9 +2341,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, + [use_fill_cla.base_data, use_fill_cla.offset] + [array_lengths_cla.base_data, array_lengths_cla.offset] + [dest_indices.size]), - **dict(wait_for=wait_for)) - - # FIXME should wait on incoming events + **dict(wait_for=wait_for_this)) for o in out[chunk_slice]: o.add_event(evt) @@ -2391,7 +2419,8 @@ def diff(array, queue=None, allocator=None): allocator = allocator or array.allocator result = empty(queue, (n-1,), array.dtype, allocator=allocator) - _diff(result, array, queue=queue) + event1 = _diff(result, array, queue=queue) + result.add_event(event1) return result @@ -2472,7 +2501,8 @@ def if_positive(criterion, then_, else_, out=None, queue=None): if out is None: out = empty_like(then_) - _if_positive(out, criterion, then_, else_, queue=queue) + event1 = _if_positive(out, criterion, then_, else_, queue=queue) + out.add_event(event1) return out @@ -2505,7 +2535,9 @@ def sum(a, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) - return krnl(a, queue=queue, slice=slice) + result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, return_event=True) + result.add_event(event1) + return result def dot(a, b, dtype=None, queue=None, slice=None): @@ -2514,7 +2546,9 @@ def dot(a, b, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) - return krnl(a, b, queue=queue, slice=slice) + result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result.add_event(event1) + return result def vdot(a, b, dtype=None, queue=None, slice=None): @@ -2525,7 +2559,9 @@ def vdot(a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) - return krnl(a, b, queue=queue, slice=slice) + result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result.add_event(event1) + return result def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): @@ -2535,14 +2571,19 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) - return krnl(subset, a, b, queue=queue, slice=slice) + result, event1 = krnl(subset, a, b, queue=queue, slice=slice, + wait_for=subset.events + a.events + b.events, return_event=True) + result.add_event(event1) + return result def _make_minmax_kernel(what): def f(a, queue=None): from pyopencl.reduction import get_minmax_kernel krnl = get_minmax_kernel(a.context, what, a.dtype) - return krnl(a, queue=queue) + result, event1 = krnl(a, queue=queue, wait_for=a.events, return_event=True) + result.add_event(event1) + return result return f @@ -2562,8 +2603,10 @@ def _make_subset_minmax_kernel(what): def f(subset, a, queue=None, slice=None): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) - return krnl(subset, a, queue=queue, slice=slice) - + result, event1 = krnl(subset, a, queue=queue, slice=slice, + wait_for=a.events + subset.events, return_event=True) + result.add_event(event1) + return result return f @@ -2587,12 +2630,15 @@ def cumsum(a, output_dtype=None, queue=None, if output_dtype is None: output_dtype = a.dtype + if wait_for is None: + wait_for = [] result = a._new_like_me(output_dtype) from pyopencl.scan import get_cumsum_kernel krnl = get_cumsum_kernel(a.context, a.dtype, output_dtype) - evt = krnl(a, result, queue=queue, wait_for=wait_for) + evt = krnl(a, result, queue=queue, wait_for=wait_for + a.events) + result.add_event(evt) if return_event: return evt, result diff --git a/test/test_array.py b/test/test_array.py index fdfcfce3..c9771ac0 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1241,6 +1241,41 @@ def test_outoforderqueue_copy(ctx_factory): b = 10 * (a**2 - 7) assert np.abs(b1 - b).mean() < 1e-5 +def test_outoforderqueue_indexing(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32')) + a_gpu = cl_array.to_device(queue, a) + i_gpu = cl_array.to_device(queue, i) + c_gpu = (a_gpu**2)[i_gpu - 10000] + b_gpu = 10 - a_gpu + b_gpu[:] = 8 * a_gpu + b_gpu[i_gpu + 10000] = c_gpu - 10 + queue.finish() + b1 = b_gpu.get() + c = (a**2)[i - 10000] + b = 8 * a + b[i + 10000] = c - 10 + assert np.abs(b1 - b).mean() < 1e-5 + +def test_outoforderqueue_reductions(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) # 0/1 values to avoid accumulated rounding error + a[800000] = 10 # all<5 looks true until near the end + a_gpu = cl_array.to_device(queue, a) + b1 = cl_array.sum(a_gpu).get() + b2 = cl_array.dot(a_gpu, 3 - a_gpu).get() + b3 = (a_gpu < 5).all().get() + assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. -- GitLab From b5e1dc63308183e1462762c471f53a4467369c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Sun, 5 Aug 2018 19:29:55 -0400 Subject: [PATCH 4/8] Force use of gcc for Conda Apple CI --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e37c40b5..0047755f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -166,6 +166,7 @@ Python 2.7 Apple: Python 3 Conda Apple: script: - CONDA_ENVIRONMENT=.test-conda-env-py3.yml + - export CC=gcc - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: -- GitLab From faff864aaf0d7ecd6de62350443c066b71040c64 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Mon, 6 Aug 2018 21:45:00 +0100 Subject: [PATCH 5/8] actually wait for wait_for_this --- pyopencl/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index b904c7de..90e6aa57 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -2254,7 +2254,7 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, (i.base_data, i.offset) for i in arrays[chunk_slice])) + src_offsets_list[chunk_slice] - + [src_indices.size])) + + [src_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: o.add_event(evt) -- GitLab From e247a566931d0ead98a44196b40403704aee2a0c Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Mon, 6 Aug 2018 22:00:55 +0100 Subject: [PATCH 6/8] flake8 style cleanup --- pyopencl/array.py | 27 ++++++++++++++++----------- test/test_array.py | 26 ++++++++++++++++++-------- test/test_clmath.py | 7 +++++-- 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 90e6aa57..a1cbb424 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -624,7 +624,7 @@ class Array(object): event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async_) - if not async_: # not already waited for + if not async_: # not already waited for self.add_event(event1) def get(self, queue=None, ary=None, async_=None, **kwargs): @@ -1293,7 +1293,8 @@ class Array(object): krnl = get_any_kernel(self.context, self.dtype) if wait_for is None: wait_for = [] - result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) result.add_event(event1) return result @@ -1302,7 +1303,8 @@ class Array(object): krnl = get_all_kernel(self.context, self.dtype) if wait_for is None: wait_for = [] - result, event1 = krnl(self, queue=queue, wait_for=wait_for + self.events, return_event=True) + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) result.add_event(event1) return result @@ -1686,7 +1688,7 @@ class Array(object): if flags is None: flags = cl.map_flags.READ | cl.map_flags.WRITE if wait_for is None: - wait_for=[] + wait_for = [] ary, evt = cl.enqueue_map_buffer( queue or self.queue, self.base_data, flags, self.offset, @@ -2274,7 +2276,6 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if wait_for is None: wait_for = [] wait_for = wait_for + dest_indices.events - vec_count = len(arrays) @@ -2535,7 +2536,8 @@ def sum(a, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) - result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, return_event=True) + result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, + return_event=True) result.add_event(event1) return result @@ -2546,7 +2548,8 @@ def dot(a, b, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) - result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2559,7 +2562,8 @@ def vdot(a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) - result, event1 = krnl(a, b, queue=queue, slice=slice, wait_for=a.events + b.events, return_event=True) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2572,7 +2576,7 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) result, event1 = krnl(subset, a, b, queue=queue, slice=slice, - wait_for=subset.events + a.events + b.events, return_event=True) + wait_for=subset.events + a.events + b.events, return_event=True) result.add_event(event1) return result @@ -2581,7 +2585,8 @@ def _make_minmax_kernel(what): def f(a, queue=None): from pyopencl.reduction import get_minmax_kernel krnl = get_minmax_kernel(a.context, what, a.dtype) - result, event1 = krnl(a, queue=queue, wait_for=a.events, return_event=True) + result, event1 = krnl(a, queue=queue, wait_for=a.events, + return_event=True) result.add_event(event1) return result @@ -2604,7 +2609,7 @@ def _make_subset_minmax_kernel(what): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) result, event1 = krnl(subset, a, queue=queue, slice=slice, - wait_for=a.events + subset.events, return_event=True) + wait_for=a.events + subset.events, return_event=True) result.add_event(event1) return result return f diff --git a/test/test_array.py b/test/test_array.py index c9771ac0..05008c16 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1212,39 +1212,45 @@ def test_multi_put(ctx_factory): assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) + def test_outoforderqueue_get(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) b_gpu = a_gpu + a_gpu**5 + 1 - b1 = b_gpu.get() # testing that this waits for events + b1 = b_gpu.get() # testing that this waits for events b = a + a**5 + 1 assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_copy(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) c_gpu = a_gpu**2 - 7 - b_gpu = c_gpu.copy() # testing that this waits for and creates events + b_gpu = c_gpu.copy() # testing that this waits for and creates events b_gpu *= 10 queue.finish() b1 = b_gpu.get() b = 10 * (a**2 - 7) assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_indexing(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) @@ -1262,20 +1268,24 @@ def test_outoforderqueue_indexing(ctx_factory): b[i + 10000] = c - 10 assert np.abs(b1 - b).mean() < 1e-5 + def test_outoforderqueue_reductions(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") - a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) # 0/1 values to avoid accumulated rounding error - a[800000] = 10 # all<5 looks true until near the end + # 0/1 values to avoid accumulated rounding error + a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) + a[800000] = 10 # all<5 looks true until near the end a_gpu = cl_array.to_device(queue, a) b1 = cl_array.sum(a_gpu).get() b2 = cl_array.dot(a_gpu, 3 - a_gpu).get() b3 = (a_gpu < 5).all().get() assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. diff --git a/test/test_clmath.py b/test/test_clmath.py index f4a55936..beebc2a8 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -446,15 +446,18 @@ def test_hankel_01_complex(ctx_factory, ref_src): pt.loglog(np.abs(z), rel_err_h1) pt.show() + def test_outoforderqueue_clmath(ctx_factory): context = ctx_factory() try: - queue = cl.CommandQueue(context, properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) except Exception: pytest.skip("out-of-order queue not available") a = np.random.rand(10**6).astype(np.dtype('float32')) a_gpu = cl_array.to_device(queue, a) - b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) # testing that clmath functions wait for and create events + # testing that clmath functions wait for and create events + b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) queue.finish() b1 = b_gpu.get() b = np.abs(np.sin(a * 5)) -- GitLab From 0882d910e3ae9996f96f946afb28d90efefef817 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Tue, 7 Aug 2018 07:56:43 +0100 Subject: [PATCH 7/8] tests: work around pocl not having clEnqueueWaitForEvents --- test/test_algorithm.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/test_algorithm.py b/test/test_algorithm.py index b7b296ce..5264767c 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -961,8 +961,14 @@ def test_bitonic_sort(ctx_factory, size, dtype): from pyopencl.bitonic_sort import BitonicSort s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333) + sgs = s.copy() + # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for + # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 + if (dev.platform.name == "Portable Computing Language" + and cl.get_cl_header_version() < (1, 2)): + sgs.finish() sorter = BitonicSort(ctx) - sgs, evt = sorter(s.copy(), axis=1) + sgs, evt = sorter(sgs, axis=1) assert np.array_equal(np.sort(s.get(), axis=1), sgs.get()) @@ -1014,7 +1020,14 @@ def test_bitonic_argsort(ctx_factory, size, dtype): sorterm = BitonicSort(ctx) - ms, evt = sorterm(m.copy(), idx=index, axis=0) + ms = m.copy() + # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for + # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 + if (dev.platform.name == "Portable Computing Language" + and cl.get_cl_header_version() < (1, 2)): + ms.finish() + index.finish() + ms, evt = sorterm(ms, idx=index, axis=0) assert np.array_equal(np.sort(m.get()), ms.get()) -- GitLab From 064f9be6103ab285e807cd1b774ce0da9aeca9e3 Mon Sep 17 00:00:00 2001 From: "Rebecca N. Palmer" Date: Tue, 7 Aug 2018 22:20:14 +0100 Subject: [PATCH 8/8] fix bug in async Array.set --- pyopencl/array.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index a1cbb424..704c495b 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -624,8 +624,7 @@ class Array(object): event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async_) - if not async_: # not already waited for - self.add_event(event1) + self.add_event(event1) def get(self, queue=None, ary=None, async_=None, **kwargs): """Transfer the contents of *self* into *ary* or a newly allocated -- GitLab