diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e37c40b503fa24ba47d8a4f1db3bbf64fe3747cb..0047755fcbc3ec26d86b3c24075354534c00ef13 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -166,6 +166,7 @@ Python 2.7 Apple: Python 3 Conda Apple: script: - CONDA_ENVIRONMENT=.test-conda-env-py3.yml + - export CC=gcc - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh - ". ./build-and-test-py-project-within-miniconda.sh" tags: diff --git a/pyopencl/array.py b/pyopencl/array.py index 2d03207962b5fb1c6f5c9c4edb48ad1d3478e231..704c495b4bff7ceed3d4808244a8bdb844f267b5 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -621,9 +621,10 @@ class Array(object): stacklevel=2) if self.size: - cl.enqueue_copy(queue or self.queue, self.base_data, ary, + event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, device_offset=self.offset, is_blocking=not async_) + self.add_event(event1) def get(self, queue=None, ary=None, async_=None, **kwargs): """Transfer the contents of *self* into *ary* or a newly allocated @@ -687,7 +688,7 @@ class Array(object): if self.size: cl.enqueue_copy(queue, ary, self.base_data, device_offset=self.offset, - is_blocking=not async_) + wait_for=self.events, is_blocking=not async_) return ary @@ -712,9 +713,11 @@ class Array(object): result = result.with_queue(queue) if self.nbytes: - cl.enqueue_copy(queue or self.queue, + event1 = cl.enqueue_copy(queue or self.queue, result.base_data, self.base_data, - src_offset=self.offset, byte_count=self.nbytes) + src_offset=self.offset, byte_count=self.nbytes, + wait_for=self.events) + result.add_event(event1) return result @@ -1287,12 +1290,22 @@ class Array(object): def any(self, queue=None, wait_for=None): from pyopencl.reduction import get_any_kernel krnl = get_any_kernel(self.context, self.dtype) - return krnl(self, queue=queue, wait_for=wait_for) + if wait_for is None: + wait_for = [] + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) + result.add_event(event1) + return result def all(self, queue=None, wait_for=None): from pyopencl.reduction import get_all_kernel krnl = get_all_kernel(self.context, self.dtype) - return krnl(self, queue=queue, wait_for=wait_for) + if wait_for is None: + wait_for = [] + result, event1 = krnl(self, queue=queue, + wait_for=wait_for + self.events, return_event=True) + result.add_event(event1) + return result @staticmethod @elwise_kernel_runner @@ -1673,11 +1686,13 @@ class Array(object): if flags is None: flags = cl.map_flags.READ | cl.map_flags.WRITE + if wait_for is None: + wait_for = [] ary, evt = cl.enqueue_map_buffer( queue or self.queue, self.base_data, flags, self.offset, - self.shape, self.dtype, strides=self.strides, wait_for=wait_for, - is_blocking=is_blocking) + self.shape, self.dtype, strides=self.strides, + wait_for=wait_for + self.events, is_blocking=is_blocking) if is_blocking: return ary @@ -1796,6 +1811,9 @@ class Array(object): """ queue = queue or self.queue or value.queue + if wait_for is None: + wait_for = [] + wait_for = wait_for + self.events if isinstance(subscript, Array): if subscript.dtype.kind != "i": @@ -2145,11 +2163,16 @@ def multi_take(arrays, indices, out=None, queue=None): cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - knl(queue, gs, ls, + wait_for_this = (indices.events + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) + evt = knl(queue, gs, ls, indices.data, *([o.data for o in out[chunk_slice]] + [i.data for i in arrays[chunk_slice]] - + [indices.size])) + + [indices.size]), wait_for=wait_for_this) + for o in out[chunk_slice]: + o.add_event(evt) return out @@ -2219,7 +2242,10 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, queue.device)) from pytools import flatten - knl(queue, gs, ls, + wait_for_this = (dest_indices.events + src_indices.events + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) + evt = knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] + [dest_indices.base_data, dest_indices.offset, @@ -2229,7 +2255,9 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, (i.base_data, i.offset) for i in arrays[chunk_slice])) + src_offsets_list[chunk_slice] - + [src_indices.size])) + + [src_indices.size]), wait_for=wait_for_this) + for o in out[chunk_slice]: + o.add_event(evt) return out @@ -2244,6 +2272,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, a_allocator = arrays[0].allocator context = dest_indices.context queue = queue or dest_indices.queue + if wait_for is None: + wait_for = [] + wait_for = wait_for + dest_indices.events vec_count = len(arrays) @@ -2295,6 +2326,9 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, queue.device)) from pytools import flatten + wait_for_this = (wait_for + + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( list(flatten( @@ -2307,9 +2341,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, + [use_fill_cla.base_data, use_fill_cla.offset] + [array_lengths_cla.base_data, array_lengths_cla.offset] + [dest_indices.size]), - **dict(wait_for=wait_for)) - - # FIXME should wait on incoming events + **dict(wait_for=wait_for_this)) for o in out[chunk_slice]: o.add_event(evt) @@ -2387,7 +2419,8 @@ def diff(array, queue=None, allocator=None): allocator = allocator or array.allocator result = empty(queue, (n-1,), array.dtype, allocator=allocator) - _diff(result, array, queue=queue) + event1 = _diff(result, array, queue=queue) + result.add_event(event1) return result @@ -2468,7 +2501,8 @@ def if_positive(criterion, then_, else_, out=None, queue=None): if out is None: out = empty_like(then_) - _if_positive(out, criterion, then_, else_, queue=queue) + event1 = _if_positive(out, criterion, then_, else_, queue=queue) + out.add_event(event1) return out @@ -2501,7 +2535,10 @@ def sum(a, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) - return krnl(a, queue=queue, slice=slice) + result, event1 = krnl(a, queue=queue, slice=slice, wait_for=a.events, + return_event=True) + result.add_event(event1) + return result def dot(a, b, dtype=None, queue=None, slice=None): @@ -2510,7 +2547,10 @@ def dot(a, b, dtype=None, queue=None, slice=None): """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) - return krnl(a, b, queue=queue, slice=slice) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) + result.add_event(event1) + return result def vdot(a, b, dtype=None, queue=None, slice=None): @@ -2521,7 +2561,10 @@ def vdot(a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) - return krnl(a, b, queue=queue, slice=slice) + result, event1 = krnl(a, b, queue=queue, slice=slice, + wait_for=a.events + b.events, return_event=True) + result.add_event(event1) + return result def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): @@ -2531,14 +2574,20 @@ def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): from pyopencl.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) - return krnl(subset, a, b, queue=queue, slice=slice) + result, event1 = krnl(subset, a, b, queue=queue, slice=slice, + wait_for=subset.events + a.events + b.events, return_event=True) + result.add_event(event1) + return result def _make_minmax_kernel(what): def f(a, queue=None): from pyopencl.reduction import get_minmax_kernel krnl = get_minmax_kernel(a.context, what, a.dtype) - return krnl(a, queue=queue) + result, event1 = krnl(a, queue=queue, wait_for=a.events, + return_event=True) + result.add_event(event1) + return result return f @@ -2558,8 +2607,10 @@ def _make_subset_minmax_kernel(what): def f(subset, a, queue=None, slice=None): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) - return krnl(subset, a, queue=queue, slice=slice) - + result, event1 = krnl(subset, a, queue=queue, slice=slice, + wait_for=a.events + subset.events, return_event=True) + result.add_event(event1) + return result return f @@ -2583,12 +2634,15 @@ def cumsum(a, output_dtype=None, queue=None, if output_dtype is None: output_dtype = a.dtype + if wait_for is None: + wait_for = [] result = a._new_like_me(output_dtype) from pyopencl.scan import get_cumsum_kernel krnl = get_cumsum_kernel(a.context, a.dtype, output_dtype) - evt = krnl(a, result, queue=queue, wait_for=wait_for) + evt = krnl(a, result, queue=queue, wait_for=wait_for + a.events) + result.add_event(evt) if return_event: return evt, result diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py index 73d390cf850e9a054541209069c322e67adf402a..6d45fe44ceb02fa13c7c6fb4563045e3aecdd87c 100644 --- a/pyopencl/clmath.py +++ b/pyopencl/clmath.py @@ -41,7 +41,8 @@ def _make_unary_array_func(name): def f(array, queue=None): result = array._new_like_me(queue=queue) - knl_runner(result, array, queue=queue) + event1 = knl_runner(result, array, queue=queue) + result.add_event(event1) return result return f @@ -78,7 +79,7 @@ def atan2(y, x, queue=None): """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) - _atan2(result, y, x, queue=queue) + result.add_event(_atan2(result, y, x, queue=queue)) return result @@ -92,7 +93,7 @@ def atan2pi(y, x, queue=None): """ queue = queue or y.queue result = y._new_like_me(_get_common_dtype(y, x, queue)) - _atan2pi(result, y, x, queue=queue) + result.add_event(_atan2pi(result, y, x, queue=queue)) return result @@ -130,7 +131,7 @@ def fmod(arg, mod, queue=None): for each element in `arg` and `mod`.""" queue = (queue or arg.queue) or mod.queue result = arg._new_like_me(_get_common_dtype(arg, mod, queue)) - _fmod(result, arg, mod, queue=queue) + result.add_event(_fmod(result, arg, mod, queue=queue)) return result # TODO: fract @@ -148,7 +149,9 @@ def frexp(arg, queue=None): """ sig = arg._new_like_me(queue=queue) expt = arg._new_like_me(queue=queue, dtype=np.int32) - _frexp(sig, expt, arg, queue=queue) + event1 = _frexp(sig, expt, arg, queue=queue) + sig.add_event(event1) + expt.add_event(event1) return sig, expt # TODO: hypot @@ -169,7 +172,7 @@ def ldexp(significand, exponent, queue=None): `result = significand * 2**exponent`. """ result = significand._new_like_me(queue=queue) - _ldexp(result, significand, exponent) + result.add_event(_ldexp(result, significand, exponent)) return result @@ -199,7 +202,9 @@ def modf(arg, queue=None): """ intpart = arg._new_like_me(queue=queue) fracpart = arg._new_like_me(queue=queue) - _modf(intpart, fracpart, arg, queue=queue) + event1 = _modf(intpart, fracpart, arg, queue=queue) + fracpart.add_event(event1) + intpart.add_event(event1) return fracpart, intpart @@ -254,18 +259,20 @@ def _hankel_01(h0, h1, x): def bessel_jn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_jn(result, n, x, queue=queue) + result.add_event(_bessel_jn(result, n, x, queue=queue)) return result def bessel_yn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_yn(result, n, x, queue=queue) + result.add_event(_bessel_yn(result, n, x, queue=queue)) return result def hankel_01(x, queue=None): h0 = x._new_like_me(queue=queue) h1 = x._new_like_me(queue=queue) - _hankel_01(h0, h1, x, queue=queue) + event1 = _hankel_01(h0, h1, x, queue=queue) + h0.add_event(event1) + h1.add_event(event1) return h0, h1 diff --git a/test/test_algorithm.py b/test/test_algorithm.py index b7b296ce959f2f9d92c290196e0a8f28b6043dc7..5264767c4094806fe43fdfea6056237aed20ade4 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -961,8 +961,14 @@ def test_bitonic_sort(ctx_factory, size, dtype): from pyopencl.bitonic_sort import BitonicSort s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333) + sgs = s.copy() + # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for + # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 + if (dev.platform.name == "Portable Computing Language" + and cl.get_cl_header_version() < (1, 2)): + sgs.finish() sorter = BitonicSort(ctx) - sgs, evt = sorter(s.copy(), axis=1) + sgs, evt = sorter(sgs, axis=1) assert np.array_equal(np.sort(s.get(), axis=1), sgs.get()) @@ -1014,7 +1020,14 @@ def test_bitonic_argsort(ctx_factory, size, dtype): sorterm = BitonicSort(ctx) - ms, evt = sorterm(m.copy(), idx=index, axis=0) + ms = m.copy() + # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for + # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 + if (dev.platform.name == "Portable Computing Language" + and cl.get_cl_header_version() < (1, 2)): + ms.finish() + index.finish() + ms, evt = sorterm(ms, idx=index, axis=0) assert np.array_equal(np.sort(m.get()), ms.get()) diff --git a/test/test_array.py b/test/test_array.py index bca78f5ccbeebf1d86f9ec03a25ded6050bc4097..05008c169ae782a49b5b985c7a79780e337c5770 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1213,6 +1213,79 @@ def test_multi_put(ctx_factory): assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) +def test_outoforderqueue_get(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + b_gpu = a_gpu + a_gpu**5 + 1 + b1 = b_gpu.get() # testing that this waits for events + b = a + a**5 + 1 + assert np.abs(b1 - b).mean() < 1e-5 + + +def test_outoforderqueue_copy(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + c_gpu = a_gpu**2 - 7 + b_gpu = c_gpu.copy() # testing that this waits for and creates events + b_gpu *= 10 + queue.finish() + b1 = b_gpu.get() + b = 10 * (a**2 - 7) + assert np.abs(b1 - b).mean() < 1e-5 + + +def test_outoforderqueue_indexing(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32')) + a_gpu = cl_array.to_device(queue, a) + i_gpu = cl_array.to_device(queue, i) + c_gpu = (a_gpu**2)[i_gpu - 10000] + b_gpu = 10 - a_gpu + b_gpu[:] = 8 * a_gpu + b_gpu[i_gpu + 10000] = c_gpu - 10 + queue.finish() + b1 = b_gpu.get() + c = (a**2)[i - 10000] + b = 8 * a + b[i + 10000] = c - 10 + assert np.abs(b1 - b).mean() < 1e-5 + + +def test_outoforderqueue_reductions(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + # 0/1 values to avoid accumulated rounding error + a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) + a[800000] = 10 # all<5 looks true until near the end + a_gpu = cl_array.to_device(queue, a) + b1 = cl_array.sum(a_gpu).get() + b2 = cl_array.dot(a_gpu, 3 - a_gpu).get() + b3 = (a_gpu < 5).all().get() + assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 + + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. diff --git a/test/test_clmath.py b/test/test_clmath.py index 553ed7a6f6ad4f93abde9e46b0bc5fb60a9066f0..beebc2a8c0ad717e7139c340bff14a07fb77b60c 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -447,6 +447,23 @@ def test_hankel_01_complex(ctx_factory, ref_src): pt.show() +def test_outoforderqueue_clmath(ctx_factory): + context = ctx_factory() + try: + queue = cl.CommandQueue(context, + properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) + except Exception: + pytest.skip("out-of-order queue not available") + a = np.random.rand(10**6).astype(np.dtype('float32')) + a_gpu = cl_array.to_device(queue, a) + # testing that clmath functions wait for and create events + b_gpu = clmath.fabs(clmath.sin(a_gpu * 5)) + queue.finish() + b1 = b_gpu.get() + b = np.abs(np.sin(a * 5)) + assert np.abs(b1 - b).mean() < 1e-5 + + if __name__ == "__main__": import sys if len(sys.argv) > 1: