From be9fd38f6fd85651422817e27dd1636788c374ad Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 14:22:41 -0600 Subject: [PATCH 01/47] Create special-case set_arg wrappers for common integer/float types --- src/wrap_cl.hpp | 7 +++++++ src/wrap_cl_part_2.cpp | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index ad5b7ab8..7ffcc983 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4343,6 +4343,13 @@ namespace pyopencl sizeof(cl_mem), &m)); } + template + void set_arg_pod(cl_uint arg_index, T pod_value) + { + PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, + sizeof(T), &pod_value)); + } + void set_arg_mem(cl_uint arg_index, memory_object_holder &moh) { cl_mem m = moh.data(); diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index e68c7851..2c986718 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -467,6 +467,13 @@ void pyopencl_expose_part_2(py::module &m) #endif .def("_set_arg_null", &cls::set_arg_null) .def("_set_arg_buf", &cls::set_arg_buf) + .def("_set_arg_mem", &cls::set_arg_mem) + .def("_set_arg_i32", &cls::set_arg_pod) + .def("_set_arg_i64", &cls::set_arg_pod) + .def("_set_arg_u32", &cls::set_arg_pod) + .def("_set_arg_u64", &cls::set_arg_pod) + .def("_set_arg_f32", &cls::set_arg_pod) + .def("_set_arg_f64", &cls::set_arg_pod) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif -- GitLab From afaad630e905c990413039e5d8490836f5bda3b8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 14:22:51 -0600 Subject: [PATCH 02/47] Bump version --- pyopencl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/version.py b/pyopencl/version.py index fee1f80e..e4c47bc9 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2020, 3, 1) +VERSION = (2021, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -- GitLab From c9f439e053838e9f9bd2ab95afbae399f8e406d3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 15:16:49 -0600 Subject: [PATCH 03/47] Avoid malloc in enqueue_nd_range_kernel --- src/wrap_cl.hpp | 38 +++++++++++++++++++++----------------- src/wrap_helpers.hpp | 12 ++++++++++++ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 7ffcc983..de35513c 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4642,7 +4642,7 @@ namespace pyopencl return result; } - +#define MAX_WS_DIM_COUNT 10 inline event *enqueue_nd_range_kernel( @@ -4659,11 +4659,14 @@ namespace pyopencl cl_uint work_dim = len(py_global_work_size); - std::vector global_work_size; - COPY_PY_LIST(size_t, global_work_size); + std::array global_work_size; + unsigned gws_index = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_index); + + std::array local_work_size; + unsigned lws_index = 0; + size_t *local_work_size_ptr = nullptr; - size_t *local_work_size_ptr = 0; - std::vector local_work_size; if (py_local_work_size.ptr() != Py_None) { if (g_times_l) @@ -4673,31 +4676,32 @@ namespace pyopencl throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); - COPY_PY_LIST(size_t, local_work_size); + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_index); - while (local_work_size.size() < work_dim) - local_work_size.push_back(1); - while (global_work_size.size() < work_dim) - global_work_size.push_back(1); + while (lws_index < work_dim) + local_work_size[lws_index++] = 1; + while (gws_index < work_dim) + global_work_size[gws_index++] = 1; - local_work_size_ptr = local_work_size.empty( ) ? nullptr : &local_work_size.front(); + local_work_size_ptr = &local_work_size.front(); } - if (g_times_l && local_work_size_ptr) + if (g_times_l && lws_index) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; } - size_t *global_work_offset_ptr = 0; - std::vector global_work_offset; + size_t *global_work_offset_ptr = nullptr; + std::array global_work_offset; + unsigned gwo_index = 0; if (py_global_work_offset.ptr() != Py_None) { if (work_dim != unsigned(len(py_global_work_offset))) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); - COPY_PY_LIST(size_t, global_work_offset); + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_index); if (g_times_l && local_work_size_ptr) { @@ -4705,7 +4709,7 @@ namespace pyopencl global_work_offset[work_axis] *= local_work_size[work_axis]; } - global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); + global_work_offset_ptr = &global_work_offset.front(); } if (allow_empty_ndrange) @@ -4742,7 +4746,7 @@ namespace pyopencl knl.data(), work_dim, global_work_offset_ptr, - global_work_size.empty( ) ? nullptr : &global_work_size.front(), + &global_work_size.front(), local_work_size_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp index 4799244f..cabc012d 100644 --- a/src/wrap_helpers.hpp +++ b/src/wrap_helpers.hpp @@ -71,6 +71,18 @@ namespace py = pybind11; NAME.push_back(it.cast()); \ } +#define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \ + { \ + COUNTER = 0; \ + for (auto it: py_##NAME) \ + { \ + if (COUNTER == NAME.size()) \ + throw error(FUNC_NAME, \ + CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \ + NAME[COUNTER++] = it.cast(); \ + } \ + } + #define COPY_PY_COORD_TRIPLE(NAME) \ size_t NAME[3] = {0, 0, 0}; \ { \ -- GitLab From c6aad43ed6247da5e376cd0da38cf8fcb9e4cfcf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 15:53:44 -0600 Subject: [PATCH 04/47] Privatize cl.array.{splay,Array.get_sizes}, cache _get_sizes results --- pyopencl/array.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 297b4eff..75e744e3 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -103,19 +103,18 @@ class VecLookupWarner: vec = VecLookupWarner() -# {{{ helper functionality +# {{{ helper functionality -def splay(queue, n, kernel_specific_max_wg_size=None): - dev = queue.device - max_work_items = _builtin_min(128, dev.max_work_group_size) +def _splay(device, n, kernel_specific_max_wg_size=None): + max_work_items = _builtin_min(128, device.max_work_group_size) if kernel_specific_max_wg_size is not None: from builtins import min max_work_items = min(max_work_items, kernel_specific_max_wg_size) min_work_items = _builtin_min(32, max_work_items) - max_groups = dev.max_compute_units * 4 * 8 + max_groups = device.max_compute_units * 4 * 8 # 4 to overfill the device # 8 is an Nvidia constant--that's how many # groups fit onto one compute device @@ -163,7 +162,7 @@ def elwise_kernel_runner(kernel_getter): knl = kernel_getter(*args, **kwargs) - gs, ls = repr_ary.get_sizes(queue, + gs, ls = repr_ary._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -241,6 +240,9 @@ class _copy_queue: # noqa pass +_ARRAY_GET_SIZES_CACHE = {} + + class Array: """A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly @@ -604,12 +606,17 @@ class Array: return self._new_with_changes(self.base_data, self.offset, queue=queue) - #@memoize_method FIXME: reenable - def get_sizes(self, queue, kernel_specific_max_wg_size=None): + def _get_sizes(self, queue, kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") - return splay(queue, self.size, - kernel_specific_max_wg_size=kernel_specific_max_wg_size) + cache_key = (queue.device.int_ptr, self.size, kernel_specific_max_wg_size) + try: + return _ARRAY_GET_SIZES_CACHE[cache_key] + except KeyError: + sizes = _splay(queue.device, self.size, + kernel_specific_max_wg_size=kernel_specific_max_wg_size) + _ARRAY_GET_SIZES_CACHE[cache_key] = sizes + return sizes def set(self, ary, queue=None, async_=None, **kwargs): """Transfer the contents the :class:`numpy.ndarray` object *ary* @@ -2289,7 +2296,7 @@ def multi_take(arrays, indices, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = indices.get_sizes(queue, + gs, ls = indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -2367,7 +2374,7 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = src_indices.get_sizes(queue, + gs, ls = src_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -2450,7 +2457,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = dest_indices.get_sizes(queue, + gs, ls = dest_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) -- GitLab From 367800529a7f2c3840c51d7c52237b7b29282336 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 17:57:23 -0600 Subject: [PATCH 05/47] Fix a few remaining invocations of splay --- pyopencl/algorithm.py | 4 ++-- pyopencl/clrandom.py | 4 ++-- pyopencl/elementwise.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index c4eb43ea..2b5508b3 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -1188,8 +1188,8 @@ class ListOfListsBuilder: gsize = (4*queue.device.max_compute_units,) lsize = (1,) else: - from pyopencl.array import splay - gsize, lsize = splay(queue, n_objects) + from pyopencl.array import _splay + gsize, lsize = _splay(queue, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + data_args + (n_objects,)), diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index ea3862bc..83f2c4bc 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -662,8 +662,8 @@ class Random123GeneratorBase: scale, shift] n = ary.size - from pyopencl.array import splay - gsize, lsize = splay(queue, ary.size) + from pyopencl.array import _splay + gsize, lsize = _splay(queue, ary.size) evt = knl(queue, gsize, lsize, *args) ary.add_event(evt) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index df364eda..0fcabd95 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -307,13 +307,13 @@ class ElementwiseKernel: invocation_args.append(step) - from pyopencl.array import splay - gs, ls = splay(queue, + from pyopencl.array import _splay + gs, ls = _splay(queue, abs(range_.stop - start)//step, max_wg_size) else: invocation_args.append(repr_vec.size) - gs, ls = repr_vec.get_sizes(queue, max_wg_size) + gs, ls = repr_vec._get_sizes(queue, max_wg_size) if capture_as is not None: kernel.set_args(*invocation_args) -- GitLab From 39d4fadbc4527972c2ac268a72cacfa143eff8f1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 17:57:55 -0600 Subject: [PATCH 06/47] Avoid a few calls to len() in enqueue_nd_range_kernel --- src/wrap_cl.hpp | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index de35513c..6679ed3f 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -389,14 +389,15 @@ \ if (py_wait_for.ptr() != Py_None) \ { \ - event_wait_list.resize(len(py_wait_for)); \ for (py::handle evt: py_wait_for) \ - event_wait_list[num_events_in_wait_list++] = \ - evt.cast().data(); \ + { \ + event_wait_list.push_back(evt.cast().data()); \ + ++num_events_in_wait_list; \ + } \ } #define PYOPENCL_WAITLIST_ARGS \ - num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front() + num_events_in_wait_list, (num_events_in_wait_list == 0) ? nullptr : &event_wait_list.front() #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \ try \ @@ -4657,36 +4658,35 @@ namespace pyopencl { PYOPENCL_PARSE_WAIT_FOR; - cl_uint work_dim = len(py_global_work_size); - std::array global_work_size; - unsigned gws_index = 0; - COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_index); + unsigned gws_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size); + cl_uint work_dim = gws_size; std::array local_work_size; - unsigned lws_index = 0; + unsigned lws_size = 0; size_t *local_work_size_ptr = nullptr; if (py_local_work_size.ptr() != Py_None) { + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size); + if (g_times_l) - work_dim = std::max(work_dim, unsigned(len(py_local_work_size))); + work_dim = std::max(work_dim, lws_size); else - if (work_dim != unsigned(len(py_local_work_size))) + if (work_dim != lws_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); - COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_index); - - while (lws_index < work_dim) - local_work_size[lws_index++] = 1; - while (gws_index < work_dim) - global_work_size[gws_index++] = 1; + while (lws_size < work_dim) + local_work_size[lws_size++] = 1; + while (gws_size < work_dim) + global_work_size[gws_size++] = 1; local_work_size_ptr = &local_work_size.front(); } - if (g_times_l && lws_index) + if (g_times_l && lws_size) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; @@ -4694,15 +4694,15 @@ namespace pyopencl size_t *global_work_offset_ptr = nullptr; std::array global_work_offset; - unsigned gwo_index = 0; if (py_global_work_offset.ptr() != Py_None) { - if (work_dim != unsigned(len(py_global_work_offset))) + unsigned gwo_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size); + + if (work_dim != gwo_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); - COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_index); - if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) -- GitLab From cace1cc1d5e8860b71930b8ba15ce27a3ec656a8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 15 Jan 2021 18:42:49 -0600 Subject: [PATCH 07/47] Actually pass a device to _splay on invocations outside pyopencl.array --- pyopencl/algorithm.py | 2 +- pyopencl/clrandom.py | 2 +- pyopencl/elementwise.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index 2b5508b3..446eb9c3 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -1189,7 +1189,7 @@ class ListOfListsBuilder: lsize = (1,) else: from pyopencl.array import _splay - gsize, lsize = _splay(queue, n_objects) + gsize, lsize = _splay(queue.device, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + data_args + (n_objects,)), diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index 83f2c4bc..dd6c1276 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -663,7 +663,7 @@ class Random123GeneratorBase: n = ary.size from pyopencl.array import _splay - gsize, lsize = _splay(queue, ary.size) + gsize, lsize = _splay(queue.device, ary.size) evt = knl(queue, gsize, lsize, *args) ary.add_event(evt) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 0fcabd95..039cc7cf 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -308,7 +308,7 @@ class ElementwiseKernel: invocation_args.append(step) from pyopencl.array import _splay - gs, ls = _splay(queue, + gs, ls = _splay(queue.device, abs(range_.stop - start)//step, max_wg_size) else: -- GitLab From ee989c8a23ff85b9c45177598cd7684240993cb8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 18:16:03 -0600 Subject: [PATCH 08/47] Revert "Create special-case set_arg wrappers for common integer/float types" This reverts commit be9fd38f6fd85651422817e27dd1636788c374ad. --- src/wrap_cl.hpp | 7 ------- src/wrap_cl_part_2.cpp | 7 ------- 2 files changed, 14 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 6679ed3f..20db54ba 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4344,13 +4344,6 @@ namespace pyopencl sizeof(cl_mem), &m)); } - template - void set_arg_pod(cl_uint arg_index, T pod_value) - { - PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, - sizeof(T), &pod_value)); - } - void set_arg_mem(cl_uint arg_index, memory_object_holder &moh) { cl_mem m = moh.data(); diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 2c986718..e68c7851 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -467,13 +467,6 @@ void pyopencl_expose_part_2(py::module &m) #endif .def("_set_arg_null", &cls::set_arg_null) .def("_set_arg_buf", &cls::set_arg_buf) - .def("_set_arg_mem", &cls::set_arg_mem) - .def("_set_arg_i32", &cls::set_arg_pod) - .def("_set_arg_i64", &cls::set_arg_pod) - .def("_set_arg_u32", &cls::set_arg_pod) - .def("_set_arg_u64", &cls::set_arg_pod) - .def("_set_arg_f32", &cls::set_arg_pod) - .def("_set_arg_f64", &cls::set_arg_pod) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif -- GitLab From 8c63f87f82767cc155499dc48fc34530237d3f91 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:02:50 -0600 Subject: [PATCH 09/47] Remove a pypy compatibility hack for checking for object-array-ness --- pyopencl/array.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 75e744e3..75f531d6 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -69,19 +69,6 @@ def _get_truedivide_dtype(obj1, obj2, queue): return result -# Work around PyPy not currently supporting the object dtype. -# (Yes, it doesn't even support checking!) -# (as of May 27, 2014 on PyPy 2.3) -try: - np.dtype(object) - - def _dtype_is_object(t): - return t == object -except Exception: - def _dtype_is_object(t): - return False - - class InconsistentOpenCLQueueWarning(UserWarning): pass @@ -499,9 +486,8 @@ class Array: # }}} - if _dtype_is_object(dtype): - raise TypeError("object arrays on the compute device are not allowed") - + assert dtype != np.object, \ + "object arrays on the compute device are not allowed" assert isinstance(shape, tuple) assert isinstance(strides, tuple) @@ -2080,7 +2066,7 @@ def to_device(queue, ary, allocator=None, async_=None, # }}} - if _dtype_is_object(ary.dtype): + if ary.dtype == np.object: raise RuntimeError("to_device does not work on object arrays.") if array_queue is _same_as_transfer: -- GitLab From 05050cd2ee1bfdb2df8ced434d73d61b356bb152 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:03:50 -0600 Subject: [PATCH 10/47] Inline _make_strides in pyopencl.array --- pyopencl/array.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 75f531d6..f245a342 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -198,15 +198,6 @@ class DefaultAllocator(cl.tools.DeferredAllocator): DeprecationWarning, 2) cl.tools.DeferredAllocator.__init__(self, *args, **kwargs) - -def _make_strides(itemsize, shape, order): - if order in "fF": - return _f_contiguous_strides(itemsize, shape) - elif order in "cC": - return _c_contiguous_strides(itemsize, shape) - else: - raise ValueError("invalid order: %s" % order) - # }}} @@ -476,7 +467,19 @@ class Array: size = size.item() if strides is None: - strides = _make_strides(dtype.itemsize, shape, order) + if order in "cC": + # inlined from compyte.array.c_contiguous_strides + if shape: + strides = [dtype.itemsize] + for s in shape[:0:-1]: + strides.append(strides[-1]*s) + strides = tuple(strides[::-1]) + else: + strides = () + elif order in "fF": + strides = _f_contiguous_strides(dtype.itemsize, shape) + else: + raise ValueError("invalid order: %s" % order) else: # FIXME: We should possibly perform some plausibility -- GitLab From a8c0184dbcaaabdb5374e6408591e392757daa76 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:04:46 -0600 Subject: [PATCH 11/47] Cache result of _get_common_dtype --- pyopencl/array.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index f245a342..63f85fa3 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -45,9 +45,33 @@ from pyopencl.characterize import has_double_support from pyopencl import cltypes +_COMMON_DTYPE_CACHE = {} + + def _get_common_dtype(obj1, obj2, queue): - return _get_common_dtype_base(obj1, obj2, - has_double_support(queue.device)) + dsupport = has_double_support(queue.device) + cache_key = None + o1_dtype = obj1.dtype + try: + cache_key = (o1_dtype, obj2.dtype, dsupport) + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + except AttributeError: + # obj2 doesn't have a dtype + try: + cache_key = (o1_dtype, type(obj2), dsupport) + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + + result = _get_common_dtype_base(obj1, obj2, dsupport) + + # we succeeded in constructing the cache key + if cache_key is not None: + _COMMON_DTYPE_CACHE[cache_key] = result + + return result def _get_truedivide_dtype(obj1, obj2, queue): -- GitLab From eb47983a0faf579253410e8df9aa1a4d690d6057 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:06:05 -0600 Subject: [PATCH 12/47] Avoid generator expression in checking for negative shape entries --- pyopencl/array.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 63f85fa3..4d9bc124 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -475,6 +475,9 @@ class Array: size = 1 for dim in shape: size *= dim + if dim < 0: + raise ValueError("negative dimensions are not allowed") + except TypeError: admissible_types = (int, np.integer) @@ -482,11 +485,10 @@ class Array: raise TypeError("shape must either be iterable or " "castable to an integer") size = shape + if shape < 0: + raise ValueError("negative dimensions are not allowed") shape = (shape,) - if any(dim < 0 for dim in shape): - raise ValueError("negative dimensions are not allowed") - if isinstance(size, np.integer): size = size.item() -- GitLab From 50705a965eb475d541150bcb3cba3e1fc6cfcc02 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:07:15 -0600 Subject: [PATCH 13/47] Do not recompute flags in arrays made by _new_like_me --- pyopencl/array.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 4d9bc124..10547f84 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -33,7 +33,6 @@ from functools import reduce import numpy as np import pyopencl.elementwise as elementwise import pyopencl as cl -from pytools import memoize_method from pyopencl.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, @@ -431,7 +430,7 @@ class Array: __array_priority__ = 100 def __init__(self, cq, shape, dtype, order="C", allocator=None, - data=None, offset=0, strides=None, events=None): + data=None, offset=0, strides=None, events=None, _flags=None): # {{{ backward compatibility if isinstance(cq, cl.CommandQueue): @@ -556,6 +555,7 @@ class Array: self.offset = offset self.context = context + self._flags = _flags @property def ndim(self): @@ -569,9 +569,11 @@ class Array: return self.base_data @property - @memoize_method def flags(self): - return _ArrayFlags(self) + f = self._flags + if f is None: + self._flags = f = _ArrayFlags(self) + return f def _new_with_changes(self, data, offset, shape=None, dtype=None, strides=None, queue=_copy_queue, allocator=None): @@ -977,19 +979,21 @@ class Array: def _new_like_me(self, dtype=None, queue=None): strides = None + flags = None if dtype is None: dtype = self.dtype if dtype == self.dtype: strides = self.strides + flags = self.flags queue = queue or self.queue if queue is not None: return self.__class__(queue, self.shape, dtype, - allocator=self.allocator, strides=strides) + allocator=self.allocator, strides=strides, _flags=flags) else: return self.__class__(self.context, self.shape, dtype, - strides=strides, allocator=self.allocator) + strides=strides, allocator=self.allocator, _flags=flags) @staticmethod @elwise_kernel_runner -- GitLab From b7c052f9f5829152cb539adf52320b8ca93cf846 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 21:11:09 -0600 Subject: [PATCH 14/47] Update compyte for equal_strides fast path --- pyopencl/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/compyte b/pyopencl/compyte index fbfe788a..7533db88 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit fbfe788a2dcb190fd241fd42ad047e33bafd85b8 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From f1db98f9c1e0b94d5ec6685c5452d1a838835efa Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 22:09:43 -0600 Subject: [PATCH 15/47] Invoker gen: check for _KERNEL_ARG_CLASSES before None --- pyopencl/invoker.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index c996768d..f4e97615 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -81,10 +81,10 @@ def generate_generic_arg_handler(gen, arg_idx, arg_var): from pytools.py_codegen import Indentation gen(""" - if {arg_var} is None: - self._set_arg_null({arg_idx}) - elif isinstance({arg_var}, _KERNEL_ARG_CLASSES): + if isinstance({arg_var}, _KERNEL_ARG_CLASSES): self.set_arg({arg_idx}, {arg_var}) + elif {arg_var} is None: + self._set_arg_null({arg_idx}) """ .format(arg_idx=arg_idx, arg_var=arg_var)) @@ -341,7 +341,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v7", + "pyopencl-invoker-cache-v11", key_builder=_NumpyTypesKeyBuilder()) -- GitLab From 66d0d25cc43773d08cbb42f30f33e267172f4627 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 22:11:57 -0600 Subject: [PATCH 16/47] Make a kernel-specific class to override __call__ and avoid an indirect call on kernel enqueue --- pyopencl/__init__.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 6e0268dc..8a21d804 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -847,12 +847,22 @@ def _add_functionality(): # }}} from pyopencl.invoker import generate_enqueue_and_set_args - self._enqueue, self._set_args = generate_enqueue_and_set_args( - self.function_name, - len(scalar_arg_dtypes), self.num_args, - self._scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + enqueue, set_args = \ + generate_enqueue_and_set_args( + self.function_name, + len(scalar_arg_dtypes), self.num_args, + self._scalar_arg_dtypes, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug) + + # Make ourselves a kernel-specific class, so that we're able to override + # __call__. Inspired by https://stackoverflow.com/a/38541437 + class KernelWithOverriddenCall(type(self)): + pass + + self.__class__ = KernelWithOverriddenCall + KernelWithOverriddenCall.__call__ = enqueue + KernelWithOverriddenCall._set_args = set_args def kernel_get_work_group_info(self, param, device): try: @@ -871,6 +881,9 @@ def _add_functionality(): def kernel_call(self, queue, global_size, local_size, *args, **kwargs): # __call__ can't be overridden directly, so we need this # trampoline hack. + + # Note: This is only used for the generic __call__, before + # kernel_set_scalar_arg_dtypes is called. return self._enqueue(self, queue, global_size, local_size, *args, **kwargs) def kernel_capture_call(self, filename, queue, global_size, local_size, -- GitLab From 0e0683f620da3b2ecccbd6381b22a6590df381d8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 22:24:29 -0600 Subject: [PATCH 17/47] Do not cache result in _get_common_dtype if obj2 is of type int --- pyopencl/array.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 10547f84..56c055ae 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -59,8 +59,13 @@ def _get_common_dtype(obj1, obj2, queue): except AttributeError: # obj2 doesn't have a dtype try: - cache_key = (o1_dtype, type(obj2), dsupport) - return _COMMON_DTYPE_CACHE[cache_key] + tobj2 = type(obj2) + cache_key = (o1_dtype, tobj2, dsupport) + + # Integers are weird, sized, and signed. Don't pretend that 'int' + # is enough information to decide what should happen. + if tobj2 != int: + return _COMMON_DTYPE_CACHE[cache_key] except KeyError: pass -- GitLab From 6f759230b8c7a4a3e0d8dc98a87d7f55eeeac39c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 22:47:54 -0600 Subject: [PATCH 18/47] Fix overriding of custom set_args in KernelWithOverriddenCall --- pyopencl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 8a21d804..3aeca27f 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -862,7 +862,7 @@ def _add_functionality(): self.__class__ = KernelWithOverriddenCall KernelWithOverriddenCall.__call__ = enqueue - KernelWithOverriddenCall._set_args = set_args + KernelWithOverriddenCall.set_args = set_args def kernel_get_work_group_info(self, param, device): try: -- GitLab From 75e460b7405aa59560c0e9f2bb2497894cd7a584 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 22:56:45 -0600 Subject: [PATCH 19/47] Streamline invoker code (and its generation) --- pyopencl/invoker.py | 59 +++++++++++---------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index f4e97615..6125628b 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -28,6 +28,7 @@ import numpy as np from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict +from pytools.py_codegen import Indentation from pyopencl.tools import _NumpyTypesKeyBuilder _PYPY = "__pypy__" in sys.builtin_module_names @@ -53,8 +54,6 @@ del _size_t_char # {{{ individual arg handling def generate_buffer_arg_setter(gen, arg_idx, buf_var): - from pytools.py_codegen import Indentation - if _PYPY: # https://github.com/numpy/numpy/issues/5381 gen(f"if isinstance({buf_var}, np.generic):") @@ -69,29 +68,6 @@ def generate_buffer_arg_setter(gen, arg_idx, buf_var): """ .format(arg_idx=arg_idx, buf_var=buf_var)) - -def generate_bytes_arg_setter(gen, arg_idx, buf_var): - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_generic_arg_handler(gen, arg_idx, arg_var): - from pytools.py_codegen import Indentation - - gen(""" - if isinstance({arg_var}, _KERNEL_ARG_CLASSES): - self.set_arg({arg_idx}, {arg_var}) - elif {arg_var} is None: - self._set_arg_null({arg_idx}) - """ - .format(arg_idx=arg_idx, arg_var=arg_var)) - - gen("else:") - with Indentation(gen): - generate_buffer_arg_setter(gen, arg_idx, arg_var) - # }}} @@ -108,7 +84,7 @@ def generate_generic_arg_handling_body(num_args): gen(f"# process argument {i}") gen("") gen(f"current_arg = {i}") - generate_generic_arg_handler(gen, i, "arg%d" % i) + gen(f"self.set_arg({i}, arg{i})") gen("") return gen @@ -141,7 +117,7 @@ def generate_specific_arg_handling_body(function_name, arg_var = "arg%d" % arg_idx if arg_dtype is None: - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + gen(f"self.set_arg({cl_arg_idx}, {arg_var})") cl_arg_idx += 1 gen("") continue @@ -149,7 +125,7 @@ def generate_specific_arg_handling_body(function_name, arg_dtype = np.dtype(arg_dtype) if arg_dtype.char == "V": - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + gen(f"self.set_arg({cl_arg_idx}, {arg_var})") cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -173,13 +149,13 @@ def generate_specific_arg_handling_body(function_name, gen( "buf = pack('{arg_char}', {arg_var}.real)" .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + gen(f"self._set_arg_buf({cl_arg_idx}, buf)") cl_arg_idx += 1 gen("current_arg = current_arg + 1000") gen( "buf = pack('{arg_char}', {arg_var}.imag)" .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + gen(f"self._set_arg_buf({cl_arg_idx}, buf)") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -195,7 +171,7 @@ def generate_specific_arg_handling_body(function_name, "buf = pack('{arg_char}{arg_char}', " "{arg_var}.real, {arg_var}.imag)" .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + gen(f"self._set_arg_buf({cl_arg_idx}, buf)") cl_arg_idx += 1 fp_arg_count += 2 @@ -211,7 +187,7 @@ def generate_specific_arg_handling_body(function_name, .format( arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + gen(f"self._set_arg_buf({cl_arg_idx}, buf)") cl_arg_idx += 1 gen("") @@ -268,13 +244,6 @@ def wrap_in_error_handler(body, arg_names): # }}} -def add_local_imports(gen): - gen("import numpy as np") - gen("import pyopencl._cl as _cl") - gen("from pyopencl import _KERNEL_ARG_CLASSES") - gen("") - - def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, scalar_arg_dtypes, @@ -292,12 +261,14 @@ def _generate_enqueue_and_set_args_module(function_name, warn_about_arg_count_bug=warn_about_arg_count_bug, work_around_arg_count_bug=work_around_arg_count_bug) - err_handler = wrap_in_error_handler(body, arg_names) + body = wrap_in_error_handler(body, arg_names) gen = PythonCodeGenerator() gen("from struct import pack") gen("from pyopencl import status_code") + gen("import numpy as np") + gen("import pyopencl._cl as _cl") gen("") # {{{ generate _enqueue @@ -314,8 +285,7 @@ def _generate_enqueue_and_set_args_module(function_name, "wait_for=None"]))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(body) gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, @@ -332,8 +302,7 @@ def _generate_enqueue_and_set_args_module(function_name, % (", ".join(["self"] + arg_names))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(body) # }}} @@ -341,7 +310,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v11", + "pyopencl-invoker-cache-v17", key_builder=_NumpyTypesKeyBuilder()) -- GitLab From 03e6e62f03478539ca3f32e0de1e430e2577031d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 23:04:30 -0600 Subject: [PATCH 20/47] Rename KernelWithOverriddenCall->KernelWithCustomEnqueue --- pyopencl/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 3aeca27f..7cc3f3af 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -857,12 +857,12 @@ def _add_functionality(): # Make ourselves a kernel-specific class, so that we're able to override # __call__. Inspired by https://stackoverflow.com/a/38541437 - class KernelWithOverriddenCall(type(self)): + class KernelWithCustomEnqueue(type(self)): pass - self.__class__ = KernelWithOverriddenCall - KernelWithOverriddenCall.__call__ = enqueue - KernelWithOverriddenCall.set_args = set_args + self.__class__ = KernelWithCustomEnqueue + KernelWithCustomEnqueue.__call__ = enqueue + KernelWithCustomEnqueue.set_args = set_args def kernel_get_work_group_info(self, param, device): try: -- GitLab From 7e9df9040c72c0b92bbeb5c398b403da9b24c09a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 23:08:50 -0600 Subject: [PATCH 21/47] Turn off generation of debug helper code in invoker with sys.flag.optimize --- pyopencl/invoker.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 6125628b..f043ae4b 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -96,7 +96,8 @@ def generate_generic_arg_handling_body(num_args): def generate_specific_arg_handling_body(function_name, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + work_around_arg_count_bug, warn_about_arg_count_bug, + include_debug_helpers): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -113,7 +114,8 @@ def generate_specific_arg_handling_body(function_name, for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): gen(f"# process argument {arg_idx}") gen("") - gen(f"current_arg = {arg_idx}") + if include_debug_helpers: + gen(f"current_arg = {arg_idx}") arg_var = "arg%d" % arg_idx if arg_dtype is None: @@ -151,7 +153,8 @@ def generate_specific_arg_handling_body(function_name, .format(arg_char=arg_char, arg_var=arg_var)) gen(f"self._set_arg_buf({cl_arg_idx}, buf)") cl_arg_idx += 1 - gen("current_arg = current_arg + 1000") + if include_debug_helpers: + gen("current_arg = current_arg + 1000") gen( "buf = pack('{arg_char}', {arg_var}.imag)" .format(arg_char=arg_char, arg_var=arg_var)) @@ -247,7 +250,8 @@ def wrap_in_error_handler(body, arg_names): def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + work_around_arg_count_bug, warn_about_arg_count_bug, + include_debug_helpers): from pytools.py_codegen import PythonCodeGenerator, Indentation @@ -259,9 +263,11 @@ def _generate_enqueue_and_set_args_module(function_name, body = generate_specific_arg_handling_body( function_name, num_cl_args, scalar_arg_dtypes, warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + work_around_arg_count_bug=work_around_arg_count_bug, + include_debug_helpers=include_debug_helpers) - body = wrap_in_error_handler(body, arg_names) + if include_debug_helpers: + body = wrap_in_error_handler(body, arg_names) gen = PythonCodeGenerator() @@ -310,7 +316,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v17", + "pyopencl-invoker-cache-v20", key_builder=_NumpyTypesKeyBuilder()) @@ -321,7 +327,8 @@ def generate_enqueue_and_set_args(function_name, cache_key = (function_name, num_passed_args, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug) + work_around_arg_count_bug, warn_about_arg_count_bug, + not sys.flags.optimize) from_cache = False -- GitLab From 95b290f32802f50a8b190cb445baade2853187c9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 23:48:06 -0600 Subject: [PATCH 22/47] kernel_get_work_group_info: Use Device.int_ptr in cache key --- pyopencl/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 7cc3f3af..c3c9cf3f 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -865,13 +865,14 @@ def _add_functionality(): KernelWithCustomEnqueue.set_args = set_args def kernel_get_work_group_info(self, param, device): + cache_key = (param, device.int_ptr) try: - return self._wg_info_cache[param, device] + return self._wg_info_cache[cache_key] except KeyError: pass result = kernel_old_get_work_group_info(self, param, device) - self._wg_info_cache[param, device] = result + self._wg_info_cache[cache_key] = result return result def kernel_set_args(self, *args, **kwargs): -- GitLab From 17c4795827e4f820c5286de591f701f5cd329f23 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 17 Jan 2021 23:49:31 -0600 Subject: [PATCH 23/47] Rewrite first_arg_dependent_memoize without decorator package, drop dep --- pyopencl/tools.py | 37 ++++++++++++++++++++----------------- setup.py | 1 - 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 5b814685..ff5c7965 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -32,7 +32,6 @@ from sys import intern # Do not add a pyopencl import here: This will add an import cycle. import numpy as np -from decorator import decorator from pytools import memoize, memoize_method from pyopencl._cl import bitlog2 # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase @@ -73,8 +72,7 @@ from pyopencl._cl import ( # noqa _first_arg_dependent_caches = [] -@decorator -def first_arg_dependent_memoize(func, cl_object, *args): +def first_arg_dependent_memoize(func): """Provides memoization for a function. Typically used to cache things that get created inside a :class:`pyopencl.Context`, e.g. programs and kernels. Assumes that the first argument of the decorated function is @@ -84,21 +82,26 @@ def first_arg_dependent_memoize(func, cl_object, *args): .. versionadded:: 2011.2 """ - try: - ctx_dict = func._pyopencl_first_arg_dep_memoize_dic - except AttributeError: - # FIXME: This may keep contexts alive longer than desired. - # But I guess since the memory in them is freed, who cares. - ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} - _first_arg_dependent_caches.append(ctx_dict) + def first_arg_memoized_wrapper(cl_object, *args): + try: + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic + except AttributeError: + # FIXME: This may keep contexts alive longer than desired. + # But I guess since the memory in them is freed, who cares. + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} + _first_arg_dependent_caches.append(ctx_dict) - try: - return ctx_dict[cl_object][args] - except KeyError: - arg_dict = ctx_dict.setdefault(cl_object, {}) - result = func(cl_object, *args) - arg_dict[args] = result - return result + try: + return ctx_dict[cl_object][args] + except KeyError: + arg_dict = ctx_dict.setdefault(cl_object, {}) + result = func(cl_object, *args) + arg_dict[args] = result + return result + + from functools import update_wrapper + update_wrapper(first_arg_memoized_wrapper, func) + return first_arg_memoized_wrapper context_dependent_memoize = first_arg_dependent_memoize diff --git a/setup.py b/setup.py index bf63e71c..ba268a6a 100644 --- a/setup.py +++ b/setup.py @@ -247,7 +247,6 @@ def main(): install_requires=[ "numpy", "pytools>=2017.6", - "decorator>=3.2.0", "appdirs>=1.4.0", # "Mako>=0.3.6", ], -- GitLab From 5eb3f3b95f82c8907059ad808c70cf137db89dcd Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 00:01:47 -0600 Subject: [PATCH 24/47] Revert "Rewrite first_arg_dependent_memoize without decorator package, drop dep" This reverts commit 17c4795827e4f820c5286de591f701f5cd329f23. Turns out decorator performs some magic to support kwargs better than we do. --- pyopencl/tools.py | 37 +++++++++++++++++-------------------- setup.py | 1 + 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index ff5c7965..5b814685 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -32,6 +32,7 @@ from sys import intern # Do not add a pyopencl import here: This will add an import cycle. import numpy as np +from decorator import decorator from pytools import memoize, memoize_method from pyopencl._cl import bitlog2 # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase @@ -72,7 +73,8 @@ from pyopencl._cl import ( # noqa _first_arg_dependent_caches = [] -def first_arg_dependent_memoize(func): +@decorator +def first_arg_dependent_memoize(func, cl_object, *args): """Provides memoization for a function. Typically used to cache things that get created inside a :class:`pyopencl.Context`, e.g. programs and kernels. Assumes that the first argument of the decorated function is @@ -82,26 +84,21 @@ def first_arg_dependent_memoize(func): .. versionadded:: 2011.2 """ - def first_arg_memoized_wrapper(cl_object, *args): - try: - ctx_dict = func._pyopencl_first_arg_dep_memoize_dic - except AttributeError: - # FIXME: This may keep contexts alive longer than desired. - # But I guess since the memory in them is freed, who cares. - ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} - _first_arg_dependent_caches.append(ctx_dict) - - try: - return ctx_dict[cl_object][args] - except KeyError: - arg_dict = ctx_dict.setdefault(cl_object, {}) - result = func(cl_object, *args) - arg_dict[args] = result - return result + try: + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic + except AttributeError: + # FIXME: This may keep contexts alive longer than desired. + # But I guess since the memory in them is freed, who cares. + ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {} + _first_arg_dependent_caches.append(ctx_dict) - from functools import update_wrapper - update_wrapper(first_arg_memoized_wrapper, func) - return first_arg_memoized_wrapper + try: + return ctx_dict[cl_object][args] + except KeyError: + arg_dict = ctx_dict.setdefault(cl_object, {}) + result = func(cl_object, *args) + arg_dict[args] = result + return result context_dependent_memoize = first_arg_dependent_memoize diff --git a/setup.py b/setup.py index ba268a6a..bf63e71c 100644 --- a/setup.py +++ b/setup.py @@ -247,6 +247,7 @@ def main(): install_requires=[ "numpy", "pytools>=2017.6", + "decorator>=3.2.0", "appdirs>=1.4.0", # "Mako>=0.3.6", ], -- GitLab From 429ba485ee877f34fe5fe895be1f45ad2a7c1ab1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 13:05:29 -0600 Subject: [PATCH 25/47] Use positional args in _cl.enqueue_nd_range_kernel in invoker to avoid pybind slowness with kwargs --- pyopencl/invoker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index f043ae4b..691a15c0 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -293,10 +293,11 @@ def _generate_enqueue_and_set_args_module(function_name, with Indentation(gen): gen.extend(body) + # Using positional args here because pybind is slow with keyword args gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l, - allow_empty_ndrange=allow_empty_ndrange) + global_offset, wait_for, g_times_l, + allow_empty_ndrange) """) # }}} @@ -316,7 +317,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v20", + "pyopencl-invoker-cache-v21", key_builder=_NumpyTypesKeyBuilder()) -- GitLab From b8542b5d6f0bbc58899e7563c0482c6a5b15d851 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 16:17:15 -0600 Subject: [PATCH 26/47] Remove unused generate_buffer_arg_setter --- pyopencl/invoker.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 691a15c0..c7fe8560 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -51,26 +51,6 @@ del _size_t_char # }}} -# {{{ individual arg handling - -def generate_buffer_arg_setter(gen, arg_idx, buf_var): - if _PYPY: - # https://github.com/numpy/numpy/issues/5381 - gen(f"if isinstance({buf_var}, np.generic):") - with Indentation(gen): - if _PYPY: - gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var)) - else: - gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var)) - - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - -# }}} - - # {{{ generic arg handling body def generate_generic_arg_handling_body(num_args): -- GitLab From e2ba90140834cea562c55d4d268a213c0756e9c7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 16:18:08 -0600 Subject: [PATCH 27/47] Invoker generator: globalize some py_codegen imports --- pyopencl/invoker.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index c7fe8560..3feb7685 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -28,7 +28,7 @@ import numpy as np from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict -from pytools.py_codegen import Indentation +from pytools.py_codegen import Indentation, PythonCodeGenerator from pyopencl.tools import _NumpyTypesKeyBuilder _PYPY = "__pypy__" in sys.builtin_module_names @@ -54,7 +54,6 @@ del _size_t_char # {{{ generic arg handling body def generate_generic_arg_handling_body(num_args): - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() if num_args == 0: @@ -85,7 +84,6 @@ def generate_specific_arg_handling_body(function_name, fp_arg_count = 0 cl_arg_idx = 0 - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() if not scalar_arg_dtypes: @@ -189,8 +187,6 @@ def generate_specific_arg_handling_body(function_name, # {{{ error handler def wrap_in_error_handler(body, arg_names): - from pytools.py_codegen import PythonCodeGenerator, Indentation - err_gen = PythonCodeGenerator() def gen_error_handler(): @@ -233,8 +229,6 @@ def _generate_enqueue_and_set_args_module(function_name, work_around_arg_count_bug, warn_about_arg_count_bug, include_debug_helpers): - from pytools.py_codegen import PythonCodeGenerator, Indentation - arg_names = ["arg%d" % i for i in range(num_passed_args)] if scalar_arg_dtypes is None: -- GitLab From 852e41397ff6e65ab4e1dfbb913b34df0cb0f197 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 16:19:17 -0600 Subject: [PATCH 28/47] Wrapper: switch some C++ args from py::object to py::handle --- src/wrap_cl.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 20db54ba..7a6f056f 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4371,7 +4371,7 @@ namespace pyopencl (m_kernel, arg_index, sizeof(cl_command_queue), &q)); } - void set_arg_buf(cl_uint arg_index, py::object py_buffer) + void set_arg_buf(cl_uint arg_index, py::handle py_buffer) { const void *buf; PYOPENCL_BUFFER_SIZE_T len; @@ -4404,7 +4404,7 @@ namespace pyopencl } #endif - void set_arg(cl_uint arg_index, py::object arg) + void set_arg(cl_uint arg_index, py::handle arg) { if (arg.ptr() == Py_None) { @@ -4642,10 +4642,10 @@ namespace pyopencl event *enqueue_nd_range_kernel( command_queue &cq, kernel &knl, - py::object py_global_work_size, - py::object py_local_work_size, - py::object py_global_work_offset, - py::object py_wait_for, + py::handle py_global_work_size, + py::handle py_local_work_size, + py::handle py_global_work_offset, + py::handle py_wait_for, bool g_times_l, bool allow_empty_ndrange) { -- GitLab From 676e30f54ea4d0e79d8b231751876eb36b098a2e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 17:44:40 -0600 Subject: [PATCH 29/47] Make creation of KernelWithCustomEnqueue easier to read (thanks @kaushikcfd) --- pyopencl/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index c3c9cf3f..775687ce 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -847,7 +847,7 @@ def _add_functionality(): # }}} from pyopencl.invoker import generate_enqueue_and_set_args - enqueue, set_args = \ + enqueue, my_set_args = \ generate_enqueue_and_set_args( self.function_name, len(scalar_arg_dtypes), self.num_args, @@ -858,11 +858,10 @@ def _add_functionality(): # Make ourselves a kernel-specific class, so that we're able to override # __call__. Inspired by https://stackoverflow.com/a/38541437 class KernelWithCustomEnqueue(type(self)): - pass + __call__ = enqueue + set_args = my_set_args self.__class__ = KernelWithCustomEnqueue - KernelWithCustomEnqueue.__call__ = enqueue - KernelWithCustomEnqueue.set_args = set_args def kernel_get_work_group_info(self, param, device): cache_key = (param, device.int_ptr) -- GitLab From 6f6a54a5b028723f580d7b5c3726a4c363dc53c7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 18 Jan 2021 17:52:01 -0600 Subject: [PATCH 30/47] Introduce _set_arg{,_buf}_multi for batched argument setting --- pyopencl/invoker.py | 125 ++++++++++++++--------------------------- src/wrap_cl.hpp | 54 ++++++++++++++++++ src/wrap_cl_part_2.cpp | 14 +++++ 3 files changed, 109 insertions(+), 84 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 3feb7685..e452aa95 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -58,13 +58,14 @@ def generate_generic_arg_handling_body(num_args): if num_args == 0: gen("pass") + else: + gen_arg_indices = list(range(num_args)) + gen_args = [f"arg{i}" for i in gen_arg_indices] - for i in range(num_args): - gen(f"# process argument {i}") - gen("") - gen(f"current_arg = {i}") - gen(f"self.set_arg({i}, arg{i})") - gen("") + gen(f"self._set_arg_multi(" + f"({', '.join(str(i) for i in gen_arg_indices)},), " + f"({', '.join(gen_args)},)" + ")") return gen @@ -75,8 +76,7 @@ def generate_generic_arg_handling_body(num_args): def generate_specific_arg_handling_body(function_name, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug, - include_debug_helpers): + work_around_arg_count_bug, warn_about_arg_count_bug): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -89,15 +89,17 @@ def generate_specific_arg_handling_body(function_name, if not scalar_arg_dtypes: gen("pass") + gen_arg_indices = [] + gen_args = [] + buf_arg_indices = [] + buf_args = [] + for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): - gen(f"# process argument {arg_idx}") - gen("") - if include_debug_helpers: - gen(f"current_arg = {arg_idx}") arg_var = "arg%d" % arg_idx if arg_dtype is None: - gen(f"self.set_arg({cl_arg_idx}, {arg_var})") + gen_arg_indices.append(cl_arg_idx) + gen_args.append(arg_var) cl_arg_idx += 1 gen("") continue @@ -105,7 +107,8 @@ def generate_specific_arg_handling_body(function_name, arg_dtype = np.dtype(arg_dtype) if arg_dtype.char == "V": - gen(f"self.set_arg({cl_arg_idx}, {arg_var})") + gen_arg_indices.append(cl_arg_idx) + gen_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -126,17 +129,11 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - gen( - "buf = pack('{arg_char}', {arg_var}.real)" - .format(arg_char=arg_char, arg_var=arg_var)) - gen(f"self._set_arg_buf({cl_arg_idx}, buf)") + buf_arg_indices.append(cl_arg_idx) + buf_args.append(f"pack('{arg_char}', {arg_var}.real)") cl_arg_idx += 1 - if include_debug_helpers: - gen("current_arg = current_arg + 1000") - gen( - "buf = pack('{arg_char}', {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - gen(f"self._set_arg_buf({cl_arg_idx}, buf)") + buf_arg_indices.append(cl_arg_idx) + buf_args.append(f"pack('{arg_char}', {arg_var}.imag)") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -148,11 +145,9 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - gen( - "buf = pack('{arg_char}{arg_char}', " - "{arg_var}.real, {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - gen(f"self._set_arg_buf({cl_arg_idx}, buf)") + buf_arg_indices.append(cl_arg_idx) + buf_args.append( + f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 fp_arg_count += 2 @@ -163,16 +158,23 @@ def generate_specific_arg_handling_body(function_name, arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - gen( - "buf = pack('{arg_char}', {arg_var})" - .format( - arg_char=arg_char, - arg_var=arg_var)) - gen(f"self._set_arg_buf({cl_arg_idx}, buf)") + buf_arg_indices.append(cl_arg_idx) + buf_args.append(f"pack('{arg_char}', {arg_var})") cl_arg_idx += 1 gen("") + for arg_kind, indices, args in [ + ("", gen_arg_indices, gen_args), + ("_buf", buf_arg_indices, buf_args) + ]: + assert len(indices) == len(args) + if indices: + gen(f"self._set_arg{arg_kind}_multi(" + f"({', '.join(str(i) for i in indices)},), " + f"({', '.join(args)},)" + ")") + if cl_arg_idx != num_cl_args: raise TypeError( "length of argument list (%d) and " @@ -184,50 +186,10 @@ def generate_specific_arg_handling_body(function_name, # }}} -# {{{ error handler - -def wrap_in_error_handler(body, arg_names): - err_gen = PythonCodeGenerator() - - def gen_error_handler(): - err_gen(""" - if current_arg is not None: - args = [{args}] - advice = "" - from pyopencl.array import Array - if isinstance(args[current_arg], Array): - advice = " (perhaps you meant to pass 'array.data' " \ - "instead of the array itself?)" - - raise _cl.LogicError( - "when processing argument #%d (1-based): %s%s" - % (current_arg+1, str(e), advice)) - else: - raise - """ - .format(args=", ".join(arg_names))) - err_gen("") - - err_gen("try:") - with Indentation(err_gen): - err_gen.extend(body) - err_gen("except TypeError as e:") - with Indentation(err_gen): - gen_error_handler() - err_gen("except _cl.LogicError as e:") - with Indentation(err_gen): - gen_error_handler() - - return err_gen - -# }}} - - def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug, - include_debug_helpers): + work_around_arg_count_bug, warn_about_arg_count_bug): arg_names = ["arg%d" % i for i in range(num_passed_args)] @@ -237,11 +199,7 @@ def _generate_enqueue_and_set_args_module(function_name, body = generate_specific_arg_handling_body( function_name, num_cl_args, scalar_arg_dtypes, warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug, - include_debug_helpers=include_debug_helpers) - - if include_debug_helpers: - body = wrap_in_error_handler(body, arg_names) + work_around_arg_count_bug=work_around_arg_count_bug) gen = PythonCodeGenerator() @@ -291,7 +249,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v21", + "pyopencl-invoker-cache-v29", key_builder=_NumpyTypesKeyBuilder()) @@ -302,8 +260,7 @@ def generate_enqueue_and_set_args(function_name, cache_key = (function_name, num_passed_args, num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug, - not sys.flags.optimize) + work_around_arg_count_bug, warn_about_arg_count_bug) from_cache = False diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 7a6f056f..3210eb09 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -86,6 +86,7 @@ #endif +#include #include #include #include @@ -4452,6 +4453,59 @@ namespace pyopencl set_arg_buf(arg_index, arg); } + static + void set_arg_multi( + std::function set_arg_func, + py::tuple indices, + py::tuple args) + { + // This is an internal interface used by generated invokers. + // We can save a tiny bit of time by not checking their work. + /* + if (indices.size() != args.size()) + throw error("Kernel.set_arg_multi", CL_INVALID_VALUE, + "indices and args arguments do not have the same length"); + */ + + cl_uint arg_index; + py::handle arg_value; + + auto indices_it = indices.begin(), args_it = args.begin(), + indices_end = indices.end(); + try + { + while (indices_it != indices_end) + { + arg_index = py::cast(*indices_it++); + arg_value = *args_it++; + set_arg_func(arg_index, arg_value); + } + } + catch (error &err) + { + std::string msg( + std::string("when processing arg#") + std::to_string(arg_index+1) + + std::string(" (1-based): ") + std::string(err.what())); + + auto mod_cl_ary(py::module::import("pyopencl.array")); + auto cls_array(mod_cl_ary.attr("Array")); + if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) + msg.append( + " (perhaps you meant to pass 'array.data' instead of the array itself?)"); + + throw error(err.routine().c_str(), err.code(), msg.c_str()); + } + catch (std::exception &err) + { + std::string msg( + std::string("when processing arg#") + std::to_string(arg_index+1) + + std::string(" (1-based): ") + std::string(err.what())); + + throw std::runtime_error(msg.c_str()); + } + } + + py::object get_info(cl_kernel_info param_name) const { switch (param_name) diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index e68c7851..3dc46c60 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -470,6 +470,20 @@ void pyopencl_expose_part_2(py::module &m) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif + .def("_set_arg_multi", + [](cls &knl, py::tuple indices, py::tuple args) + { + cls::set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, + indices, args); + }) + .def("_set_arg_buf_multi", + [](cls &knl, py::tuple indices, py::tuple args) + { + cls::set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, + indices, args); + }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) -- GitLab From f80c664fb8fd0a37d94a64bfcef3f9090b7df802 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 12:16:48 -0600 Subject: [PATCH 31/47] Pack indices and args into the same tuple for set_arg*multi --- pyopencl/invoker.py | 54 ++++++++++++++++++++---------------------- src/wrap_cl.hpp | 12 ++++------ src/wrap_cl_part_2.cpp | 8 +++---- 3 files changed, 35 insertions(+), 39 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index e452aa95..fec2a2ca 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -59,12 +59,13 @@ def generate_generic_arg_handling_body(num_args): if num_args == 0: gen("pass") else: - gen_arg_indices = list(range(num_args)) - gen_args = [f"arg{i}" for i in gen_arg_indices] + gen_indices_and_args = [] + for i in range(num_args): + gen_indices_and_args.append(i) + gen_indices_and_args.append(f"arg{i}") gen(f"self._set_arg_multi(" - f"({', '.join(str(i) for i in gen_arg_indices)},), " - f"({', '.join(gen_args)},)" + f"({', '.join(str(i) for i in gen_indices_and_args)},), " ")") return gen @@ -89,17 +90,15 @@ def generate_specific_arg_handling_body(function_name, if not scalar_arg_dtypes: gen("pass") - gen_arg_indices = [] - gen_args = [] - buf_arg_indices = [] - buf_args = [] + gen_indices_and_args = [] + buf_args_indices = [] for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): arg_var = "arg%d" % arg_idx if arg_dtype is None: - gen_arg_indices.append(cl_arg_idx) - gen_args.append(arg_var) + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(arg_var) cl_arg_idx += 1 gen("") continue @@ -107,8 +106,8 @@ def generate_specific_arg_handling_body(function_name, arg_dtype = np.dtype(arg_dtype) if arg_dtype.char == "V": - gen_arg_indices.append(cl_arg_idx) - gen_args.append(arg_var) + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -129,11 +128,11 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - buf_arg_indices.append(cl_arg_idx) - buf_args.append(f"pack('{arg_char}', {arg_var}.real)") + buf_args_indices.append(cl_arg_idx) + buf_args_indices.append(f"pack('{arg_char}', {arg_var}.real)") cl_arg_idx += 1 - buf_arg_indices.append(cl_arg_idx) - buf_args.append(f"pack('{arg_char}', {arg_var}.imag)") + buf_args_indices.append(cl_arg_idx) + buf_args_indices.append(f"pack('{arg_char}', {arg_var}.imag)") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -145,8 +144,8 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - buf_arg_indices.append(cl_arg_idx) - buf_args.append( + buf_args_indices.append(cl_arg_idx) + buf_args_indices.append( f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 @@ -158,21 +157,20 @@ def generate_specific_arg_handling_body(function_name, arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - buf_arg_indices.append(cl_arg_idx) - buf_args.append(f"pack('{arg_char}', {arg_var})") + buf_args_indices.append(cl_arg_idx) + buf_args_indices.append(f"pack('{arg_char}', {arg_var})") cl_arg_idx += 1 gen("") - for arg_kind, indices, args in [ - ("", gen_arg_indices, gen_args), - ("_buf", buf_arg_indices, buf_args) + for arg_kind, args_and_indices in [ + ("", gen_indices_and_args), + ("_buf", buf_args_indices) ]: - assert len(indices) == len(args) - if indices: + assert len(args_and_indices) % 2 == 0 + if args_and_indices: gen(f"self._set_arg{arg_kind}_multi(" - f"({', '.join(str(i) for i in indices)},), " - f"({', '.join(args)},)" + f"({', '.join(str(i) for i in args_and_indices)},), " ")") if cl_arg_idx != num_cl_args: @@ -249,7 +247,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v29", + "pyopencl-invoker-cache-v30", key_builder=_NumpyTypesKeyBuilder()) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 3210eb09..89afb8d6 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4456,8 +4456,7 @@ namespace pyopencl static void set_arg_multi( std::function set_arg_func, - py::tuple indices, - py::tuple args) + py::tuple args_and_indices) { // This is an internal interface used by generated invokers. // We can save a tiny bit of time by not checking their work. @@ -4470,14 +4469,13 @@ namespace pyopencl cl_uint arg_index; py::handle arg_value; - auto indices_it = indices.begin(), args_it = args.begin(), - indices_end = indices.end(); + auto it = args_and_indices.begin(), end = args_and_indices.end(); try { - while (indices_it != indices_end) + while (it != end) { - arg_index = py::cast(*indices_it++); - arg_value = *args_it++; + arg_index = py::cast(*it++); + arg_value = *it++; set_arg_func(arg_index, arg_value); } } diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 3dc46c60..d4b60f64 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -471,18 +471,18 @@ void pyopencl_expose_part_2(py::module &m) .def("_set_arg_svm", &cls::set_arg_svm) #endif .def("_set_arg_multi", - [](cls &knl, py::tuple indices, py::tuple args) + [](cls &knl, py::tuple indices_and_args) { cls::set_arg_multi( [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, - indices, args); + indices_and_args); }) .def("_set_arg_buf_multi", - [](cls &knl, py::tuple indices, py::tuple args) + [](cls &knl, py::tuple indices_and_args) { cls::set_arg_multi( [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, - indices, args); + indices_and_args); }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 -- GitLab From a80586c8c25d3252b1e98882753a1d1df310e37b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 18:59:39 -0600 Subject: [PATCH 32/47] Introduce Kernel._set_arg_buf_pack_multi for inlined buffer packing+arg setting --- pyopencl/invoker.py | 45 ++++++----- src/wrap_cl.hpp | 164 ++++++++++++++++++++++++++++------------- src/wrap_cl_part_2.cpp | 12 ++- 3 files changed, 149 insertions(+), 72 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index fec2a2ca..52548820 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -22,7 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import sys import numpy as np from warnings import warn @@ -31,9 +30,6 @@ from pytools.persistent_dict import WriteOncePersistentDict from pytools.py_codegen import Indentation, PythonCodeGenerator from pyopencl.tools import _NumpyTypesKeyBuilder -_PYPY = "__pypy__" in sys.builtin_module_names - - # {{{ arg packing helpers _size_t_char = ({ @@ -75,6 +71,9 @@ def generate_generic_arg_handling_body(num_args): # {{{ specific arg handling body +BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"] + + def generate_specific_arg_handling_body(function_name, num_cl_args, scalar_arg_dtypes, work_around_arg_count_bug, warn_about_arg_count_bug): @@ -91,7 +90,17 @@ def generate_specific_arg_handling_body(function_name, gen("pass") gen_indices_and_args = [] - buf_args_indices = [] + buf_indices_and_args = [] + buf_pack_indices_and_args = [] + + def add_buf_arg(arg_idx, typechar, expr_str): + if typechar in BUF_PACK_TYPECHARS: + buf_pack_indices_and_args.append(cl_arg_idx) + buf_pack_indices_and_args.append(repr(typechar.encode())) + buf_pack_indices_and_args.append(expr_str) + else: + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): arg_var = "arg%d" % arg_idx @@ -106,8 +115,8 @@ def generate_specific_arg_handling_body(function_name, arg_dtype = np.dtype(arg_dtype) if arg_dtype.char == "V": - gen_indices_and_args.append(cl_arg_idx) - gen_indices_and_args.append(arg_var) + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -128,11 +137,9 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - buf_args_indices.append(cl_arg_idx) - buf_args_indices.append(f"pack('{arg_char}', {arg_var}.real)") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.real") cl_arg_idx += 1 - buf_args_indices.append(cl_arg_idx) - buf_args_indices.append(f"pack('{arg_char}', {arg_var}.imag)") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.imag") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -144,8 +151,8 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - buf_args_indices.append(cl_arg_idx) - buf_args_indices.append( + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append( f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 @@ -157,15 +164,15 @@ def generate_specific_arg_handling_body(function_name, arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - buf_args_indices.append(cl_arg_idx) - buf_args_indices.append(f"pack('{arg_char}', {arg_var})") + add_buf_arg(cl_arg_idx, arg_char, arg_var) cl_arg_idx += 1 gen("") - for arg_kind, args_and_indices in [ - ("", gen_indices_and_args), - ("_buf", buf_args_indices) + for arg_kind, args_and_indices, entry_length in [ + ("", gen_indices_and_args, 2), + ("_buf", buf_indices_and_args, 2), + ("_buf_pack", buf_pack_indices_and_args, 3), ]: assert len(args_and_indices) % 2 == 0 if args_and_indices: @@ -247,7 +254,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v30", + "pyopencl-invoker-cache-v34", key_builder=_NumpyTypesKeyBuilder()) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 89afb8d6..58663e0c 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4372,6 +4372,40 @@ namespace pyopencl (m_kernel, arg_index, sizeof(cl_command_queue), &q)); } + void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj) + { +#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \ + case TYPECH_VAL: \ + { \ + TYPE val = py::cast(obj); \ + PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \ + break; \ + } + + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * py_typechar, on purpose. + */ + switch (*PyBytes_AS_STRING(py_typechar.ptr())) + { + PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double) + default: + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "invalid type char"); + } +#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG + } + void set_arg_buf(cl_uint arg_index, py::handle py_buffer) { const void *buf; @@ -4453,57 +4487,6 @@ namespace pyopencl set_arg_buf(arg_index, arg); } - static - void set_arg_multi( - std::function set_arg_func, - py::tuple args_and_indices) - { - // This is an internal interface used by generated invokers. - // We can save a tiny bit of time by not checking their work. - /* - if (indices.size() != args.size()) - throw error("Kernel.set_arg_multi", CL_INVALID_VALUE, - "indices and args arguments do not have the same length"); - */ - - cl_uint arg_index; - py::handle arg_value; - - auto it = args_and_indices.begin(), end = args_and_indices.end(); - try - { - while (it != end) - { - arg_index = py::cast(*it++); - arg_value = *it++; - set_arg_func(arg_index, arg_value); - } - } - catch (error &err) - { - std::string msg( - std::string("when processing arg#") + std::to_string(arg_index+1) - + std::string(" (1-based): ") + std::string(err.what())); - - auto mod_cl_ary(py::module::import("pyopencl.array")); - auto cls_array(mod_cl_ary.attr("Array")); - if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) - msg.append( - " (perhaps you meant to pass 'array.data' instead of the array itself?)"); - - throw error(err.routine().c_str(), err.code(), msg.c_str()); - } - catch (std::exception &err) - { - std::string msg( - std::string("when processing arg#") + std::to_string(arg_index+1) - + std::string(" (1-based): ") + std::string(err.what())); - - throw std::runtime_error(msg.c_str()); - } - } - - py::object get_info(cl_kernel_info param_name) const { switch (param_name) @@ -4668,6 +4651,85 @@ namespace pyopencl #endif }; +#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \ + catch (error &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + auto mod_cl_ary(py::module::import("pyopencl.array")); \ + auto cls_array(mod_cl_ary.attr("Array")); \ + if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \ + msg.append( \ + " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \ + throw error(err.routine().c_str(), err.code(), msg.c_str()); \ + } \ + catch (std::exception &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + throw std::runtime_error(msg.c_str()); \ + } + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_value = *it++; + set_arg_func(arg_index, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_descr, arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_descr = *it++; + arg_value = *it++; + set_arg_func(arg_index, arg_descr, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + inline py::list create_kernels_in_program(program &pgm) diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index d4b60f64..205b31ec 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -473,17 +473,25 @@ void pyopencl_expose_part_2(py::module &m) .def("_set_arg_multi", [](cls &knl, py::tuple indices_and_args) { - cls::set_arg_multi( + set_arg_multi( [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, indices_and_args); }) .def("_set_arg_buf_multi", [](cls &knl, py::tuple indices_and_args) { - cls::set_arg_multi( + set_arg_multi( [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, indices_and_args); }) + .def("_set_arg_buf_pack_multi", + [](cls &knl, py::tuple indices_chars_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle typechar, py::handle arg) + { knl.set_arg_buf_pack(i, typechar, arg); }, + indices_chars_and_args); + }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) -- GitLab From 9ea089aaf7cb6368d5f605b641e67e996f96ecad Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 19:10:50 -0600 Subject: [PATCH 33/47] Tweak examples to emphasize importance of retaining kernels --- examples/demo.py | 3 ++- examples/demo_array.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/demo.py b/examples/demo.py index 623660fe..a4a50333 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -23,7 +23,8 @@ __kernel void sum( """).build() res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) -prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a_np.shape, None, a_g, b_g, res_g) res_np = np.empty_like(a_np) cl.enqueue_copy(queue, res_np, res_g) diff --git a/examples/demo_array.py b/examples/demo_array.py index 41b0f79e..74bb7cfc 100644 --- a/examples/demo_array.py +++ b/examples/demo_array.py @@ -22,6 +22,7 @@ prg = cl.Program(ctx, """ } """).build() -prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get())) -- GitLab From d6e0e1fbe5f976efa2d6f207bb2f0fb7dfdde519 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 19:15:15 -0600 Subject: [PATCH 34/47] Fix lenght assert on args_and_indices in invoker generation --- pyopencl/invoker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 52548820..c8f39f1d 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -174,7 +174,7 @@ def generate_specific_arg_handling_body(function_name, ("_buf", buf_indices_and_args, 2), ("_buf_pack", buf_pack_indices_and_args, 3), ]: - assert len(args_and_indices) % 2 == 0 + assert len(args_and_indices) % entry_length == 0 if args_and_indices: gen(f"self._set_arg{arg_kind}_multi(" f"({', '.join(str(i) for i in args_and_indices)},), " -- GitLab From 60d5f39a5500dfd9b770b15f56b7af3dac83a784 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 19:46:23 -0600 Subject: [PATCH 35/47] Drop absurd non-error-checking for typechar --- src/wrap_cl.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 58663e0c..f00a6b88 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4374,6 +4374,13 @@ namespace pyopencl void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj) { + std::string typechar_str(py::cast(py_typechar)); + if (typechar_str.size() != 1) + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "type char argument must have exactly one character"); + + char typechar = typechar_str[0]; + #define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \ case TYPECH_VAL: \ { \ @@ -4381,12 +4388,7 @@ namespace pyopencl PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \ break; \ } - - /* This is an internal interface that assumes it gets fed well-formed - * data. No meaningful error checking is being performed on - * py_typechar, on purpose. - */ - switch (*PyBytes_AS_STRING(py_typechar.ptr())) + switch (typechar) { PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char) PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char) -- GitLab From 3070eb99c778dba9f372002b5417eac5beb516df Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 19 Jan 2021 23:58:15 -0600 Subject: [PATCH 36/47] Fix scope leakage in add_buf_arg --- pyopencl/invoker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index c8f39f1d..ccf423d3 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -95,11 +95,11 @@ def generate_specific_arg_handling_body(function_name, def add_buf_arg(arg_idx, typechar, expr_str): if typechar in BUF_PACK_TYPECHARS: - buf_pack_indices_and_args.append(cl_arg_idx) + buf_pack_indices_and_args.append(arg_idx) buf_pack_indices_and_args.append(repr(typechar.encode())) buf_pack_indices_and_args.append(expr_str) else: - buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_idx) buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): -- GitLab From 89c38431d343f5b1a92dab4b5689c14159f5a865 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 11:54:11 -0600 Subject: [PATCH 37/47] Make VectorArg, ScalarArg comparable and add persistent-dict key generation for them --- pyopencl/tools.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 5b814685..d78070b0 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -334,10 +334,15 @@ class DtypedArgument(Argument): self.name, self.dtype) + def __eq__(self, other): + return (type(self) == type(other) + and self.dtype == other.dtype + and self.name == other.name) + class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): - DtypedArgument.__init__(self, dtype, name) + super().__init__(dtype, name) self.with_offset = with_offset def declarator(self): @@ -350,6 +355,10 @@ class VectorArg(DtypedArgument): return result + def __eq__(self, other): + return (super().__eq__(other) + and self.with_offset == other.with_offset) + class ScalarArg(DtypedArgument): def declarator(self): @@ -1025,6 +1034,11 @@ def is_spirv(s): # {{{ numpy key types builder class _NumpyTypesKeyBuilder(KeyBuilderBase): + def update_for_VectorArg(self, key_hash, key): # noqa: N802 + self.rec(key_hash, key.dtype) + self.update_for_str(key_hash, key.name) + self.rec(key_hash, key.with_offset) + def update_for_type(self, key_hash, key): if issubclass(key, np.generic): self.update_for_str(key_hash, key.__name__) -- GitLab From 5dee903563ad201a17b70fab51cd0cbb6e1b02cb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 11:56:32 -0600 Subject: [PATCH 38/47] Incorporate argument-processing bits of kernel_runner into generated invoker --- pyopencl/__init__.py | 14 +++++--- pyopencl/array.py | 59 +++++------------------------- pyopencl/elementwise.py | 19 +++------- pyopencl/invoker.py | 80 ++++++++++++++++++++++++++++++----------- pyopencl/tools.py | 14 ++++++++ 5 files changed, 97 insertions(+), 89 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 775687ce..1d304fd2 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -819,8 +819,8 @@ def _add_functionality(): self._wg_info_cache = {} return self - def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes): - self._scalar_arg_dtypes = tuple(scalar_arg_dtypes) + def kernel_set_arg_types(self, arg_types): + arg_types = tuple(arg_types) # {{{ arg counting bug handling @@ -850,8 +850,8 @@ def _add_functionality(): enqueue, my_set_args = \ generate_enqueue_and_set_args( self.function_name, - len(scalar_arg_dtypes), self.num_args, - self._scalar_arg_dtypes, + len(arg_types), self.num_args, + arg_types, warn_about_arg_count_bug=warn_about_arg_count_bug, work_around_arg_count_bug=work_around_arg_count_bug) @@ -903,7 +903,11 @@ def _add_functionality(): Kernel.__init__ = kernel_init Kernel._setup = kernel__setup Kernel.get_work_group_info = kernel_get_work_group_info - Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes + + # FIXME: Possibly deprecate this version + Kernel.set_scalar_arg_dtypes = kernel_set_arg_types + Kernel.set_arg_types = kernel_set_arg_types + Kernel.set_args = kernel_set_args Kernel.__call__ = kernel_call Kernel.capture_call = kernel_capture_call diff --git a/pyopencl/array.py b/pyopencl/array.py index 56c055ae..7f65bf6e 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -169,12 +169,6 @@ def elwise_kernel_runner(kernel_getter): wait_for = kwargs.pop("wait_for", None) - # wait_for must be a copy, because we modify it in-place below - if wait_for is None: - wait_for = [] - else: - wait_for = list(wait_for) - knl = kernel_getter(*args, **kwargs) gs, ls = repr_ary._get_sizes(queue, @@ -183,31 +177,9 @@ def elwise_kernel_runner(kernel_getter): queue.device)) assert isinstance(repr_ary, Array) + args = args + (repr_ary.size,) - actual_args = [] - for arg in args: - if isinstance(arg, Array): - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - actual_args.append(arg.base_data) - actual_args.append(arg.offset) - wait_for.extend(arg.events) - - if (implicit_queue - and arg.queue is not None - and arg.queue != queue): - from warnings import warn - - warn("Implicit queue in elementwise operation does not match " - "queue of a provided argument. This will become an " - "error in 2021.", - type=InconsistentOpenCLQueueWarning) - else: - actual_args.append(arg) - actual_args.append(repr_ary.size) - - return knl(queue, gs, ls, *actual_args, wait_for=wait_for) + return knl(queue, gs, ls, *args, wait_for=wait_for) try: from functools import update_wrapper @@ -2403,19 +2375,13 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (dest_indices.events + src_indices.events + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, - *([o.data for o in out[chunk_slice]] - + [dest_indices.base_data, - dest_indices.offset, - src_indices.base_data, - src_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) + *([o for o in out[chunk_slice]] + + [dest_indices, src_indices] + + [i for i in arrays[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: @@ -2486,22 +2452,15 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (wait_for + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( - list(flatten( - (o.base_data, o.offset) - for o in out[chunk_slice])) - + [dest_indices.base_data, dest_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) - + [use_fill_cla.base_data, use_fill_cla.offset] - + [array_lengths_cla.base_data, array_lengths_cla.offset] - + [dest_indices.size]), + [o for o in out[chunk_slice]] + + [dest_indices] + + [i for i in arrays[chunk_slice]] + + [use_fill_cla, array_lengths_cla, dest_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 039cc7cf..fcb94ee9 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -156,10 +156,10 @@ def get_elwise_kernel_and_types(context, arguments, operation, name=name, options=options, preamble=preamble, use_range=use_range, loop_prep=loop_prep, **kwargs) - from pyopencl.tools import get_arg_list_scalar_arg_dtypes + from pyopencl.tools import get_arg_list_arg_types kernel = getattr(prg, name) - kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) + kernel.set_scalar_arg_dtypes(get_arg_list_arg_types(parsed_args)) return kernel, parsed_args @@ -260,17 +260,10 @@ class ElementwiseKernel: invocation_args = [] for arg, arg_descr in zip(args, arg_descrs): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("ElementwiseKernel cannot " - "deal with non-contiguous arrays") - if repr_vec is None: repr_vec = arg - invocation_args.append(arg.base_data) - if arg_descr.with_offset: - invocation_args.append(arg.offset) - wait_for.extend(arg.events) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -317,13 +310,11 @@ class ElementwiseKernel: if capture_as is not None: kernel.set_args(*invocation_args) - kernel.capture_call( + return kernel.capture_call( capture_as, queue, gs, ls, *invocation_args, wait_for=wait_for) - kernel.set_args(*invocation_args) - return cl.enqueue_nd_range_kernel(queue, kernel, - gs, ls, wait_for=wait_for) + return kernel(queue, gs, ls, *invocation_args, wait_for=wait_for) # }}} diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index ccf423d3..77f90207 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -28,7 +28,8 @@ from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict from pytools.py_codegen import Indentation, PythonCodeGenerator -from pyopencl.tools import _NumpyTypesKeyBuilder +from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg + # {{{ arg packing helpers @@ -75,8 +76,9 @@ BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"] def generate_specific_arg_handling_body(function_name, - num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + num_cl_args, arg_types, + work_around_arg_count_bug, warn_about_arg_count_bug, + in_enqueue): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -86,7 +88,7 @@ def generate_specific_arg_handling_body(function_name, gen = PythonCodeGenerator() - if not scalar_arg_dtypes: + if not arg_types: gen("pass") gen_indices_and_args = [] @@ -102,17 +104,53 @@ def generate_specific_arg_handling_body(function_name, buf_indices_and_args.append(arg_idx) buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") - for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): + if in_enqueue and arg_types is not None and \ + any(isinstance(arg_type, VectorArg) for arg_type in arg_types): + # We're about to modify wait_for, make sure it's a copy. + gen(""" + if wait_for is None: + wait_for = [] + else: + wait_for = list(wait_for) + """) + gen("") + + for arg_idx, arg_type in enumerate(arg_types): arg_var = "arg%d" % arg_idx - if arg_dtype is None: + if arg_type is None: gen_indices_and_args.append(cl_arg_idx) gen_indices_and_args.append(arg_var) cl_arg_idx += 1 gen("") continue - arg_dtype = np.dtype(arg_dtype) + elif isinstance(arg_type, VectorArg): + gen(f"if not {arg_var}.flags.forc:") + with Indentation(gen): + gen("raise RuntimeError('only contiguous arrays may '") + gen(" 'be used as arguments to this operation')") + gen("") + + if in_enqueue: + gen(f"assert {arg_var}.queue is None or {arg_var}.queue == queue, " + "'queues for all arrays must match the queue supplied " + "to enqueue'") + + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(f"{arg_var}.base_data") + cl_arg_idx += 1 + + if arg_type.with_offset: + add_buf_arg(cl_arg_idx, np.dtype(np.int64).char, f"{arg_var}.offset") + cl_arg_idx += 1 + + if in_enqueue: + gen(f"wait_for.extend({arg_var}.events)") + + continue + + arg_dtype = np.dtype(arg_type) if arg_dtype.char == "V": buf_indices_and_args.append(cl_arg_idx) @@ -193,18 +231,20 @@ def generate_specific_arg_handling_body(function_name, def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): arg_names = ["arg%d" % i for i in range(num_passed_args)] - if scalar_arg_dtypes is None: - body = generate_generic_arg_handling_body(num_passed_args) - else: - body = generate_specific_arg_handling_body( - function_name, num_cl_args, scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + def gen_arg_setting(in_enqueue): + if arg_types is None: + return generate_generic_arg_handling_body(num_passed_args) + else: + return generate_specific_arg_handling_body( + function_name, num_cl_args, arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug, + in_enqueue=in_enqueue) gen = PythonCodeGenerator() @@ -228,7 +268,7 @@ def _generate_enqueue_and_set_args_module(function_name, "wait_for=None"]))) with Indentation(gen): - gen.extend(body) + gen.extend(gen_arg_setting(in_enqueue=True)) # Using positional args here because pybind is slow with keyword args gen(""" @@ -246,7 +286,7 @@ def _generate_enqueue_and_set_args_module(function_name, % (", ".join(["self"] + arg_names))) with Indentation(gen): - gen.extend(body) + gen.extend(gen_arg_setting(in_enqueue=False)) # }}} @@ -254,17 +294,17 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v34", + "pyopencl-invoker-cache-v38", key_builder=_NumpyTypesKeyBuilder()) def generate_enqueue_and_set_args(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): cache_key = (function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug) from_cache = False diff --git a/pyopencl/tools.py b/pyopencl/tools.py index d78070b0..4800b15b 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -411,6 +411,20 @@ def parse_arg_list(arguments, with_offset=False): return [parse_single_arg(arg) for arg in arguments] +def get_arg_list_arg_types(arg_types): + result = [] + + for arg_type in arg_types: + if isinstance(arg_type, ScalarArg): + result.append(arg_type.dtype) + elif isinstance(arg_type, VectorArg): + result.append(arg_type) + else: + raise RuntimeError("arg type not understood: %s" % type(arg_type)) + + return tuple(result) + + def get_arg_list_scalar_arg_dtypes(arg_types): result = [] -- GitLab From f0308dceec423b90ae9e7743cc093f71352cfb8f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 12:01:26 -0600 Subject: [PATCH 39/47] Add POCL Nvidia jobs to Gitlab CI --- .gitlab-ci.yml | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index deaf28aa..43149559 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,7 @@ Python 3 Intel CPU: reports: junit: test/pytest.xml -Python 3 Titan X: +Python 3 Nvidia Titan X: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -31,7 +31,7 @@ Python 3 Titan X: reports: junit: test/pytest.xml -Python 3 Titan V: +Python 3 Nvidia Titan V: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -47,7 +47,7 @@ Python 3 Titan V: reports: junit: test/pytest.xml -Python 3 K40: +Python 3 Nvidia K40: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:k40 @@ -117,16 +117,34 @@ Python 3 POCL CL 1.1: reports: junit: test/pytest.xml -Python 3 POCL: +Python 3 POCL K40: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable:pthread + - export PYOPENCL_TEST=portable:k40 + - export EXTRA_INSTALL="pybind11 numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3 + - pocl + - nvidia-k40 + except: + - tags + artifacts: + reports: + junit: test/pytest.xml + +Python 3 POCL Titan V: + script: + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable:titan - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 - pocl + - nvidia-titan-v except: - tags artifacts: -- GitLab From 73faf922bf97095fbdb3568ed8df6b04637d01ea Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 12:15:45 -0600 Subject: [PATCH 40/47] Xfail a few tests in test_wrapper on POCL+Nvidia --- test/test_wrapper.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 8aad416e..1068779b 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -46,10 +46,17 @@ else: faulthandler.enable() -def _skip_if_pocl(plat, up_to_version, msg="unsupported by pocl"): +def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"): if plat.vendor == "The pocl project": if up_to_version is None or get_pocl_version(plat) <= up_to_version: - pytest.skip(msg) + pytest.xfail(msg) + + +def _xfail_if_pocl_gpu(device, what): + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail(f"POCL's {what} support don't work right on Nvidia GPUs, " + "at least the Titan V, as of pocl 1.6, 2021-01-20") def test_get_info(ctx_factory): @@ -382,7 +389,7 @@ def test_image_2d(ctx_factory): if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image( @@ -454,7 +461,7 @@ def test_image_3d(ctx_factory): if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image_plane( @@ -676,7 +683,7 @@ def test_can_build_and_run_binary(ctx_factory): def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() # Still relevant on pocl 1.0RC1. - _skip_if_pocl( + _xfail_if_pocl( ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier") queue = cl.CommandQueue(ctx) @@ -704,7 +711,7 @@ def test_unload_compiler(platform): or cl.get_cl_header_version() < (1, 2)): from pytest import skip skip("clUnloadPlatformCompiler is only available in OpenCL 1.2") - _skip_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") + _xfail_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") if platform.vendor == "Intel(R) Corporation": from pytest import skip skip("Intel proprietary driver does not support unloading compiler") @@ -745,7 +752,7 @@ def test_user_event(ctx_factory): skip("UserEvent is only available in OpenCL 1.1") # https://github.com/pocl/pocl/issues/201 - _skip_if_pocl(ctx.devices[0].platform, (0, 13), + _xfail_if_pocl(ctx.devices[0].platform, (0, 13), "pocl's user events don't work right") status = {} @@ -842,6 +849,8 @@ def test_event_set_callback(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "event callbacks") + if ctx._get_cl_version() < (1, 1): pytest.skip("OpenCL 1.1 or newer required for set_callback") @@ -896,6 +905,8 @@ def test_global_offset(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + _xfail_if_pocl_gpu(queue.device, "global offset") + prg = cl.Program(context, """ __kernel void mult(__global float *a) { a[get_global_id(0)] *= 2; } @@ -981,6 +992,8 @@ def test_coarse_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "SVM") + dev = ctx.devices[0] from pyopencl.characterize import has_coarse_grain_buffer_svm @@ -1062,6 +1075,8 @@ def test_fine_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "GPU SVM") + from pyopencl.characterize import has_fine_grain_buffer_svm from pytest import skip if not has_fine_grain_buffer_svm(queue.device): @@ -1148,6 +1163,8 @@ def test_copy_buffer_rect(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "rectangular copies") + arr1 = cl_array.zeros(queue, (2, 3), "f") arr2 = cl_array.zeros(queue, (4, 5), "f") arr1.fill(1) -- GitLab From 0abbbaffef8b435bc62a0049a12891bcc9658118 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 12:42:21 -0600 Subject: [PATCH 41/47] Make {Scalar,Vector,Other}Arg hashable --- pyopencl/tools.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 4800b15b..3be95ee2 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -339,6 +339,12 @@ class DtypedArgument(Argument): and self.dtype == other.dtype and self.name == other.name) + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.dtype) + ^ hash(self.name)) + class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): @@ -359,6 +365,9 @@ class VectorArg(DtypedArgument): return (super().__eq__(other) and self.with_offset == other.with_offset) + def __hash__(self): + return super.__hash__() ^ hash(self.with_offset) + class ScalarArg(DtypedArgument): def declarator(self): @@ -373,6 +382,17 @@ class OtherArg(Argument): def declarator(self): return self.decl + def __eq__(self, other): + return (type(self) == type(other) + and self.decl == other.decl + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.decl) + ^ hash(self.name)) + def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: -- GitLab From 385a8fefde149fe7dce25b2cc28c0c62de1b8781 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 13:47:52 -0600 Subject: [PATCH 42/47] Xfail bitonic argsort on pocl nvidia --- test/test_algorithm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 353af281..676aee37 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -1087,6 +1087,12 @@ def test_bitonic_argsort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("bitonic argsort fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + dev = ctx.devices[0] if (dev.platform.name == "Portable Computing Language" and sys.platform == "darwin"): -- GitLab From 50ee7ae805cfffc93222a7a065a73b6430ce45ee Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 15:22:42 -0600 Subject: [PATCH 43/47] Xfail test_array test_get_async on Pocl GPU --- test/test_array.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_array.py b/test/test_array.py index b4234a37..2821d923 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1350,6 +1350,12 @@ def test_get_async(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("the async get test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) b = a + a**5 + 1 -- GitLab From 7279388562ab33a7dff99da0477c1ad5eac1b972 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 16:27:31 -0600 Subject: [PATCH 44/47] Xfail ranlux on on POCL + Nvidia --- test/test_clrandom.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_clrandom.py b/test/test_clrandom.py index 1ce479b2..1ec53842 100644 --- a/test/test_clrandom.py +++ b/test/test_clrandom.py @@ -68,6 +68,13 @@ def test_clrandom_dtypes(ctx_factory, rng_class, dtype): size = 10 with cl.CommandQueue(cl_ctx) as queue: + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is make_ranlux_generator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + rng.uniform(queue, size, dtype) if dtype not in (np.int32, np.int64): -- GitLab From 67984a7aa0dd8682679badb1c2f53f91ac194a52 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 16:45:08 -0600 Subject: [PATCH 45/47] Xfail rect copies on Pocl GPU --- test/test_enqueue_copy.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py index 55f72d95..402bc8b9 100644 --- a/test/test_enqueue_copy.py +++ b/test/test_enqueue_copy.py @@ -48,8 +48,14 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": - pytest.skip("Apple's CL implementation crashes on this.") + pytest.xfail("Apple's CL implementation crashes on this.") ary_in_shp = 256, 128 # Entire array shape from which sub-array copied to device sub_ary_shp = 128, 96 # Sub-array shape to be copied to device @@ -138,6 +144,12 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": pytest.skip("Apple's CL implementation crashes on this.") -- GitLab From 2f39f7022f478bb45a6f211f8550a4b275b909f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 20 Jan 2021 17:25:23 -0600 Subject: [PATCH 46/47] Elementwise capture_as implementation: don't return after capture_call --- pyopencl/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index fcb94ee9..dae42b7e 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -310,7 +310,7 @@ class ElementwiseKernel: if capture_as is not None: kernel.set_args(*invocation_args) - return kernel.capture_call( + kernel.capture_call( capture_as, queue, gs, ls, *invocation_args, wait_for=wait_for) -- GitLab From d280d498d0c4c477ec2851799912b3ecebfa64f9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 20 Jan 2021 18:13:35 -0600 Subject: [PATCH 47/47] Xfail a few more ranlux tests on pocl+nvidia --- test/test_array.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_array.py b/test/test_array.py index 2821d923..899fc238 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -715,6 +715,13 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: @@ -767,6 +774,12 @@ def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) + if queue.device.platform.vendor == "The pocl project" \ + and queue.device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if rng_class is RanluxGenerator: gen = rng_class(queue, 5120) else: -- GitLab