diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index deaf28aa64dc2037e0d942fefa2cf9ec125b77b4..431495595c2d0ca4ba65380e5f38b8b6e579dd77 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,7 @@ Python 3 Intel CPU: reports: junit: test/pytest.xml -Python 3 Titan X: +Python 3 Nvidia Titan X: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -31,7 +31,7 @@ Python 3 Titan X: reports: junit: test/pytest.xml -Python 3 Titan V: +Python 3 Nvidia Titan V: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -47,7 +47,7 @@ Python 3 Titan V: reports: junit: test/pytest.xml -Python 3 K40: +Python 3 Nvidia K40: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:k40 @@ -117,16 +117,34 @@ Python 3 POCL CL 1.1: reports: junit: test/pytest.xml -Python 3 POCL: +Python 3 POCL K40: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable:pthread + - export PYOPENCL_TEST=portable:k40 + - export EXTRA_INSTALL="pybind11 numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3 + - pocl + - nvidia-k40 + except: + - tags + artifacts: + reports: + junit: test/pytest.xml + +Python 3 POCL Titan V: + script: + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable:titan - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 - pocl + - nvidia-titan-v except: - tags artifacts: diff --git a/examples/demo.py b/examples/demo.py index 623660fee1b20b9ba140504ca594cc648e28bc45..a4a503336e126cf5a392fc32f040166c6d92b939 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -23,7 +23,8 @@ __kernel void sum( """).build() res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) -prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a_np.shape, None, a_g, b_g, res_g) res_np = np.empty_like(a_np) cl.enqueue_copy(queue, res_np, res_g) diff --git a/examples/demo_array.py b/examples/demo_array.py index 41b0f79ef2ccb74a807a8da5aff5eedf6a3bb15f..74bb7cfc6fead21ff0a0bb29266e82541586aa9e 100644 --- a/examples/demo_array.py +++ b/examples/demo_array.py @@ -22,6 +22,7 @@ prg = cl.Program(ctx, """ } """).build() -prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get())) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 6e0268dc1f5ba748d85af0586e773cf17182eb38..1d304fd2447425785b99b17adaccf14e832bc3a7 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -819,8 +819,8 @@ def _add_functionality(): self._wg_info_cache = {} return self - def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes): - self._scalar_arg_dtypes = tuple(scalar_arg_dtypes) + def kernel_set_arg_types(self, arg_types): + arg_types = tuple(arg_types) # {{{ arg counting bug handling @@ -847,21 +847,31 @@ def _add_functionality(): # }}} from pyopencl.invoker import generate_enqueue_and_set_args - self._enqueue, self._set_args = generate_enqueue_and_set_args( - self.function_name, - len(scalar_arg_dtypes), self.num_args, - self._scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + enqueue, my_set_args = \ + 
generate_enqueue_and_set_args( + self.function_name, + len(arg_types), self.num_args, + arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug) + + # Make ourselves a kernel-specific class, so that we're able to override + # __call__. Inspired by https://stackoverflow.com/a/38541437 + class KernelWithCustomEnqueue(type(self)): + __call__ = enqueue + set_args = my_set_args + + self.__class__ = KernelWithCustomEnqueue def kernel_get_work_group_info(self, param, device): + cache_key = (param, device.int_ptr) try: - return self._wg_info_cache[param, device] + return self._wg_info_cache[cache_key] except KeyError: pass result = kernel_old_get_work_group_info(self, param, device) - self._wg_info_cache[param, device] = result + self._wg_info_cache[cache_key] = result return result def kernel_set_args(self, *args, **kwargs): @@ -871,6 +881,9 @@ def _add_functionality(): def kernel_call(self, queue, global_size, local_size, *args, **kwargs): # __call__ can't be overridden directly, so we need this # trampoline hack. + + # Note: This is only used for the generic __call__, before + # kernel_set_scalar_arg_dtypes is called. return self._enqueue(self, queue, global_size, local_size, *args, **kwargs) def kernel_capture_call(self, filename, queue, global_size, local_size, @@ -890,7 +903,11 @@ def _add_functionality(): Kernel.__init__ = kernel_init Kernel._setup = kernel__setup Kernel.get_work_group_info = kernel_get_work_group_info - Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes + + # FIXME: Possibly deprecate this version + Kernel.set_scalar_arg_dtypes = kernel_set_arg_types + Kernel.set_arg_types = kernel_set_arg_types + Kernel.set_args = kernel_set_args Kernel.__call__ = kernel_call Kernel.capture_call = kernel_capture_call diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index c4eb43eaef3b144e8b6217551a1f44ccabf2de3c..446eb9c318d8d241d0ef6fa07fb80eeea8fb57cd 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -1188,8 +1188,8 @@ class ListOfListsBuilder: gsize = (4*queue.device.max_compute_units,) lsize = (1,) else: - from pyopencl.array import splay - gsize, lsize = splay(queue, n_objects) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + data_args + (n_objects,)), diff --git a/pyopencl/array.py b/pyopencl/array.py index da5fdf0c8da6341b95f345cf0feb784b601f1f0d..e5cdda02609923c50cd8d2c8a6d5f4d33d7cbdbe 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -33,7 +33,6 @@ from functools import reduce import numpy as np import pyopencl.elementwise as elementwise import pyopencl as cl -from pytools import memoize_method from pyopencl.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, @@ -45,9 +44,38 @@ from pyopencl.characterize import has_double_support from pyopencl import cltypes +_COMMON_DTYPE_CACHE = {} + + def _get_common_dtype(obj1, obj2, queue): - return _get_common_dtype_base(obj1, obj2, - has_double_support(queue.device)) + dsupport = has_double_support(queue.device) + cache_key = None + o1_dtype = obj1.dtype + try: + cache_key = (o1_dtype, obj2.dtype, dsupport) + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + except AttributeError: + # obj2 doesn't have a dtype + try: + tobj2 = type(obj2) + cache_key = (o1_dtype, tobj2, dsupport) + + # Integers are weird, sized, and signed. 
Don't pretend that 'int' + # is enough information to decide what should happen. + if tobj2 != int: + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + + result = _get_common_dtype_base(obj1, obj2, dsupport) + + # we succeeded in constructing the cache key + if cache_key is not None: + _COMMON_DTYPE_CACHE[cache_key] = result + + return result def _get_truedivide_dtype(obj1, obj2, queue): @@ -69,19 +97,6 @@ def _get_truedivide_dtype(obj1, obj2, queue): return result -# Work around PyPy not currently supporting the object dtype. -# (Yes, it doesn't even support checking!) -# (as of May 27, 2014 on PyPy 2.3) -try: - np.dtype(object) - - def _dtype_is_object(t): - return t == object -except Exception: - def _dtype_is_object(t): - return False - - class InconsistentOpenCLQueueWarning(UserWarning): pass @@ -103,19 +118,18 @@ class VecLookupWarner: vec = VecLookupWarner() -# {{{ helper functionality +# {{{ helper functionality -def splay(queue, n, kernel_specific_max_wg_size=None): - dev = queue.device - max_work_items = _builtin_min(128, dev.max_work_group_size) +def _splay(device, n, kernel_specific_max_wg_size=None): + max_work_items = _builtin_min(128, device.max_work_group_size) if kernel_specific_max_wg_size is not None: from builtins import min max_work_items = min(max_work_items, kernel_specific_max_wg_size) min_work_items = _builtin_min(32, max_work_items) - max_groups = dev.max_compute_units * 4 * 8 + max_groups = device.max_compute_units * 4 * 8 # 4 to overfill the device # 8 is an Nvidia constant--that's how many # groups fit onto one compute device @@ -159,49 +173,21 @@ def elwise_kernel_runner(kernel_getter): wait_for = kwargs.pop("wait_for", None) - # wait_for must be a copy, because we modify it in-place below - if wait_for is None: - wait_for = [] - else: - wait_for = list(wait_for) - knl = kernel_getter(*args, **kwargs) - gs, ls = repr_ary.get_sizes(queue, + gs, ls = repr_ary._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) assert isinstance(repr_ary, Array) - - actual_args = [] - for arg in args: - if isinstance(arg, Array): - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - actual_args.append(arg.base_data) - actual_args.append(arg.offset) - wait_for.extend(arg.events) - - if (implicit_queue - and arg.queue is not None - and arg.queue != queue): - from warnings import warn - - warn("Implicit queue in elementwise operation does not match " - "queue of a provided argument. 
This will become an " - "error in 2021.", - type=InconsistentOpenCLQueueWarning) - else: - actual_args.append(arg) - actual_args.append(repr_ary.size) + args = args + (repr_ary.size,) if ARRAY_KERNEL_EXEC_HOOK is not None: return ARRAY_KERNEL_EXEC_HOOK( # pylint: disable=not-callable - knl, queue, gs, ls, *actual_args, wait_for=wait_for) + knl, queue, gs, ls, *args, wait_for=wait_for) else: - return knl(queue, gs, ls, *actual_args, wait_for=wait_for) + return knl(queue, gs, ls, *args, wait_for=wait_for) try: from functools import update_wrapper @@ -220,15 +206,6 @@ class DefaultAllocator(cl.tools.DeferredAllocator): DeprecationWarning, 2) cl.tools.DeferredAllocator.__init__(self, *args, **kwargs) - -def _make_strides(itemsize, shape, order): - if order in "fF": - return _f_contiguous_strides(itemsize, shape) - elif order in "cC": - return _c_contiguous_strides(itemsize, shape) - else: - raise ValueError("invalid order: %s" % order) - # }}} @@ -249,6 +226,9 @@ class _copy_queue: # noqa pass +_ARRAY_GET_SIZES_CACHE = {} + + class Array: """A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly @@ -435,7 +415,7 @@ class Array: __array_priority__ = 100 def __init__(self, cq, shape, dtype, order="C", allocator=None, - data=None, offset=0, strides=None, events=None): + data=None, offset=0, strides=None, events=None, _flags=None): # {{{ backward compatibility if isinstance(cq, cl.CommandQueue): @@ -479,6 +459,9 @@ class Array: size = 1 for dim in shape: size *= dim + if dim < 0: + raise ValueError("negative dimensions are not allowed") + except TypeError: admissible_types = (int, np.integer) @@ -486,16 +469,27 @@ class Array: raise TypeError("shape must either be iterable or " "castable to an integer") size = shape + if shape < 0: + raise ValueError("negative dimensions are not allowed") shape = (shape,) - if any(dim < 0 for dim in shape): - raise ValueError("negative dimensions are not allowed") - if isinstance(size, np.integer): size = size.item() if strides is None: - strides = _make_strides(dtype.itemsize, shape, order) + if order in "cC": + # inlined from compyte.array.c_contiguous_strides + if shape: + strides = [dtype.itemsize] + for s in shape[:0:-1]: + strides.append(strides[-1]*s) + strides = tuple(strides[::-1]) + else: + strides = () + elif order in "fF": + strides = _f_contiguous_strides(dtype.itemsize, shape) + else: + raise ValueError("invalid order: %s" % order) else: # FIXME: We should possibly perform some plausibility @@ -505,9 +499,8 @@ class Array: # }}} - if _dtype_is_object(dtype): - raise TypeError("object arrays on the compute device are not allowed") - + assert dtype != np.object, \ + "object arrays on the compute device are not allowed" assert isinstance(shape, tuple) assert isinstance(strides, tuple) @@ -547,6 +540,7 @@ class Array: self.offset = offset self.context = context + self._flags = _flags @property def ndim(self): @@ -560,9 +554,11 @@ class Array: return self.base_data @property - @memoize_method def flags(self): - return _ArrayFlags(self) + f = self._flags + if f is None: + self._flags = f = _ArrayFlags(self) + return f def _new_with_changes(self, data, offset, shape=None, dtype=None, strides=None, queue=_copy_queue, allocator=None): @@ -612,12 +608,17 @@ class Array: return self._new_with_changes(self.base_data, self.offset, queue=queue) - #@memoize_method FIXME: reenable - def get_sizes(self, queue, kernel_specific_max_wg_size=None): + def _get_sizes(self, queue, 
kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") - return splay(queue, self.size, - kernel_specific_max_wg_size=kernel_specific_max_wg_size) + cache_key = (queue.device.int_ptr, self.size, kernel_specific_max_wg_size) + try: + return _ARRAY_GET_SIZES_CACHE[cache_key] + except KeyError: + sizes = _splay(queue.device, self.size, + kernel_specific_max_wg_size=kernel_specific_max_wg_size) + _ARRAY_GET_SIZES_CACHE[cache_key] = sizes + return sizes def set(self, ary, queue=None, async_=None, **kwargs): """Transfer the contents the :class:`numpy.ndarray` object *ary* @@ -963,19 +964,21 @@ class Array: def _new_like_me(self, dtype=None, queue=None): strides = None + flags = None if dtype is None: dtype = self.dtype if dtype == self.dtype: strides = self.strides + flags = self.flags queue = queue or self.queue if queue is not None: return self.__class__(queue, self.shape, dtype, - allocator=self.allocator, strides=strides) + allocator=self.allocator, strides=strides, _flags=flags) else: return self.__class__(self.context, self.shape, dtype, - strides=strides, allocator=self.allocator) + strides=strides, allocator=self.allocator, _flags=flags) @staticmethod @elwise_kernel_runner @@ -2081,7 +2084,7 @@ def to_device(queue, ary, allocator=None, async_=None, # }}} - if _dtype_is_object(ary.dtype): + if ary.dtype == np.object: raise RuntimeError("to_device does not work on object arrays.") if array_queue is _same_as_transfer: @@ -2297,7 +2300,7 @@ def multi_take(arrays, indices, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = indices.get_sizes(queue, + gs, ls = indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -2375,24 +2378,18 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = src_indices.get_sizes(queue, + gs, ls = src_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (dest_indices.events + src_indices.events + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, - *([o.data for o in out[chunk_slice]] - + [dest_indices.base_data, - dest_indices.offset, - src_indices.base_data, - src_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) + *([o for o in out[chunk_slice]] + + [dest_indices, src_indices] + + [i for i in arrays[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: @@ -2458,27 +2455,20 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = dest_indices.get_sizes(queue, + gs, ls = dest_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (wait_for + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( - list(flatten( - (o.base_data, o.offset) - for o in out[chunk_slice])) - + [dest_indices.base_data, dest_indices.offset] - + list(flatten( - (i.base_data, i.offset) 
- for i in arrays[chunk_slice])) - + [use_fill_cla.base_data, use_fill_cla.offset] - + [array_lengths_cla.base_data, array_lengths_cla.offset] - + [dest_indices.size]), + [o for o in out[chunk_slice]] + + [dest_indices] + + [i for i in arrays[chunk_slice]] + + [use_fill_cla, array_lengths_cla, dest_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index ea3862bc4d999e593e1dda9ad8d549423ffeb7b2..dd6c1276cffe2effb53255e1c2d8bc02ec24a3ac 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -662,8 +662,8 @@ class Random123GeneratorBase: scale, shift] n = ary.size - from pyopencl.array import splay - gsize, lsize = splay(queue, ary.size) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, ary.size) evt = knl(queue, gsize, lsize, *args) ary.add_event(evt) diff --git a/pyopencl/compyte b/pyopencl/compyte index fbfe788a2dcb190fd241fd42ad047e33bafd85b8..7533db88124045924a47d7392eaf9a078670fc4d 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit fbfe788a2dcb190fd241fd42ad047e33bafd85b8 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index df364eda3c883d378c1e9d25136d8f59f5763f9d..dae42b7e70883c18f88f39461f925ae124926d82 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -156,10 +156,10 @@ def get_elwise_kernel_and_types(context, arguments, operation, name=name, options=options, preamble=preamble, use_range=use_range, loop_prep=loop_prep, **kwargs) - from pyopencl.tools import get_arg_list_scalar_arg_dtypes + from pyopencl.tools import get_arg_list_arg_types kernel = getattr(prg, name) - kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) + kernel.set_scalar_arg_dtypes(get_arg_list_arg_types(parsed_args)) return kernel, parsed_args @@ -260,17 +260,10 @@ class ElementwiseKernel: invocation_args = [] for arg, arg_descr in zip(args, arg_descrs): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("ElementwiseKernel cannot " - "deal with non-contiguous arrays") - if repr_vec is None: repr_vec = arg - invocation_args.append(arg.base_data) - if arg_descr.with_offset: - invocation_args.append(arg.offset) - wait_for.extend(arg.events) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -307,13 +300,13 @@ class ElementwiseKernel: invocation_args.append(step) - from pyopencl.array import splay - gs, ls = splay(queue, + from pyopencl.array import _splay + gs, ls = _splay(queue.device, abs(range_.stop - start)//step, max_wg_size) else: invocation_args.append(repr_vec.size) - gs, ls = repr_vec.get_sizes(queue, max_wg_size) + gs, ls = repr_vec._get_sizes(queue, max_wg_size) if capture_as is not None: kernel.set_args(*invocation_args) @@ -321,9 +314,7 @@ class ElementwiseKernel: capture_as, queue, gs, ls, *invocation_args, wait_for=wait_for) - kernel.set_args(*invocation_args) - return cl.enqueue_nd_range_kernel(queue, kernel, - gs, ls, wait_for=wait_for) + return kernel(queue, gs, ls, *invocation_args, wait_for=wait_for) # }}} diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index c996768d97d8f9a3e58a99e4839db5f37143128e..77f902071296049d7286cf3996571ce08004f208 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -22,15 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import sys import numpy as np from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict -from pyopencl.tools import _NumpyTypesKeyBuilder - -_PYPY = "__pypy__" in sys.builtin_module_names +from pytools.py_codegen import Indentation, PythonCodeGenerator +from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg # {{{ arg packing helpers @@ -50,66 +48,22 @@ del _size_t_char # }}} -# {{{ individual arg handling - -def generate_buffer_arg_setter(gen, arg_idx, buf_var): - from pytools.py_codegen import Indentation - - if _PYPY: - # https://github.com/numpy/numpy/issues/5381 - gen(f"if isinstance({buf_var}, np.generic):") - with Indentation(gen): - if _PYPY: - gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var)) - else: - gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var)) - - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_bytes_arg_setter(gen, arg_idx, buf_var): - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_generic_arg_handler(gen, arg_idx, arg_var): - from pytools.py_codegen import Indentation - - gen(""" - if {arg_var} is None: - self._set_arg_null({arg_idx}) - elif isinstance({arg_var}, _KERNEL_ARG_CLASSES): - self.set_arg({arg_idx}, {arg_var}) - """ - .format(arg_idx=arg_idx, arg_var=arg_var)) - - gen("else:") - with Indentation(gen): - generate_buffer_arg_setter(gen, arg_idx, arg_var) - -# }}} - - # {{{ generic arg handling body def generate_generic_arg_handling_body(num_args): - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() if num_args == 0: gen("pass") + else: + gen_indices_and_args = [] + for i in range(num_args): + gen_indices_and_args.append(i) + gen_indices_and_args.append(f"arg{i}") - for i in range(num_args): - gen(f"# process argument {i}") - gen("") - gen(f"current_arg = {i}") - generate_generic_arg_handler(gen, i, "arg%d" % i) - gen("") + gen(f"self._set_arg_multi(" + f"({', '.join(str(i) for i in gen_indices_and_args)},), " + ")") return gen @@ -118,9 +72,13 @@ def generate_generic_arg_handling_body(num_args): # {{{ specific arg handling body +BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"] + + def generate_specific_arg_handling_body(function_name, - num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + num_cl_args, arg_types, + work_around_arg_count_bug, warn_about_arg_count_bug, + in_enqueue): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -128,28 +86,75 @@ def generate_specific_arg_handling_body(function_name, fp_arg_count = 0 cl_arg_idx = 0 - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() - if not scalar_arg_dtypes: + if not arg_types: gen("pass") - for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): - gen(f"# process argument {arg_idx}") + gen_indices_and_args = [] + buf_indices_and_args = [] + buf_pack_indices_and_args = [] + + def add_buf_arg(arg_idx, typechar, expr_str): + if typechar in BUF_PACK_TYPECHARS: + buf_pack_indices_and_args.append(arg_idx) + buf_pack_indices_and_args.append(repr(typechar.encode())) + buf_pack_indices_and_args.append(expr_str) + else: + buf_indices_and_args.append(arg_idx) + buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") + + if in_enqueue and arg_types is not None and \ + any(isinstance(arg_type, VectorArg) for arg_type 
in arg_types): + # We're about to modify wait_for, make sure it's a copy. + gen(""" + if wait_for is None: + wait_for = [] + else: + wait_for = list(wait_for) + """) gen("") - gen(f"current_arg = {arg_idx}") + + for arg_idx, arg_type in enumerate(arg_types): arg_var = "arg%d" % arg_idx - if arg_dtype is None: - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + if arg_type is None: + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(arg_var) cl_arg_idx += 1 gen("") continue - arg_dtype = np.dtype(arg_dtype) + elif isinstance(arg_type, VectorArg): + gen(f"if not {arg_var}.flags.forc:") + with Indentation(gen): + gen("raise RuntimeError('only contiguous arrays may '") + gen(" 'be used as arguments to this operation')") + gen("") + + if in_enqueue: + gen(f"assert {arg_var}.queue is None or {arg_var}.queue == queue, " + "'queues for all arrays must match the queue supplied " + "to enqueue'") + + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(f"{arg_var}.base_data") + cl_arg_idx += 1 + + if arg_type.with_offset: + add_buf_arg(cl_arg_idx, np.dtype(np.int64).char, f"{arg_var}.offset") + cl_arg_idx += 1 + + if in_enqueue: + gen(f"wait_for.extend({arg_var}.events)") + + continue + + arg_dtype = np.dtype(arg_type) if arg_dtype.char == "V": - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -170,16 +175,9 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - gen( - "buf = pack('{arg_char}', {arg_var}.real)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.real") cl_arg_idx += 1 - gen("current_arg = current_arg + 1000") - gen( - "buf = pack('{arg_char}', {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.imag") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -191,11 +189,9 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - gen( - "buf = pack('{arg_char}{arg_char}', " - "{arg_var}.real, {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append( + f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 fp_arg_count += 2 @@ -206,16 +202,22 @@ def generate_specific_arg_handling_body(function_name, arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - gen( - "buf = pack('{arg_char}', {arg_var})" - .format( - arg_char=arg_char, - arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, arg_var) cl_arg_idx += 1 gen("") + for arg_kind, args_and_indices, entry_length in [ + ("", gen_indices_and_args, 2), + ("_buf", buf_indices_and_args, 2), + ("_buf_pack", buf_pack_indices_and_args, 3), + ]: + assert len(args_and_indices) % entry_length == 0 + if args_and_indices: + gen(f"self._set_arg{arg_kind}_multi(" + f"({', '.join(str(i) for i in args_and_indices)},), " + ")") + if cl_arg_idx != num_cl_args: raise TypeError( "length of argument list (%d) and " @@ -227,77 +229,29 @@ def generate_specific_arg_handling_body(function_name, # 
}}} -# {{{ error handler - -def wrap_in_error_handler(body, arg_names): - from pytools.py_codegen import PythonCodeGenerator, Indentation - - err_gen = PythonCodeGenerator() - - def gen_error_handler(): - err_gen(""" - if current_arg is not None: - args = [{args}] - advice = "" - from pyopencl.array import Array - if isinstance(args[current_arg], Array): - advice = " (perhaps you meant to pass 'array.data' " \ - "instead of the array itself?)" - - raise _cl.LogicError( - "when processing argument #%d (1-based): %s%s" - % (current_arg+1, str(e), advice)) - else: - raise - """ - .format(args=", ".join(arg_names))) - err_gen("") - - err_gen("try:") - with Indentation(err_gen): - err_gen.extend(body) - err_gen("except TypeError as e:") - with Indentation(err_gen): - gen_error_handler() - err_gen("except _cl.LogicError as e:") - with Indentation(err_gen): - gen_error_handler() - - return err_gen - -# }}} - - -def add_local_imports(gen): - gen("import numpy as np") - gen("import pyopencl._cl as _cl") - gen("from pyopencl import _KERNEL_ARG_CLASSES") - gen("") - - def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): - from pytools.py_codegen import PythonCodeGenerator, Indentation - arg_names = ["arg%d" % i for i in range(num_passed_args)] - if scalar_arg_dtypes is None: - body = generate_generic_arg_handling_body(num_passed_args) - else: - body = generate_specific_arg_handling_body( - function_name, num_cl_args, scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) - - err_handler = wrap_in_error_handler(body, arg_names) + def gen_arg_setting(in_enqueue): + if arg_types is None: + return generate_generic_arg_handling_body(num_passed_args) + else: + return generate_specific_arg_handling_body( + function_name, num_cl_args, arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug, + in_enqueue=in_enqueue) gen = PythonCodeGenerator() gen("from struct import pack") gen("from pyopencl import status_code") + gen("import numpy as np") + gen("import pyopencl._cl as _cl") gen("") # {{{ generate _enqueue @@ -314,13 +268,13 @@ def _generate_enqueue_and_set_args_module(function_name, "wait_for=None"]))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=True)) + # Using positional args here because pybind is slow with keyword args gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l, - allow_empty_ndrange=allow_empty_ndrange) + global_offset, wait_for, g_times_l, + allow_empty_ndrange) """) # }}} @@ -332,8 +286,7 @@ def _generate_enqueue_and_set_args_module(function_name, % (", ".join(["self"] + arg_names))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=False)) # }}} @@ -341,17 +294,17 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v7", + "pyopencl-invoker-cache-v38", key_builder=_NumpyTypesKeyBuilder()) def generate_enqueue_and_set_args(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): cache_key = (function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, 
work_around_arg_count_bug, warn_about_arg_count_bug) from_cache = False diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 5b8146858119e233338b8e23d4414f1005e3cdda..3be95ee2c736da374011002f2076ecfa172d9d35 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -334,10 +334,21 @@ class DtypedArgument(Argument): self.name, self.dtype) + def __eq__(self, other): + return (type(self) == type(other) + and self.dtype == other.dtype + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.dtype) + ^ hash(self.name)) + class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): - DtypedArgument.__init__(self, dtype, name) + super().__init__(dtype, name) self.with_offset = with_offset def declarator(self): @@ -350,6 +361,13 @@ class VectorArg(DtypedArgument): return result + def __eq__(self, other): + return (super().__eq__(other) + and self.with_offset == other.with_offset) + + def __hash__(self): + return super.__hash__() ^ hash(self.with_offset) + class ScalarArg(DtypedArgument): def declarator(self): @@ -364,6 +382,17 @@ class OtherArg(Argument): def declarator(self): return self.decl + def __eq__(self, other): + return (type(self) == type(other) + and self.decl == other.decl + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.decl) + ^ hash(self.name)) + def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: @@ -402,6 +431,20 @@ def parse_arg_list(arguments, with_offset=False): return [parse_single_arg(arg) for arg in arguments] +def get_arg_list_arg_types(arg_types): + result = [] + + for arg_type in arg_types: + if isinstance(arg_type, ScalarArg): + result.append(arg_type.dtype) + elif isinstance(arg_type, VectorArg): + result.append(arg_type) + else: + raise RuntimeError("arg type not understood: %s" % type(arg_type)) + + return tuple(result) + + def get_arg_list_scalar_arg_dtypes(arg_types): result = [] @@ -1025,6 +1068,11 @@ def is_spirv(s): # {{{ numpy key types builder class _NumpyTypesKeyBuilder(KeyBuilderBase): + def update_for_VectorArg(self, key_hash, key): # noqa: N802 + self.rec(key_hash, key.dtype) + self.update_for_str(key_hash, key.name) + self.rec(key_hash, key.with_offset) + def update_for_type(self, key_hash, key): if issubclass(key, np.generic): self.update_for_str(key_hash, key.__name__) diff --git a/pyopencl/version.py b/pyopencl/version.py index fee1f80e88ef054d5c33a33622ccff1e41eca20f..e4c47bc9ed6a54934f00ad2e03256e2fa19c60f7 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2020, 3, 1) +VERSION = (2021, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index ad5b7ab80c167ccfadbe3e4d5c7673859528a7b1..f00a6b882d50471efb14940f9aad7f93bc720b59 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -86,6 +86,7 @@ #endif +#include #include #include #include @@ -389,14 +390,15 @@ \ if (py_wait_for.ptr() != Py_None) \ { \ - event_wait_list.resize(len(py_wait_for)); \ for (py::handle evt: py_wait_for) \ - event_wait_list[num_events_in_wait_list++] = \ - evt.cast().data(); \ + { \ + event_wait_list.push_back(evt.cast().data()); \ + ++num_events_in_wait_list; \ + } \ } #define PYOPENCL_WAITLIST_ARGS \ - num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front() + num_events_in_wait_list, (num_events_in_wait_list == 0) ? 
nullptr : &event_wait_list.front() #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \ try \ @@ -4370,7 +4372,43 @@ namespace pyopencl (m_kernel, arg_index, sizeof(cl_command_queue), &q)); } - void set_arg_buf(cl_uint arg_index, py::object py_buffer) + void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj) + { + std::string typechar_str(py::cast(py_typechar)); + if (typechar_str.size() != 1) + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "type char argument must have exactly one character"); + + char typechar = typechar_str[0]; + +#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \ + case TYPECH_VAL: \ + { \ + TYPE val = py::cast(obj); \ + PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \ + break; \ + } + switch (typechar) + { + PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double) + default: + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "invalid type char"); + } +#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG + } + + void set_arg_buf(cl_uint arg_index, py::handle py_buffer) { const void *buf; PYOPENCL_BUFFER_SIZE_T len; @@ -4403,7 +4441,7 @@ namespace pyopencl } #endif - void set_arg(cl_uint arg_index, py::object arg) + void set_arg(cl_uint arg_index, py::handle arg) { if (arg.ptr() == Py_None) { @@ -4615,6 +4653,85 @@ namespace pyopencl #endif }; +#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \ + catch (error &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + auto mod_cl_ary(py::module::import("pyopencl.array")); \ + auto cls_array(mod_cl_ary.attr("Array")); \ + if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \ + msg.append( \ + " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \ + throw error(err.routine().c_str(), err.code(), msg.c_str()); \ + } \ + catch (std::exception &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + throw std::runtime_error(msg.c_str()); \ + } + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. 
+ */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_value = *it++; + set_arg_func(arg_index, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_descr, arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_descr = *it++; + arg_value = *it++; + set_arg_func(arg_index, arg_descr, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + inline py::list create_kernels_in_program(program &pgm) @@ -4635,70 +4752,73 @@ namespace pyopencl return result; } - +#define MAX_WS_DIM_COUNT 10 inline event *enqueue_nd_range_kernel( command_queue &cq, kernel &knl, - py::object py_global_work_size, - py::object py_local_work_size, - py::object py_global_work_offset, - py::object py_wait_for, + py::handle py_global_work_size, + py::handle py_local_work_size, + py::handle py_global_work_offset, + py::handle py_wait_for, bool g_times_l, bool allow_empty_ndrange) { PYOPENCL_PARSE_WAIT_FOR; - cl_uint work_dim = len(py_global_work_size); + std::array global_work_size; + unsigned gws_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size); + cl_uint work_dim = gws_size; - std::vector global_work_size; - COPY_PY_LIST(size_t, global_work_size); + std::array local_work_size; + unsigned lws_size = 0; + size_t *local_work_size_ptr = nullptr; - size_t *local_work_size_ptr = 0; - std::vector local_work_size; if (py_local_work_size.ptr() != Py_None) { + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size); + if (g_times_l) - work_dim = std::max(work_dim, unsigned(len(py_local_work_size))); + work_dim = std::max(work_dim, lws_size); else - if (work_dim != unsigned(len(py_local_work_size))) + if (work_dim != lws_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); - COPY_PY_LIST(size_t, local_work_size); - - while (local_work_size.size() < work_dim) - local_work_size.push_back(1); - while (global_work_size.size() < work_dim) - global_work_size.push_back(1); + while (lws_size < work_dim) + local_work_size[lws_size++] = 1; + while (gws_size < work_dim) + global_work_size[gws_size++] = 1; - local_work_size_ptr = local_work_size.empty( ) ? 
nullptr : &local_work_size.front(); + local_work_size_ptr = &local_work_size.front(); } - if (g_times_l && local_work_size_ptr) + if (g_times_l && lws_size) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; } - size_t *global_work_offset_ptr = 0; - std::vector global_work_offset; + size_t *global_work_offset_ptr = nullptr; + std::array global_work_offset; if (py_global_work_offset.ptr() != Py_None) { - if (work_dim != unsigned(len(py_global_work_offset))) + unsigned gwo_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size); + + if (work_dim != gwo_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); - COPY_PY_LIST(size_t, global_work_offset); - if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_offset[work_axis] *= local_work_size[work_axis]; } - global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); + global_work_offset_ptr = &global_work_offset.front(); } if (allow_empty_ndrange) @@ -4735,7 +4855,7 @@ namespace pyopencl knl.data(), work_dim, global_work_offset_ptr, - global_work_size.empty( ) ? nullptr : &global_work_size.front(), + &global_work_size.front(), local_work_size_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index e68c785128dda3da644a9dbd76c5feb06d7dd591..205b31ec452b388fe1b32f3443e63762b33a10c0 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -470,6 +470,28 @@ void pyopencl_expose_part_2(py::module &m) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif + .def("_set_arg_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_pack_multi", + [](cls &knl, py::tuple indices_chars_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle typechar, py::handle arg) + { knl.set_arg_buf_pack(i, typechar, arg); }, + indices_chars_and_args); + }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp index 4799244ff72c583a73a9d92d43229a171be4ee9c..cabc012d4a683a20807f066757278e22fafc8268 100644 --- a/src/wrap_helpers.hpp +++ b/src/wrap_helpers.hpp @@ -71,6 +71,18 @@ namespace py = pybind11; NAME.push_back(it.cast()); \ } +#define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \ + { \ + COUNTER = 0; \ + for (auto it: py_##NAME) \ + { \ + if (COUNTER == NAME.size()) \ + throw error(FUNC_NAME, \ + CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \ + NAME[COUNTER++] = it.cast(); \ + } \ + } + #define COPY_PY_COORD_TRIPLE(NAME) \ size_t NAME[3] = {0, 0, 0}; \ { \ diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 353af28173c1dadec40e8c761e4b8df8bd45caaf..676aee379272ede7352d26e3bd81ebfc7f5686f9 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -1087,6 +1087,12 @@ def test_bitonic_argsort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & 
cl.device_type.GPU: + pytest.xfail("bitonic argsort fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + dev = ctx.devices[0] if (dev.platform.name == "Portable Computing Language" and sys.platform == "darwin"): diff --git a/test/test_array.py b/test/test_array.py index b4234a3777059cc44b2f37006c98239e4dbe2ec5..899fc2383c1248efc80dbb7a49f8801146376554 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -715,6 +715,13 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: @@ -767,6 +774,12 @@ def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) + if queue.device.platform.vendor == "The pocl project" \ + and queue.device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if rng_class is RanluxGenerator: gen = rng_class(queue, 5120) else: @@ -1350,6 +1363,12 @@ def test_get_async(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("the async get test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) b = a + a**5 + 1 diff --git a/test/test_clrandom.py b/test/test_clrandom.py index 1ce479b2b1f18b099c9457d43a03c4b2327ea77f..1ec53842a036bc659d12fa546425604ac01c4964 100644 --- a/test/test_clrandom.py +++ b/test/test_clrandom.py @@ -68,6 +68,13 @@ def test_clrandom_dtypes(ctx_factory, rng_class, dtype): size = 10 with cl.CommandQueue(cl_ctx) as queue: + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is make_ranlux_generator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + rng.uniform(queue, size, dtype) if dtype not in (np.int32, np.int64): diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py index 55f72d9519f89ed5b17c061559104379a3896e72..402bc8b9d71e47592f67101ff58dfb90a0c56630 100644 --- a/test/test_enqueue_copy.py +++ b/test/test_enqueue_copy.py @@ -48,8 +48,14 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": - pytest.skip("Apple's CL implementation crashes on this.") + pytest.xfail("Apple's CL implementation crashes on this.") ary_in_shp = 256, 128 # Entire array shape from which sub-array copied to device sub_ary_shp = 128, 96 # Sub-array shape to be copied to device @@ -138,6 +144,12 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True): # 
https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": pytest.skip("Apple's CL implementation crashes on this.") diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 8aad416e75b9e3e346946cb2b5c5b40836c88ee4..1068779b78d443cd0f3fe012c429025d82057d6d 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -46,10 +46,17 @@ else: faulthandler.enable() -def _skip_if_pocl(plat, up_to_version, msg="unsupported by pocl"): +def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"): if plat.vendor == "The pocl project": if up_to_version is None or get_pocl_version(plat) <= up_to_version: - pytest.skip(msg) + pytest.xfail(msg) + + +def _xfail_if_pocl_gpu(device, what): + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail(f"POCL's {what} support don't work right on Nvidia GPUs, " + "at least the Titan V, as of pocl 1.6, 2021-01-20") def test_get_info(ctx_factory): @@ -382,7 +389,7 @@ def test_image_2d(ctx_factory): if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image( @@ -454,7 +461,7 @@ def test_image_3d(ctx_factory): if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image_plane( @@ -676,7 +683,7 @@ def test_can_build_and_run_binary(ctx_factory): def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() # Still relevant on pocl 1.0RC1. 
- _skip_if_pocl( + _xfail_if_pocl( ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier") queue = cl.CommandQueue(ctx) @@ -704,7 +711,7 @@ def test_unload_compiler(platform): or cl.get_cl_header_version() < (1, 2)): from pytest import skip skip("clUnloadPlatformCompiler is only available in OpenCL 1.2") - _skip_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") + _xfail_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") if platform.vendor == "Intel(R) Corporation": from pytest import skip skip("Intel proprietary driver does not support unloading compiler") @@ -745,7 +752,7 @@ def test_user_event(ctx_factory): skip("UserEvent is only available in OpenCL 1.1") # https://github.com/pocl/pocl/issues/201 - _skip_if_pocl(ctx.devices[0].platform, (0, 13), + _xfail_if_pocl(ctx.devices[0].platform, (0, 13), "pocl's user events don't work right") status = {} @@ -842,6 +849,8 @@ def test_event_set_callback(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "event callbacks") + if ctx._get_cl_version() < (1, 1): pytest.skip("OpenCL 1.1 or newer required for set_callback") @@ -896,6 +905,8 @@ def test_global_offset(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + _xfail_if_pocl_gpu(queue.device, "global offset") + prg = cl.Program(context, """ __kernel void mult(__global float *a) { a[get_global_id(0)] *= 2; } @@ -981,6 +992,8 @@ def test_coarse_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "SVM") + dev = ctx.devices[0] from pyopencl.characterize import has_coarse_grain_buffer_svm @@ -1062,6 +1075,8 @@ def test_fine_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "GPU SVM") + from pyopencl.characterize import has_fine_grain_buffer_svm from pytest import skip if not has_fine_grain_buffer_svm(queue.device): @@ -1148,6 +1163,8 @@ def test_copy_buffer_rect(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "rectangular copies") + arr1 = cl_array.zeros(queue, (2, 3), "f") arr2 = cl_array.zeros(queue, (4, 5), "f") arr1.fill(1)
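Usage note (editorial, not part of the patch): below is a minimal sketch of the kernel-reuse idiom that the updated examples/demo.py and examples/demo_array.py adopt, combined with the argument-type declaration this patch renames from set_scalar_arg_dtypes to set_arg_types (the old name is kept as an alias). The kernel name, buffer sizes, and the extra scalar argument are illustrative only and are not taken from the diff.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

a_np = np.random.rand(50000).astype(np.float32)
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)

prg = cl.Program(ctx, """
__kernel void scale(__global const float *a, __global float *res, float factor)
{
    int gid = get_global_id(0);
    res[gid] = factor * a[gid];
}
""").build()

knl = prg.scale  # fetch the Kernel object once, reuse it for repeated calls

# Declare argument types up front; None marks non-scalar arguments such as
# Buffers. set_scalar_arg_dtypes remains available as an alias of the new
# set_arg_types introduced by this patch.
knl.set_arg_types([None, None, np.float32])

for factor in (2.0, 3.0):
    knl(queue, a_np.shape, None, a_g, res_g, factor)

res_np = np.empty_like(a_np)
cl.enqueue_copy(queue, res_np, res_g)
assert np.allclose(res_np, 3.0 * a_np)  # last enqueue used factor=3.0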