From f374b90f5052d1ee389c1bfe0573b059b2c7f95e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Mon, 15 Jun 2020 13:05:25 -0500 Subject: [PATCH] Add 'allow_empty_ndrange' kwarg to kernel enqueue --- doc/runtime_program.rst | 24 +++++++++++++++++++----- doc/subst.rst | 9 +++++++++ pyopencl/invoker.py | 7 +++++-- src/wrap_cl.hpp | 30 +++++++++++++++++++++++++++++- src/wrap_cl_part_2.cpp | 3 ++- 5 files changed, 64 insertions(+), 9 deletions(-) diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst index 174b25d1..ef549042 100644 --- a/doc/runtime_program.rst +++ b/doc/runtime_program.rst @@ -205,15 +205,18 @@ Kernel prg.kernel(queue, n_globals, None, args) - .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) + .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using :meth:`set_args` to set each argument in turn. See the documentation for :meth:`set_arg` to see what argument types are allowed. - |std-enqueue-blurb| |glsize| + |empty-nd-range| + + |std-enqueue-blurb| + .. note:: :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args` @@ -241,6 +244,10 @@ Kernel .. versionchanged:: 2011.1 Added the *g_times_l* keyword arg. + .. versionchanged:: 2020.2 + + Added the *allow_empty_ndrange* keyword argument. + .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) This method supports the exact same interface as :meth:`__call__`, but @@ -278,11 +285,18 @@ Kernel The size of local buffer in bytes to be provided. -.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False) - - |std-enqueue-blurb| +.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) |glsize| + |empty-nd-range| + + |std-enqueue-blurb| + .. versionchanged:: 2011.1 + Added the *g_times_l* keyword arg. + + .. versionchanged:: 2020.2 + + Added the *allow_empty_ndrange* keyword argument. diff --git a/doc/subst.rst b/doc/subst.rst index 5e7b524b..eba35363 100644 --- a/doc/subst.rst +++ b/doc/subst.rst @@ -25,3 +25,12 @@ local size. (which makes the behavior more like Nvidia CUDA) In this case, *global_size* and *local_size* also do not have to have the same number of entries. + +.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating + how an empty NDRange is to be treated, where "empty" means that one or more + entries of *global_size* or *local_size* are zero. OpenCL itself does not + allow enqueueing kernels over empty NDRanges. Setting this flag to *True* + enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``) + to obtain the synchronization effects that would have resulted from + the kernel enqueue. + Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer. diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 4d01888f..aab76cc4 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -322,7 +322,9 @@ def _generate_enqueue_and_set_args_module(function_name, ", ".join( ["self", "queue", "global_size", "local_size"] + arg_names - + ["global_offset=None", "g_times_l=None", + + ["global_offset=None", + "g_times_l=None", + "allow_empty_ndrange=False", "wait_for=None"]))) with Indentation(gen): @@ -331,7 +333,8 @@ def _generate_enqueue_and_set_args_module(function_name, gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l) + global_offset, wait_for, g_times_l=g_times_l, + allow_empty_ndrange=allow_empty_ndrange) """) # }}} diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 9ff7f023..e7471bd8 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4527,7 +4527,8 @@ namespace pyopencl py::object py_local_work_size, py::object py_global_work_offset, py::object py_wait_for, - bool g_times_l) + bool g_times_l, + bool allow_empty_ndrange) { PYOPENCL_PARSE_WAIT_FOR; @@ -4582,6 +4583,33 @@ namespace pyopencl global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); } + if (allow_empty_ndrange) + { +#if PYOPENCL_CL_VERSION >= 0x1020 + bool is_empty = false; + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (global_work_size[work_axis] == 0) + is_empty = true; + if (local_work_size_ptr) + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (local_work_size_ptr[work_axis] == 0) + is_empty = true; + + if (is_empty) + { + cl_event evt; + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, ( + cq.data(), PYOPENCL_WAITLIST_ARGS, &evt)); + PYOPENCL_RETURN_NEW_EVENT(evt); + } +#else + // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent + // in the case of an out-of-order queue. + throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, + "allow_empty_ndrange requires OpenCL 1.2"); +#endif + } + PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( { cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, ( diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 5ca5efcf..cbd1f9a4 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -461,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("local_work_size"), py::arg("global_work_offset")=py::none(), py::arg("wait_for")=py::none(), - py::arg("g_times_l")=false + py::arg("g_times_l")=false, + py::arg("allow_empty_ndrange")=false ); // TODO: clEnqueueNativeKernel -- GitLab