From f374b90f5052d1ee389c1bfe0573b059b2c7f95e Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 15 Jun 2020 13:05:25 -0500
Subject: [PATCH] Add 'allow_empty_ndrange' kwarg to kernel enqueue

---
 doc/runtime_program.rst | 24 +++++++++++++++++++-----
 doc/subst.rst           |  9 +++++++++
 pyopencl/invoker.py     |  7 +++++--
 src/wrap_cl.hpp         | 30 +++++++++++++++++++++++++++++-
 src/wrap_cl_part_2.cpp  |  3 ++-
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst
index 174b25d1..ef549042 100644
--- a/doc/runtime_program.rst
+++ b/doc/runtime_program.rst
@@ -205,15 +205,18 @@ Kernel
                prg.kernel(queue, n_globals, None, args)
 
 
-    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
+    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
         Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using
         :meth:`set_args` to set each argument in turn. See the documentation for
         :meth:`set_arg` to see what argument types are allowed.
-        |std-enqueue-blurb|
 
         |glsize|
 
+        |empty-nd-range|
+
+        |std-enqueue-blurb|
+
         .. note::
 
             :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args`
@@ -241,6 +244,10 @@ Kernel
         .. versionchanged:: 2011.1
             Added the *g_times_l* keyword arg.
 
+        .. versionchanged:: 2020.2
+
+            Added the *allow_empty_ndrange* keyword argument.
+
     .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
 
         This method supports the exact same interface as :meth:`__call__`, but
@@ -278,11 +285,18 @@ Kernel
 
         The size of local buffer in bytes to be provided.
 
-.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False)
-
-    |std-enqueue-blurb|
+.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
     |glsize|
 
+    |empty-nd-range|
+
+    |std-enqueue-blurb|
+
     .. versionchanged:: 2011.1
+
         Added the *g_times_l* keyword arg.
+
+    .. versionchanged:: 2020.2
+
+        Added the *allow_empty_ndrange* keyword argument.
diff --git a/doc/subst.rst b/doc/subst.rst
index 5e7b524b..eba35363 100644
--- a/doc/subst.rst
+++ b/doc/subst.rst
@@ -25,3 +25,12 @@
         local size. (which makes the behavior more like Nvidia CUDA) In this case,
         *global_size* and *local_size* also do not have to have the same number
         of entries.
+
+.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating
+        how an empty NDRange is to be treated, where "empty" means that one or more
+        entries of *global_size* or *local_size* are zero. OpenCL itself does not
+        allow enqueueing kernels over empty NDRanges. Setting this flag to *True*
+        enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``)
+        to obtain the synchronization effects that would have resulted from
+        the kernel enqueue.
+        Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer.
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 4d01888f..aab76cc4 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -322,7 +322,9 @@ def _generate_enqueue_and_set_args_module(function_name,
                 ", ".join(
                     ["self", "queue", "global_size", "local_size"]
                     + arg_names
-                    + ["global_offset=None", "g_times_l=None",
+                    + ["global_offset=None",
+                        "g_times_l=None",
+                        "allow_empty_ndrange=False",
                         "wait_for=None"])))
 
     with Indentation(gen):
@@ -331,7 +333,8 @@ def _generate_enqueue_and_set_args_module(function_name,
 
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                    global_offset, wait_for, g_times_l=g_times_l)
+                    global_offset, wait_for, g_times_l=g_times_l,
+                    allow_empty_ndrange=allow_empty_ndrange)
             """)
 
     # }}}
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 9ff7f023..e7471bd8 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4527,7 +4527,8 @@ namespace pyopencl
       py::object py_local_work_size,
       py::object py_global_work_offset,
       py::object py_wait_for,
-      bool g_times_l)
+      bool g_times_l,
+      bool allow_empty_ndrange)
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
@@ -4582,6 +4583,33 @@ namespace pyopencl
       global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
     }
 
+    if (allow_empty_ndrange)
+    {
+#if PYOPENCL_CL_VERSION >= 0x1020
+      bool is_empty = false;
+      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+        if (global_work_size[work_axis] == 0)
+          is_empty = true;
+      if (local_work_size_ptr)
+        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+          if (local_work_size_ptr[work_axis] == 0)
+            is_empty = true;
+
+      if (is_empty)
+      {
+        cl_event evt;
+        PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
+              cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+        PYOPENCL_RETURN_NEW_EVENT(evt);
+      }
+#else
+      // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent
+      // in the case of an out-of-order queue.
+      throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+          "allow_empty_ndrange requires OpenCL 1.2");
+#endif
+    }
+
     PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
           cl_event evt;
           PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 5ca5efcf..cbd1f9a4 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -461,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("local_work_size"),
       py::arg("global_work_offset")=py::none(),
       py::arg("wait_for")=py::none(),
-      py::arg("g_times_l")=false
+      py::arg("g_times_l")=false,
+      py::arg("allow_empty_ndrange")=false
       );
 
   // TODO: clEnqueueNativeKernel
-- 
GitLab