diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst
index 174b25d1cd9fee29562c9df31826981a5b1bbde8..ef549042cb19a30d981c49330a1ac118edddaad5 100644
--- a/doc/runtime_program.rst
+++ b/doc/runtime_program.rst
@@ -205,15 +205,18 @@ Kernel
                prg.kernel(queue, n_globals, None, args)
 
 
-    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
+    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
         Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using
         :meth:`set_args` to set each argument in turn. See the documentation for
         :meth:`set_arg` to see what argument types are allowed.
-        |std-enqueue-blurb|
 
         |glsize|
 
+        |empty-nd-range|
+
+        |std-enqueue-blurb|
+
         .. note::
 
             :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args`
@@ -241,6 +244,10 @@ Kernel
         .. versionchanged:: 2011.1
             Added the *g_times_l* keyword arg.
 
+        .. versionchanged:: 2020.2
+
+            Added the *allow_empty_ndrange* keyword argument.
+
     .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
 
         This method supports the exact same interface as :meth:`__call__`, but
@@ -278,11 +285,18 @@ Kernel
 
         The size of local buffer in bytes to be provided.
 
-.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False)
-
-    |std-enqueue-blurb|
+.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
     |glsize|
 
+    |empty-nd-range|
+
+    |std-enqueue-blurb|
+
     .. versionchanged:: 2011.1
+
         Added the *g_times_l* keyword arg.
+
+    .. versionchanged:: 2020.2
+
+        Added the *allow_empty_ndrange* keyword argument.
diff --git a/doc/subst.rst b/doc/subst.rst
index 5e7b524b4c927602196fca2027d99a954af68199..eba3536324545ccd4586244d5c2665b7db5d9d26 100644
--- a/doc/subst.rst
+++ b/doc/subst.rst
@@ -25,3 +25,12 @@
         local size. (which makes the behavior more like Nvidia CUDA) In this case,
         *global_size* and *local_size* also do not have to have the same number
         of entries.
+
+.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating
+        how an empty NDRange is to be treated, where "empty" means that one or more
+        entries of *global_size* or *local_size* are zero. OpenCL itself does not
+        allow enqueueing kernels over empty NDRanges. Setting this flag to *True*
+        enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``)
+        to obtain the synchronization effects that would have resulted from
+        the kernel enqueue.
+        Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer.
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 4d01888fc2a40297b19ce904ff90ea5072707fd6..aab76cc42a24715d2af30a3cb9f7ef5524257962 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -322,7 +322,9 @@ def _generate_enqueue_and_set_args_module(function_name,
                 ", ".join(
                     ["self", "queue", "global_size", "local_size"]
                     + arg_names
-                    + ["global_offset=None", "g_times_l=None",
+                    + ["global_offset=None",
+                        "g_times_l=None",
+                        "allow_empty_ndrange=False",
                         "wait_for=None"])))
 
     with Indentation(gen):
@@ -331,7 +333,8 @@ def _generate_enqueue_and_set_args_module(function_name,
 
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                    global_offset, wait_for, g_times_l=g_times_l)
+                    global_offset, wait_for, g_times_l=g_times_l,
+                    allow_empty_ndrange=allow_empty_ndrange)
             """)
 
     # }}}
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 9ff7f02368e5166451f38f34978a9f1f9d99a99e..e7471bd8b67c81445a367195edea568e8622d5db 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4527,7 +4527,8 @@ namespace pyopencl
       py::object py_local_work_size,
       py::object py_global_work_offset,
       py::object py_wait_for,
-      bool g_times_l)
+      bool g_times_l,
+      bool allow_empty_ndrange)
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
@@ -4582,6 +4583,33 @@ namespace pyopencl
       global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
     }
 
+    if (allow_empty_ndrange)
+    {
+#if PYOPENCL_CL_VERSION >= 0x1020
+      bool is_empty = false;
+      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+        if (global_work_size[work_axis] == 0)
+          is_empty = true;
+      if (local_work_size_ptr)
+        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+          if (local_work_size_ptr[work_axis] == 0)
+            is_empty = true;
+
+      if (is_empty)
+      {
+        cl_event evt;
+        PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
+              cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+        PYOPENCL_RETURN_NEW_EVENT(evt);
+      }
+#else
+      // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent
+      // in the case of an out-of-order queue.
+      throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+          "allow_empty_ndrange requires OpenCL 1.2");
+#endif
+    }
+
     PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
           cl_event evt;
           PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 5ca5efcf658bc7a81f2c67c265427486d6c4dfc4..cbd1f9a40f85ee0c71be5adfe38bc8cd1cd20e50 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -461,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("local_work_size"),
       py::arg("global_work_offset")=py::none(),
       py::arg("wait_for")=py::none(),
-      py::arg("g_times_l")=false
+      py::arg("g_times_l")=false,
+      py::arg("allow_empty_ndrange")=false
       );
 
   // TODO: clEnqueueNativeKernel