diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 51b22c4f941c8254d94e7c5f88d32cab7564d1c8..5b9b6393ebcae7db9da706c21bb61d36ae3b3c54 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
         -   name: "Main Script"
             run: |
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-                . ./prepare-and-run-flake8.sh ./pyopencl ./test
+                . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test
 
     pytest3:
         name: Pytest on Py3
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7796b71e16d106381913cd45866a9f297514a9fa..32063a80617ae0a8d19f82cbe2cdc92356ab7308 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -197,7 +197,7 @@ Pylint:
   # is only one copy of everything.
   - PROJECT_INSTALL_FLAGS="--editable"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh
-  - ". ./prepare-and-run-pylint.sh pyopencl test/test_*.py"
+  - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
   tags:
   - python3
   except:
@@ -216,7 +216,7 @@ Documentation:
 Flake8:
   script:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-  - ". ./prepare-and-run-flake8.sh pyopencl test"
+  - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test
   tags:
   - python3
   except:
diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst
index 174b25d1cd9fee29562c9df31826981a5b1bbde8..77a495914219517b991b972a3a14ed9cb6ad8b83 100644
--- a/doc/runtime_program.rst
+++ b/doc/runtime_program.rst
@@ -205,15 +205,18 @@ Kernel
                prg.kernel(queue, n_globals, None, args)
 
 
-    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
+    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
         Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using
         :meth:`set_args` to set each argument in turn. See the documentation for
         :meth:`set_arg` to see what argument types are allowed.
-        |std-enqueue-blurb|
 
         |glsize|
 
+        |empty-nd-range|
+
+        |std-enqueue-blurb|
+
         .. note::
 
             :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args`
@@ -228,6 +231,7 @@ Kernel
             <http://lists.tiker.net/pipermail/pyopencl/2012-October/001311.html>`_.
 
         .. versionchanged:: 0.92
+
             *local_size* was promoted to third positional argument from being a
             keyword argument. The old keyword argument usage will continue to
             be accepted with a warning throughout the 0.92 release cycle.
@@ -239,8 +243,13 @@ Kernel
             it from working.
 
         .. versionchanged:: 2011.1
+
             Added the *g_times_l* keyword arg.
 
+        .. versionchanged:: 2020.2
+
+            Added the *allow_empty_ndrange* keyword argument.
+
     .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
 
         This method supports the exact same interface as :meth:`__call__`, but
@@ -278,11 +287,18 @@ Kernel
 
         The size of local buffer in bytes to be provided.
 
-.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False)
-
-    |std-enqueue-blurb|
+.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False)
 
     |glsize|
 
+    |empty-nd-range|
+
+    |std-enqueue-blurb|
+
     .. versionchanged:: 2011.1
+
         Added the *g_times_l* keyword arg.
+
+    .. versionchanged:: 2020.2
+
+        Added the *allow_empty_ndrange* keyword argument.
diff --git a/doc/subst.rst b/doc/subst.rst
index 5e7b524b4c927602196fca2027d99a954af68199..eba3536324545ccd4586244d5c2665b7db5d9d26 100644
--- a/doc/subst.rst
+++ b/doc/subst.rst
@@ -25,3 +25,12 @@
         local size. (which makes the behavior more like Nvidia CUDA) In this case,
         *global_size* and *local_size* also do not have to have the same number
         of entries.
+
+.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating
+        how an empty NDRange is to be treated, where "empty" means that one or more
+        entries of *global_size* or *local_size* are zero. OpenCL itself does not
+        allow enqueueing kernels over empty NDRanges. Setting this flag to *True*
+        enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``)
+        to obtain the synchronization effects that would have resulted from
+        the kernel enqueue.
+        Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer.
diff --git a/doc/tools.rst b/doc/tools.rst
index 243535142b21ef219391f7e9a552b846b58ec60c..7260add22ffbd1bff227b6cd7641cc7e4f5b3ac8 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -46,14 +46,20 @@ not complicated::
     bound to contexts, not devices, and memory availability depends on which
     device the buffer is used with.)
 
-    .. versionchanged::
-        In version 2013.1, :class:`CLAllocator` was deprecated and replaced
+    .. versionchanged :: 2013.1
+
+        :class:`CLAllocator` was deprecated and replaced
         by :class:`DeferredAllocator`.
 
     .. method:: __call__(size)
 
         Allocate a :class:`pyopencl.Buffer` of the given *size*.
 
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+
 .. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE)
 
     *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
@@ -68,6 +74,11 @@ not complicated::
 
         Allocate a :class:`pyopencl.Buffer` of the given *size*.
 
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+
 .. class:: MemoryPool(allocator[, leading_bits_in_bin_id])
 
     A memory pool for OpenCL device memory. *allocator* must be an instance of
diff --git a/pyopencl/array.py b/pyopencl/array.py
index bcc0770fa4a1446cb8a7e6c481f4ffc2ffe55f83..35c521b88665bdabd7d6415d22e2f1e80de64a5a 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -483,23 +483,22 @@ class Array(object):
         self.allocator = allocator
 
         if data is None:
-            if alloc_nbytes <= 0:
-                if alloc_nbytes == 0:
-                    # Work around CL not allowing zero-sized buffers.
-                    alloc_nbytes = 1
+            if alloc_nbytes < 0:
+                raise ValueError("cannot allocate CL buffer with "
+                        "negative size")
 
-                else:
-                    raise ValueError("cannot allocate CL buffer with "
-                            "negative size")
-
-            if allocator is None:
-                if context is None and queue is not None:
-                    context = queue.context
+            elif alloc_nbytes == 0:
+                self.base_data = None
 
-                self.base_data = cl.Buffer(
-                        context, cl.mem_flags.READ_WRITE, alloc_nbytes)
             else:
-                self.base_data = self.allocator(alloc_nbytes)
+                if allocator is None:
+                    if context is None and queue is not None:
+                        context = queue.context
+
+                    self.base_data = cl.Buffer(
+                            context, cl.mem_flags.READ_WRITE, alloc_nbytes)
+                else:
+                    self.base_data = self.allocator(alloc_nbytes)
         else:
             self.base_data = data
 
@@ -1223,6 +1222,9 @@ class Array(object):
     def _zero_fill(self, queue=None, wait_for=None):
         queue = queue or self.queue
 
+        if not self.size:
+            return
+
         if (
                 queue._get_cl_version() >= (1, 2)
                 and cl.get_cl_header_version() >= (1, 2)):
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 4d01888fc2a40297b19ce904ff90ea5072707fd6..57ab4b1f4ce29ebe9ddcd145e6efee8a3056038c 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -322,7 +322,9 @@ def _generate_enqueue_and_set_args_module(function_name,
                 ", ".join(
                     ["self", "queue", "global_size", "local_size"]
                     + arg_names
-                    + ["global_offset=None", "g_times_l=None",
+                    + ["global_offset=None",
+                        "g_times_l=None",
+                        "allow_empty_ndrange=False",
                         "wait_for=None"])))
 
     with Indentation(gen):
@@ -331,7 +333,8 @@ def _generate_enqueue_and_set_args_module(function_name,
 
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                    global_offset, wait_for, g_times_l=g_times_l)
+                    global_offset, wait_for, g_times_l=g_times_l,
+                    allow_empty_ndrange=allow_empty_ndrange)
             """)
 
     # }}}
@@ -352,7 +355,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v6",
+        "pyopencl-invoker-cache-v7",
         key_builder=_NumpyTypesKeyBuilder())
 
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 9ff7f02368e5166451f38f34978a9f1f9d99a99e..e7471bd8b67c81445a367195edea568e8622d5db 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4527,7 +4527,8 @@ namespace pyopencl
       py::object py_local_work_size,
       py::object py_global_work_offset,
       py::object py_wait_for,
-      bool g_times_l)
+      bool g_times_l,
+      bool allow_empty_ndrange)
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
@@ -4582,6 +4583,33 @@ namespace pyopencl
       global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
     }
 
+    if (allow_empty_ndrange)
+    {
+#if PYOPENCL_CL_VERSION >= 0x1020
+      bool is_empty = false;
+      for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+        if (global_work_size[work_axis] == 0)
+          is_empty = true;
+      if (local_work_size_ptr)
+        for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
+          if (local_work_size_ptr[work_axis] == 0)
+            is_empty = true;
+
+      if (is_empty)
+      {
+        cl_event evt;
+        PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, (
+              cq.data(), PYOPENCL_WAITLIST_ARGS, &evt));
+        PYOPENCL_RETURN_NEW_EVENT(evt);
+      }
+#else
+      // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent
+      // in the case of an out-of-order queue.
+      throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
+          "allow_empty_ndrange requires OpenCL 1.2");
+#endif
+    }
+
     PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( {
           cl_event evt;
           PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, (
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 5ca5efcf658bc7a81f2c67c265427486d6c4dfc4..cbd1f9a40f85ee0c71be5adfe38bc8cd1cd20e50 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -461,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("local_work_size"),
       py::arg("global_work_offset")=py::none(),
       py::arg("wait_for")=py::none(),
-      py::arg("g_times_l")=false
+      py::arg("g_times_l")=false,
+      py::arg("allow_empty_ndrange")=false
       );
 
   // TODO: clEnqueueNativeKernel
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index e29110ec25caf46cbf8d11a4d653772e6068789a..0c3004ad9e8f448195e5fb7ac76cea4e2c21d2a9 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -104,6 +104,9 @@ namespace
 
       pointer_type allocate(size_type s)
       {
+        if (s == 0)
+          return nullptr;
+
         return pyopencl::create_buffer(m_context->data(), m_flags, s, 0);
       }
   };
@@ -137,6 +140,9 @@ namespace
 
       pointer_type allocate(size_type s)
       {
+        if (s == 0)
+          return nullptr;
+
         pointer_type ptr =  pyopencl::create_buffer(
             m_context->data(), m_flags, s, 0);
 
@@ -144,7 +150,10 @@ namespace
         // This looks (and is) expensive. But immediate allocators
         // have their main use in memory pools, whose basic assumption
         // is that allocation is too expensive anyway--but they rely
-        // on exact 'out-of-memory' information.
+        // on 'out-of-memory' being reported on allocation. (If it is
+        // reported in a deferred manner, it has no way to react
+        // (e.g. by freeing unused memory) because it is not part of
+        // the call stack.)
         unsigned zero = 0;
         PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, (
               m_queue.data(),
@@ -188,6 +197,15 @@ namespace
       alloc.try_release_blocks();
     }
 
+    if (!mem)
+    {
+      if (size == 0)
+        return nullptr;
+      else
+        throw pyopencl::error("Allocator", CL_INVALID_VALUE,
+            "allocator succeeded but returned NULL cl_mem");
+    }
+
     try
     {
       return new pyopencl::buffer(mem, false);
diff --git a/test/test_array.py b/test/test_array.py
index 521f6719474f7a05a98a100bb9a6183018caa5df..cb0dbee8b8dae83c8ed5e0c4d32378ca506bf84c 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1316,6 +1316,19 @@ def test_outoforderqueue_reductions(ctx_factory):
     assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
 
 
+@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2)])
+def test_zero_size_array(ctx_factory, empty_shape):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+    b = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+    b.fill(1)
+    c = a + b
+    c_host = c.get()
+    cl_array.to_device(queue, c_host)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index dc5772de57ae5d7c05040cdc98117dbed8d53301..63c5e8a0633faf16c1fba77f4a47c52c6768e43b 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -33,7 +33,8 @@ import pyopencl.array as cl_array
 import pyopencl.cltypes as cltypes
 import pyopencl.clrandom
 from pyopencl.tools import (  # noqa
-        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests,
+        ImmediateAllocator, DeferredAllocator)
 from pyopencl.characterize import get_pocl_version
 
 # Are CL implementations crashy? You be the judge. :)
@@ -573,6 +574,23 @@ def test_mempool_2(ctx_factory):
         assert asize < asize*(1+1/8)
 
 
+@pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator])
+def test_allocator(ctx_factory, allocator_cls):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    if allocator_cls is DeferredAllocator:
+        allocator = allocator_cls(context)
+    else:
+        allocator = allocator_cls(queue)
+
+    mem = allocator(15)
+    mem2 = allocator(0)
+
+    assert mem is not None
+    assert mem2 is None
+
+
 def test_vector_args(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -1142,6 +1160,26 @@ def test_threaded_nanny_events(ctx_factory):
     t2.join()
 
 
+@pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)])
+def test_empty_ndrange(ctx_factory, empty_shape):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    if ctx._get_cl_version() < (1, 2) or cl.get_cl_header_version() < (1, 2):
+        pytest.skip("OpenCL 1.2 required for empty NDRange suuport")
+
+    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
+
+    prg = cl.Program(ctx, """
+        __kernel void add_two(__global float *a_g)
+        {
+          a_g[get_global_id(0)] += 2;
+        }
+        """).build()
+
+    prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True)
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.
     import pyopencl  # noqa