From bdf0a13dec579e89121623e46687508212d71f83 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 14 Jun 2020 23:55:08 -0500 Subject: [PATCH 01/14] Explain better why memory pools need exact out-of-memory reporting --- src/wrap_mempool.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index e29110ec..8778c1bf 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -144,7 +144,10 @@ namespace // This looks (and is) expensive. But immediate allocators // have their main use in memory pools, whose basic assumption // is that allocation is too expensive anyway--but they rely - // on exact 'out-of-memory' information. + // on 'out-of-memory' being reported on allocation. (If it is + // reported in a deferred manner, it has no way to react + // (e.g. by freeing unused memory) because it is not part of + // the call stack. unsigned zero = 0; PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, ( m_queue.data(), -- GitLab From 60c4cea84734a54599bc92535f191b7622a27d4b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 13:02:03 -0500 Subject: [PATCH 02/14] Allocators: allow zero size, return None --- doc/tools.rst | 10 ++++++++++ src/wrap_mempool.cpp | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/doc/tools.rst b/doc/tools.rst index 24353514..c0565d2f 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -54,6 +54,11 @@ not complicated:: Allocate a :class:`pyopencl.Buffer` of the given *size*. + .. versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + .. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE) *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds @@ -68,6 +73,11 @@ not complicated:: Allocate a :class:`pyopencl.Buffer` of the given *size*. + .. versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + .. class:: MemoryPool(allocator[, leading_bits_in_bin_id]) A memory pool for OpenCL device memory. *allocator* must be an instance of diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 8778c1bf..dceb33dc 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -104,6 +104,9 @@ namespace pointer_type allocate(size_type s) { + if (s == 0) + return nullptr; + return pyopencl::create_buffer(m_context->data(), m_flags, s, 0); } }; @@ -137,6 +140,9 @@ namespace pointer_type allocate(size_type s) { + if (s == 0) + return nullptr; + pointer_type ptr = pyopencl::create_buffer( m_context->data(), m_flags, s, 0); -- GitLab From 73ec3f088f652f215af8069e29627bc37b21da48 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 13:02:37 -0500 Subject: [PATCH 03/14] tools.rst formatting fix --- doc/tools.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/tools.rst b/doc/tools.rst index c0565d2f..7260add2 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -46,8 +46,9 @@ not complicated:: bound to contexts, not devices, and memory availability depends on which device the buffer is used with.) - .. versionchanged:: - In version 2013.1, :class:`CLAllocator` was deprecated and replaced + .. versionchanged :: 2013.1 + + :class:`CLAllocator` was deprecated and replaced by :class:`DeferredAllocator`. .. method:: __call__(size) -- GitLab From f374b90f5052d1ee389c1bfe0573b059b2c7f95e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 13:05:25 -0500 Subject: [PATCH 04/14] Add 'allow_empty_ndrange' kwarg to kernel enqueue --- doc/runtime_program.rst | 24 +++++++++++++++++++----- doc/subst.rst | 9 +++++++++ pyopencl/invoker.py | 7 +++++-- src/wrap_cl.hpp | 30 +++++++++++++++++++++++++++++- src/wrap_cl_part_2.cpp | 3 ++- 5 files changed, 64 insertions(+), 9 deletions(-) diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst index 174b25d1..ef549042 100644 --- a/doc/runtime_program.rst +++ b/doc/runtime_program.rst @@ -205,15 +205,18 @@ Kernel prg.kernel(queue, n_globals, None, args) - .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) + .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using :meth:`set_args` to set each argument in turn. See the documentation for :meth:`set_arg` to see what argument types are allowed. - |std-enqueue-blurb| |glsize| + |empty-nd-range| + + |std-enqueue-blurb| + .. note:: :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args` @@ -241,6 +244,10 @@ Kernel .. versionchanged:: 2011.1 Added the *g_times_l* keyword arg. + .. versionchanged:: 2020.2 + + Added the *allow_empty_ndrange* keyword argument. + .. method:: capture_call(filename, queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) This method supports the exact same interface as :meth:`__call__`, but @@ -278,11 +285,18 @@ Kernel The size of local buffer in bytes to be provided. -.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False) - - |std-enqueue-blurb| +.. function:: enqueue_nd_range_kernel(queue, kernel, global_work_size, local_work_size, global_work_offset=None, wait_for=None, g_times_l=False, allow_empty_ndrange=False) |glsize| + |empty-nd-range| + + |std-enqueue-blurb| + .. versionchanged:: 2011.1 + Added the *g_times_l* keyword arg. + + .. versionchanged:: 2020.2 + + Added the *allow_empty_ndrange* keyword argument. diff --git a/doc/subst.rst b/doc/subst.rst index 5e7b524b..eba35363 100644 --- a/doc/subst.rst +++ b/doc/subst.rst @@ -25,3 +25,12 @@ local size. (which makes the behavior more like Nvidia CUDA) In this case, *global_size* and *local_size* also do not have to have the same number of entries. + +.. |empty-nd-range| replace:: *allow_empty_ndrange* is a :class:`bool` indicating + how an empty NDRange is to be treated, where "empty" means that one or more + entries of *global_size* or *local_size* are zero. OpenCL itself does not + allow enqueueing kernels over empty NDRanges. Setting this flag to *True* + enqueues a marker with a wait list (``clEnqueueMarkerWithWaitList``) + to obtain the synchronization effects that would have resulted from + the kernel enqueue. + Setting *allow_empty_ndrange* to *True* requires OpenCL 1.2 or newer. diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index 4d01888f..aab76cc4 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -322,7 +322,9 @@ def _generate_enqueue_and_set_args_module(function_name, ", ".join( ["self", "queue", "global_size", "local_size"] + arg_names - + ["global_offset=None", "g_times_l=None", + + ["global_offset=None", + "g_times_l=None", + "allow_empty_ndrange=False", "wait_for=None"]))) with Indentation(gen): @@ -331,7 +333,8 @@ def _generate_enqueue_and_set_args_module(function_name, gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l) + global_offset, wait_for, g_times_l=g_times_l, + allow_empty_ndrange=allow_empty_ndrange) """) # }}} diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 9ff7f023..e7471bd8 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -4527,7 +4527,8 @@ namespace pyopencl py::object py_local_work_size, py::object py_global_work_offset, py::object py_wait_for, - bool g_times_l) + bool g_times_l, + bool allow_empty_ndrange) { PYOPENCL_PARSE_WAIT_FOR; @@ -4582,6 +4583,33 @@ namespace pyopencl global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); } + if (allow_empty_ndrange) + { +#if PYOPENCL_CL_VERSION >= 0x1020 + bool is_empty = false; + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (global_work_size[work_axis] == 0) + is_empty = true; + if (local_work_size_ptr) + for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) + if (local_work_size_ptr[work_axis] == 0) + is_empty = true; + + if (is_empty) + { + cl_event evt; + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, ( + cq.data(), PYOPENCL_WAITLIST_ARGS, &evt)); + PYOPENCL_RETURN_NEW_EVENT(evt); + } +#else + // clEnqueueWaitForEvents + clEnqueueMarker is not equivalent + // in the case of an out-of-order queue. + throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, + "allow_empty_ndrange requires OpenCL 1.2"); +#endif + } + PYOPENCL_RETRY_RETURN_IF_MEM_ERROR( { cl_event evt; PYOPENCL_CALL_GUARDED(clEnqueueNDRangeKernel, ( diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 5ca5efcf..cbd1f9a4 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -461,7 +461,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("local_work_size"), py::arg("global_work_offset")=py::none(), py::arg("wait_for")=py::none(), - py::arg("g_times_l")=false + py::arg("g_times_l")=false, + py::arg("allow_empty_ndrange")=false ); // TODO: clEnqueueNativeKernel -- GitLab From 033b7f4883fe613292fac82a0328787270dea183 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 13:06:09 -0500 Subject: [PATCH 05/14] Empty array: set base_data to None instead of allocating bogus 1-byte buffer --- pyopencl/array.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index bcc0770f..b25ffc57 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -483,23 +483,22 @@ class Array(object): self.allocator = allocator if data is None: - if alloc_nbytes <= 0: - if alloc_nbytes == 0: - # Work around CL not allowing zero-sized buffers. - alloc_nbytes = 1 + if alloc_nbytes < 0: + raise ValueError("cannot allocate CL buffer with " + "negative size") - else: - raise ValueError("cannot allocate CL buffer with " - "negative size") - - if allocator is None: - if context is None and queue is not None: - context = queue.context + elif alloc_nbytes == 0: + self.base_data = None - self.base_data = cl.Buffer( - context, cl.mem_flags.READ_WRITE, alloc_nbytes) else: - self.base_data = self.allocator(alloc_nbytes) + if allocator is None: + if context is None and queue is not None: + context = queue.context + + self.base_data = cl.Buffer( + context, cl.mem_flags.READ_WRITE, alloc_nbytes) + else: + self.base_data = self.allocator(alloc_nbytes) else: self.base_data = data -- GitLab From 578aee1c150b8bb56f04278a9b63d26bd040568a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 13:06:39 -0500 Subject: [PATCH 06/14] runtime_program.rst formatting fixes --- doc/runtime_program.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/runtime_program.rst b/doc/runtime_program.rst index ef549042..77a49591 100644 --- a/doc/runtime_program.rst +++ b/doc/runtime_program.rst @@ -231,6 +231,7 @@ Kernel `_. .. versionchanged:: 0.92 + *local_size* was promoted to third positional argument from being a keyword argument. The old keyword argument usage will continue to be accepted with a warning throughout the 0.92 release cycle. @@ -242,6 +243,7 @@ Kernel it from working. .. versionchanged:: 2011.1 + Added the *g_times_l* keyword arg. .. versionchanged:: 2020.2 -- GitLab From 5c165817465d5d9e2e198d11b81dd27a362f8b9f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 17:20:41 -0500 Subject: [PATCH 07/14] Use CI variable for project name in Github CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51b22c4f..5b9b6393 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh ./pyopencl ./test + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test pytest3: name: Pytest on Py3 -- GitLab From 673f1357d8436c5e07d4d4e6e49e467e17f20ca4 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 17:22:44 -0500 Subject: [PATCH 08/14] Use CI variable for project name in Gitlab CI --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7796b71e..32063a80 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -197,7 +197,7 @@ Pylint: # is only one copy of everything. - PROJECT_INSTALL_FLAGS="--editable" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-pylint.sh - - ". ./prepare-and-run-pylint.sh pyopencl test/test_*.py" + - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 except: @@ -216,7 +216,7 @@ Documentation: Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh - - ". ./prepare-and-run-flake8.sh pyopencl test" + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test tags: - python3 except: -- GitLab From 808451680a31d11168c60b39c025880bbc7a6399 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 17:23:31 -0500 Subject: [PATCH 09/14] Test, fix allocator behavior for zero-size allocations --- src/wrap_mempool.cpp | 9 +++++++++ test/test_wrapper.py | 20 +++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index dceb33dc..16662bc5 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -197,6 +197,15 @@ namespace alloc.try_release_blocks(); } + if (!mem) + { + if (size == 0) + return nullptr; + else + throw pyopencl::error("Allocator", CL_INVALID_VALUE, + "allocator succeeded but returned NULL cl_mem"); + } + try { return new pyopencl::buffer(mem, false); diff --git a/test/test_wrapper.py b/test/test_wrapper.py index dc5772de..1b4a3b42 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -33,7 +33,8 @@ import pyopencl.array as cl_array import pyopencl.cltypes as cltypes import pyopencl.clrandom from pyopencl.tools import ( # noqa - pytest_generate_tests_for_pyopencl as pytest_generate_tests) + pytest_generate_tests_for_pyopencl as pytest_generate_tests, + ImmediateAllocator, DeferredAllocator) from pyopencl.characterize import get_pocl_version # Are CL implementations crashy? You be the judge. :) @@ -573,6 +574,23 @@ def test_mempool_2(ctx_factory): assert asize < asize*(1+1/8) +@pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator]) +def test_allocator(ctx_factory, allocator_cls): + context = ctx_factory() + queue = cl.CommandQueue(context) + + if allocator_cls is DeferredAllocator: + allocator = allocator_cls(context) + else: + allocator = allocator_cls(queue) + + mem = allocator(15) + mem2 = allocator(0) + + assert mem is not None + assert mem2 is None + + def test_vector_args(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) -- GitLab From 299612cfe93de63a23bd89b640cf6830e86e5ffb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 17:59:53 -0500 Subject: [PATCH 10/14] Test, fix zero-size arrays --- pyopencl/array.py | 3 +++ test/test_array.py | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/pyopencl/array.py b/pyopencl/array.py index b25ffc57..35c521b8 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -1222,6 +1222,9 @@ class Array(object): def _zero_fill(self, queue=None, wait_for=None): queue = queue or self.queue + if not self.size: + return + if ( queue._get_cl_version() >= (1, 2) and cl.get_cl_header_version() >= (1, 2)): diff --git a/test/test_array.py b/test/test_array.py index 521f6719..cb0dbee8 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1316,6 +1316,19 @@ def test_outoforderqueue_reductions(ctx_factory): assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 +@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2)]) +def test_zero_size_array(ctx_factory, empty_shape): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a = cl_array.zeros(queue, empty_shape, dtype=np.float32) + b = cl_array.zeros(queue, empty_shape, dtype=np.float32) + b.fill(1) + c = a + b + c_host = c.get() + cl_array.to_device(queue, c_host) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From ee4acf309a6882b8e66f710cc41bf8ceb3d4c208 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 18:00:19 -0500 Subject: [PATCH 11/14] Test allow_empty_ndrange in kernel enqueue --- test/test_wrapper.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 1b4a3b42..6c9fff19 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -1160,6 +1160,26 @@ def test_threaded_nanny_events(ctx_factory): t2.join() +@pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)]) +def test_empty_ndrange(ctx_factory, empty_shape): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + if ctx._get_cl_version() < (1, 2) or cl.get_cl_header_version() < (1, 2): + pytest.skip("OpenCL 1.2 required for empty NDRange suuport") + + a = cl_array.zeros(queue, empty_shape, dtype=np.float32) + + prg = cl.Program(ctx, """ + __kernel void twice(__global float *a_g) + { + a_g[get_global_id(0)] += 2; + } + """).build() + + prg.twice(queue, a.shape, None, a.data, allow_empty_ndrange=True) + + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. import pyopencl # noqa -- GitLab From c5a2b5fd0194a99addfd4b711e5ac9fe3b1f6f0c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 15 Jun 2020 18:25:53 -0500 Subject: [PATCH 12/14] Bump invoker cache version --- pyopencl/invoker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index aab76cc4..57ab4b1f 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -355,7 +355,7 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v6", + "pyopencl-invoker-cache-v7", key_builder=_NumpyTypesKeyBuilder()) -- GitLab From ddbb4a6c33288d02aa9c001cf342ac58700de938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 16 Jun 2020 23:05:36 +0200 Subject: [PATCH 13/14] Apply suggestion to src/wrap_mempool.cpp --- src/wrap_mempool.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index 16662bc5..0c3004ad 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -153,7 +153,7 @@ namespace // on 'out-of-memory' being reported on allocation. (If it is // reported in a deferred manner, it has no way to react // (e.g. by freeing unused memory) because it is not part of - // the call stack. + // the call stack.) unsigned zero = 0; PYOPENCL_CALL_GUARDED(clEnqueueWriteBuffer, ( m_queue.data(), -- GitLab From 625e9a5026e8ccf0ad02dd02e32651f93f19b109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 16 Jun 2020 23:07:43 +0200 Subject: [PATCH 14/14] test_empty_ndrange: twice->add_two --- test/test_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 6c9fff19..63c5e8a0 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -1171,13 +1171,13 @@ def test_empty_ndrange(ctx_factory, empty_shape): a = cl_array.zeros(queue, empty_shape, dtype=np.float32) prg = cl.Program(ctx, """ - __kernel void twice(__global float *a_g) + __kernel void add_two(__global float *a_g) { a_g[get_global_id(0)] += 2; } """).build() - prg.twice(queue, a.shape, None, a.data, allow_empty_ndrange=True) + prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True) if __name__ == "__main__": -- GitLab