From be9fd38f6fd85651422817e27dd1636788c374ad Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 14:22:41 -0600
Subject: [PATCH 01/47] Create special-case set_arg wrappers for common
 integer/float types

---
 src/wrap_cl.hpp        | 7 +++++++
 src/wrap_cl_part_2.cpp | 7 +++++++
 2 files changed, 14 insertions(+)
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index ad5b7ab8..7ffcc983 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4343,6 +4343,13 @@ namespace pyopencl
               sizeof(cl_mem), &m));
       }
 
+      template <typename T>
+      void set_arg_pod(cl_uint arg_index, T pod_value)
+      {
+        PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index,
+              sizeof(T), &pod_value));
+      }
+
       void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
       {
         cl_mem m = moh.data();
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index e68c7851..2c986718 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -467,6 +467,13 @@ void pyopencl_expose_part_2(py::module &m)
 #endif
       .def("_set_arg_null", &cls::set_arg_null)
       .def("_set_arg_buf", &cls::set_arg_buf)
+      .def("_set_arg_mem", &cls::set_arg_mem)
+      .def("_set_arg_i32", &cls::set_arg_pod<int32_t>)
+      .def("_set_arg_i64", &cls::set_arg_pod<int64_t>)
+      .def("_set_arg_u32", &cls::set_arg_pod<uint32_t>)
+      .def("_set_arg_u64", &cls::set_arg_pod<uint64_t>)
+      .def("_set_arg_f32", &cls::set_arg_pod<float>)
+      .def("_set_arg_f64", &cls::set_arg_pod<double>)
 #if PYOPENCL_CL_VERSION >= 0x2000
       .def("_set_arg_svm", &cls::set_arg_svm)
 #endif
-- 
GitLab


From afaad630e905c990413039e5d8490836f5bda3b8 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 14:22:51 -0600
Subject: [PATCH 02/47] Bump version

---
 pyopencl/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/version.py b/pyopencl/version.py
index fee1f80e..e4c47bc9 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2020, 3, 1)
+VERSION = (2021, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
-- 
GitLab


From c9f439e053838e9f9bd2ab95afbae399f8e406d3 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 15:16:49 -0600
Subject: [PATCH 03/47] Avoid malloc in enqueue_nd_range_kernel

---
 src/wrap_cl.hpp      | 38 +++++++++++++++++++++-----------------
 src/wrap_helpers.hpp | 12 ++++++++++++
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 7ffcc983..de35513c 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4642,7 +4642,7 @@ namespace pyopencl
     return result;
   }
 
-
+#define MAX_WS_DIM_COUNT 10
 
   inline
   event *enqueue_nd_range_kernel(
@@ -4659,11 +4659,14 @@ namespace pyopencl
 
     cl_uint work_dim = len(py_global_work_size);
 
-    std::vector<size_t> global_work_size;
-    COPY_PY_LIST(size_t, global_work_size);
+    std::array<size_t, MAX_WS_DIM_COUNT> global_work_size;
+    unsigned gws_index = 0;
+    COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_index);
+
+    std::array<size_t, MAX_WS_DIM_COUNT> local_work_size;
+    unsigned lws_index = 0;
+    size_t *local_work_size_ptr = nullptr;
 
-    size_t *local_work_size_ptr = 0;
-    std::vector<size_t> local_work_size;
     if (py_local_work_size.ptr() != Py_None)
     {
       if (g_times_l)
@@ -4673,31 +4676,32 @@ namespace pyopencl
           throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
               "global/local work sizes have differing dimensions");
 
-      COPY_PY_LIST(size_t, local_work_size);
+      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_index);
 
-      while (local_work_size.size() < work_dim)
-        local_work_size.push_back(1);
-      while (global_work_size.size() < work_dim)
-        global_work_size.push_back(1);
+      while (lws_index < work_dim)
+        local_work_size[lws_index++] = 1;
+      while (gws_index < work_dim)
+        global_work_size[gws_index++] = 1;
 
-      local_work_size_ptr = local_work_size.empty( ) ? nullptr : &local_work_size.front();
+      local_work_size_ptr = &local_work_size.front();
     }
 
-    if (g_times_l && local_work_size_ptr)
+    if (g_times_l && lws_index)
     {
       for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
         global_work_size[work_axis] *= local_work_size[work_axis];
     }
 
-    size_t *global_work_offset_ptr = 0;
-    std::vector<size_t> global_work_offset;
+    size_t *global_work_offset_ptr = nullptr;
+    std::array<size_t, MAX_WS_DIM_COUNT> global_work_offset;
+    unsigned gwo_index = 0;
     if (py_global_work_offset.ptr() != Py_None)
     {
       if (work_dim != unsigned(len(py_global_work_offset)))
         throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
             "global work size and offset have differing dimensions");
 
-      COPY_PY_LIST(size_t, global_work_offset);
+      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_index);
 
       if (g_times_l && local_work_size_ptr)
       {
@@ -4705,7 +4709,7 @@ namespace pyopencl
           global_work_offset[work_axis] *= local_work_size[work_axis];
       }
 
-      global_work_offset_ptr = global_work_offset.empty( ) ? nullptr :  &global_work_offset.front();
+      global_work_offset_ptr = &global_work_offset.front();
     }
 
     if (allow_empty_ndrange)
@@ -4742,7 +4746,7 @@ namespace pyopencl
                 knl.data(),
                 work_dim,
                 global_work_offset_ptr,
-                global_work_size.empty( ) ? nullptr : &global_work_size.front(),
+                &global_work_size.front(),
                 local_work_size_ptr,
                 PYOPENCL_WAITLIST_ARGS, &evt
                 ));
diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp
index 4799244f..cabc012d 100644
--- a/src/wrap_helpers.hpp
+++ b/src/wrap_helpers.hpp
@@ -71,6 +71,18 @@ namespace py = pybind11;
       NAME.push_back(it.cast<TYPE>()); \
   }
 
+#define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \
+  { \
+    COUNTER = 0; \
+    for (auto it: py_##NAME) \
+    { \
+      if (COUNTER == NAME.size()) \
+        throw error(FUNC_NAME, \
+            CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \
+      NAME[COUNTER++] = it.cast<TYPE>(); \
+    } \
+  }
+
 #define COPY_PY_COORD_TRIPLE(NAME) \
   size_t NAME[3] = {0, 0, 0}; \
   { \
-- 
GitLab


From c6aad43ed6247da5e376cd0da38cf8fcb9e4cfcf Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 15:53:44 -0600
Subject: [PATCH 04/47] Privatize cl.array.{splay,Array.get_sizes}, cache
 _get_sizes results

---
 pyopencl/array.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 297b4eff..75e744e3 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -103,19 +103,18 @@ class VecLookupWarner:
 
 vec = VecLookupWarner()
 
-# {{{ helper functionality
 
+# {{{ helper functionality
 
-def splay(queue, n, kernel_specific_max_wg_size=None):
-    dev = queue.device
-    max_work_items = _builtin_min(128, dev.max_work_group_size)
+def _splay(device, n, kernel_specific_max_wg_size=None):
+    max_work_items = _builtin_min(128, device.max_work_group_size)
 
     if kernel_specific_max_wg_size is not None:
         from builtins import min
         max_work_items = min(max_work_items, kernel_specific_max_wg_size)
 
     min_work_items = _builtin_min(32, max_work_items)
-    max_groups = dev.max_compute_units * 4 * 8
+    max_groups = device.max_compute_units * 4 * 8
     # 4 to overfill the device
     # 8 is an Nvidia constant--that's how many
     # groups fit onto one compute device
@@ -163,7 +162,7 @@ def elwise_kernel_runner(kernel_getter):
 
         knl = kernel_getter(*args, **kwargs)
 
-        gs, ls = repr_ary.get_sizes(queue,
+        gs, ls = repr_ary._get_sizes(queue,
                 knl.get_work_group_info(
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
@@ -241,6 +240,9 @@ class _copy_queue:  # noqa
     pass
 
 
+_ARRAY_GET_SIZES_CACHE = {}
+
+
 class Array:
     """A :class:`numpy.ndarray` work-alike that stores its data and performs
     its computations on the compute device.  *shape* and *dtype* work exactly
@@ -604,12 +606,17 @@ class Array:
         return self._new_with_changes(self.base_data, self.offset,
                 queue=queue)
 
-    #@memoize_method FIXME: reenable
-    def get_sizes(self, queue, kernel_specific_max_wg_size=None):
+    def _get_sizes(self, queue, kernel_specific_max_wg_size=None):
         if not self.flags.forc:
             raise NotImplementedError("cannot operate on non-contiguous array")
-        return splay(queue, self.size,
-                kernel_specific_max_wg_size=kernel_specific_max_wg_size)
+        cache_key = (queue.device.int_ptr, self.size, kernel_specific_max_wg_size)
+        try:
+            return _ARRAY_GET_SIZES_CACHE[cache_key]
+        except KeyError:
+            sizes = _splay(queue.device, self.size,
+                    kernel_specific_max_wg_size=kernel_specific_max_wg_size)
+            _ARRAY_GET_SIZES_CACHE[cache_key] = sizes
+            return sizes
 
     def set(self, ary, queue=None, async_=None, **kwargs):
         """Transfer the contents the :class:`numpy.ndarray` object *ary*
@@ -2289,7 +2296,7 @@ def multi_take(arrays, indices, out=None, queue=None):
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
 
-        gs, ls = indices.get_sizes(queue,
+        gs, ls = indices._get_sizes(queue,
                 knl.get_work_group_info(
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
@@ -2367,7 +2374,7 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
 
-        gs, ls = src_indices.get_sizes(queue,
+        gs, ls = src_indices._get_sizes(queue,
                 knl.get_work_group_info(
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
@@ -2450,7 +2457,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
 
-        gs, ls = dest_indices.get_sizes(queue,
+        gs, ls = dest_indices._get_sizes(queue,
                 knl.get_work_group_info(
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
-- 
GitLab


From 367800529a7f2c3840c51d7c52237b7b29282336 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 17:57:23 -0600
Subject: [PATCH 05/47] Fix a few remaining invocations of splay

---
 pyopencl/algorithm.py   | 4 ++--
 pyopencl/clrandom.py    | 4 ++--
 pyopencl/elementwise.py | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index c4eb43ea..2b5508b3 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -1188,8 +1188,8 @@ class ListOfListsBuilder:
             gsize = (4*queue.device.max_compute_units,)
             lsize = (1,)
         else:
-            from pyopencl.array import splay
-            gsize, lsize = splay(queue, n_objects)
+            from pyopencl.array import _splay
+            gsize, lsize = _splay(queue, n_objects)
 
         count_event = count_kernel(queue, gsize, lsize,
                 *(tuple(count_list_args) + data_args + (n_objects,)),
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index ea3862bc..83f2c4bc 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -662,8 +662,8 @@ class Random123GeneratorBase:
                 scale, shift]
 
         n = ary.size
-        from pyopencl.array import splay
-        gsize, lsize = splay(queue, ary.size)
+        from pyopencl.array import _splay
+        gsize, lsize = _splay(queue, ary.size)
 
         evt = knl(queue, gsize, lsize, *args)
         ary.add_event(evt)
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index df364eda..0fcabd95 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -307,13 +307,13 @@ class ElementwiseKernel:
 
             invocation_args.append(step)
 
-            from pyopencl.array import splay
-            gs, ls = splay(queue,
+            from pyopencl.array import _splay
+            gs, ls = _splay(queue,
                     abs(range_.stop - start)//step,
                     max_wg_size)
         else:
             invocation_args.append(repr_vec.size)
-            gs, ls = repr_vec.get_sizes(queue, max_wg_size)
+            gs, ls = repr_vec._get_sizes(queue, max_wg_size)
 
         if capture_as is not None:
             kernel.set_args(*invocation_args)
-- 
GitLab


From 39d4fadbc4527972c2ac268a72cacfa143eff8f1 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 17:57:55 -0600
Subject: [PATCH 06/47] Avoid a few calls to len() in enqueue_nd_range_kernel

---
 src/wrap_cl.hpp | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index de35513c..6679ed3f 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -389,14 +389,15 @@
     \
     if (py_wait_for.ptr() != Py_None) \
     { \
-      event_wait_list.resize(len(py_wait_for)); \
       for (py::handle evt: py_wait_for) \
-        event_wait_list[num_events_in_wait_list++] = \
-          evt.cast<const event &>().data(); \
+      { \
+        event_wait_list.push_back(evt.cast<const event &>().data()); \
+        ++num_events_in_wait_list; \
+      } \
     }
 
 #define PYOPENCL_WAITLIST_ARGS \
-    num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front()
+    num_events_in_wait_list, (num_events_in_wait_list == 0) ? nullptr : &event_wait_list.front()
 
 #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \
     try \
@@ -4657,36 +4658,35 @@ namespace pyopencl
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
-    cl_uint work_dim = len(py_global_work_size);
-
     std::array<size_t, MAX_WS_DIM_COUNT> global_work_size;
-    unsigned gws_index = 0;
-    COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_index);
+    unsigned gws_size = 0;
+    COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size);
+    cl_uint work_dim = gws_size;
 
     std::array<size_t, MAX_WS_DIM_COUNT> local_work_size;
-    unsigned lws_index = 0;
+    unsigned lws_size = 0;
     size_t *local_work_size_ptr = nullptr;
 
     if (py_local_work_size.ptr() != Py_None)
     {
+      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size);
+
       if (g_times_l)
-        work_dim = std::max(work_dim, unsigned(len(py_local_work_size)));
+        work_dim = std::max(work_dim, lws_size);
       else
-        if (work_dim != unsigned(len(py_local_work_size)))
+        if (work_dim != lws_size)
           throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
               "global/local work sizes have differing dimensions");
 
-      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_index);
-
-      while (lws_index < work_dim)
-        local_work_size[lws_index++] = 1;
-      while (gws_index < work_dim)
-        global_work_size[gws_index++] = 1;
+      while (lws_size < work_dim)
+        local_work_size[lws_size++] = 1;
+      while (gws_size < work_dim)
+        global_work_size[gws_size++] = 1;
 
       local_work_size_ptr = &local_work_size.front();
     }
 
-    if (g_times_l && lws_index)
+    if (g_times_l && lws_size)
     {
       for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
         global_work_size[work_axis] *= local_work_size[work_axis];
@@ -4694,15 +4694,15 @@ namespace pyopencl
 
     size_t *global_work_offset_ptr = nullptr;
     std::array<size_t, MAX_WS_DIM_COUNT> global_work_offset;
-    unsigned gwo_index = 0;
     if (py_global_work_offset.ptr() != Py_None)
     {
-      if (work_dim != unsigned(len(py_global_work_offset)))
+      unsigned gwo_size = 0;
+      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size);
+
+      if (work_dim != gwo_size)
         throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE,
             "global work size and offset have differing dimensions");
 
-      COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_index);
-
       if (g_times_l && local_work_size_ptr)
       {
         for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis)
-- 
GitLab


From cace1cc1d5e8860b71930b8ba15ce27a3ec656a8 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 15 Jan 2021 18:42:49 -0600
Subject: [PATCH 07/47] Actually pass a device to _splay on invocations outside
 pyopencl.array

---
 pyopencl/algorithm.py   | 2 +-
 pyopencl/clrandom.py    | 2 +-
 pyopencl/elementwise.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index 2b5508b3..446eb9c3 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -1189,7 +1189,7 @@ class ListOfListsBuilder:
             lsize = (1,)
         else:
             from pyopencl.array import _splay
-            gsize, lsize = _splay(queue, n_objects)
+            gsize, lsize = _splay(queue.device, n_objects)
 
         count_event = count_kernel(queue, gsize, lsize,
                 *(tuple(count_list_args) + data_args + (n_objects,)),
diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py
index 83f2c4bc..dd6c1276 100644
--- a/pyopencl/clrandom.py
+++ b/pyopencl/clrandom.py
@@ -663,7 +663,7 @@ class Random123GeneratorBase:
 
         n = ary.size
         from pyopencl.array import _splay
-        gsize, lsize = _splay(queue, ary.size)
+        gsize, lsize = _splay(queue.device, ary.size)
 
         evt = knl(queue, gsize, lsize, *args)
         ary.add_event(evt)
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 0fcabd95..039cc7cf 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -308,7 +308,7 @@ class ElementwiseKernel:
             invocation_args.append(step)
 
             from pyopencl.array import _splay
-            gs, ls = _splay(queue,
+            gs, ls = _splay(queue.device,
                     abs(range_.stop - start)//step,
                     max_wg_size)
         else:
-- 
GitLab


From ee989c8a23ff85b9c45177598cd7684240993cb8 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 18:16:03 -0600
Subject: [PATCH 08/47] Revert "Create special-case set_arg wrappers for common
 integer/float types"

This reverts commit be9fd38f6fd85651422817e27dd1636788c374ad.
---
 src/wrap_cl.hpp        | 7 -------
 src/wrap_cl_part_2.cpp | 7 -------
 2 files changed, 14 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 6679ed3f..20db54ba 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4344,13 +4344,6 @@ namespace pyopencl
               sizeof(cl_mem), &m));
       }
 
-      template <typename T>
-      void set_arg_pod(cl_uint arg_index, T pod_value)
-      {
-        PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index,
-              sizeof(T), &pod_value));
-      }
-
       void set_arg_mem(cl_uint arg_index, memory_object_holder &moh)
       {
         cl_mem m = moh.data();
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 2c986718..e68c7851 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -467,13 +467,6 @@ void pyopencl_expose_part_2(py::module &m)
 #endif
       .def("_set_arg_null", &cls::set_arg_null)
       .def("_set_arg_buf", &cls::set_arg_buf)
-      .def("_set_arg_mem", &cls::set_arg_mem)
-      .def("_set_arg_i32", &cls::set_arg_pod<int32_t>)
-      .def("_set_arg_i64", &cls::set_arg_pod<int64_t>)
-      .def("_set_arg_u32", &cls::set_arg_pod<uint32_t>)
-      .def("_set_arg_u64", &cls::set_arg_pod<uint64_t>)
-      .def("_set_arg_f32", &cls::set_arg_pod<float>)
-      .def("_set_arg_f64", &cls::set_arg_pod<double>)
 #if PYOPENCL_CL_VERSION >= 0x2000
       .def("_set_arg_svm", &cls::set_arg_svm)
 #endif
-- 
GitLab


From 8c63f87f82767cc155499dc48fc34530237d3f91 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:02:50 -0600
Subject: [PATCH 09/47] Remove a pypy compatibility hack for checking for
 object-array-ness

---
 pyopencl/array.py | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 75e744e3..75f531d6 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -69,19 +69,6 @@ def _get_truedivide_dtype(obj1, obj2, queue):
     return result
 
 
-# Work around PyPy not currently supporting the object dtype.
-# (Yes, it doesn't even support checking!)
-# (as of May 27, 2014 on PyPy 2.3)
-try:
-    np.dtype(object)
-
-    def _dtype_is_object(t):
-        return t == object
-except Exception:
-    def _dtype_is_object(t):
-        return False
-
-
 class InconsistentOpenCLQueueWarning(UserWarning):
     pass
 
@@ -499,9 +486,8 @@ class Array:
 
         # }}}
 
-        if _dtype_is_object(dtype):
-            raise TypeError("object arrays on the compute device are not allowed")
-
+        assert dtype != np.object, \
+                "object arrays on the compute device are not allowed"
         assert isinstance(shape, tuple)
         assert isinstance(strides, tuple)
 
@@ -2080,7 +2066,7 @@ def to_device(queue, ary, allocator=None, async_=None,
 
     # }}}
 
-    if _dtype_is_object(ary.dtype):
+    if ary.dtype == np.object:
         raise RuntimeError("to_device does not work on object arrays.")
 
     if array_queue is _same_as_transfer:
-- 
GitLab


From 05050cd2ee1bfdb2df8ced434d73d61b356bb152 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:03:50 -0600
Subject: [PATCH 10/47] Inline _make_strides in pyopencl.array

---
 pyopencl/array.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 75f531d6..f245a342 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -198,15 +198,6 @@ class DefaultAllocator(cl.tools.DeferredAllocator):
                 DeprecationWarning, 2)
         cl.tools.DeferredAllocator.__init__(self, *args, **kwargs)
 
-
-def _make_strides(itemsize, shape, order):
-    if order in "fF":
-        return _f_contiguous_strides(itemsize, shape)
-    elif order in "cC":
-        return _c_contiguous_strides(itemsize, shape)
-    else:
-        raise ValueError("invalid order: %s" % order)
-
 # }}}
 
 
@@ -476,7 +467,19 @@ class Array:
             size = size.item()
 
         if strides is None:
-            strides = _make_strides(dtype.itemsize, shape, order)
+            if order in "cC":
+                # inlined from compyte.array.c_contiguous_strides
+                if shape:
+                    strides = [dtype.itemsize]
+                    for s in shape[:0:-1]:
+                        strides.append(strides[-1]*s)
+                    strides = tuple(strides[::-1])
+                else:
+                    strides = ()
+            elif order in "fF":
+                strides = _f_contiguous_strides(dtype.itemsize, shape)
+            else:
+                raise ValueError("invalid order: %s" % order)
 
         else:
             # FIXME: We should possibly perform some plausibility
-- 
GitLab


From a8c0184dbcaaabdb5374e6408591e392757daa76 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:04:46 -0600
Subject: [PATCH 11/47] Cache result of _get_common_dtype

---
 pyopencl/array.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index f245a342..63f85fa3 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -45,9 +45,33 @@ from pyopencl.characterize import has_double_support
 from pyopencl import cltypes
 
 
+_COMMON_DTYPE_CACHE = {}
+
+
 def _get_common_dtype(obj1, obj2, queue):
-    return _get_common_dtype_base(obj1, obj2,
-                                  has_double_support(queue.device))
+    dsupport = has_double_support(queue.device)
+    cache_key = None
+    o1_dtype = obj1.dtype
+    try:
+        cache_key = (o1_dtype, obj2.dtype, dsupport)
+        return _COMMON_DTYPE_CACHE[cache_key]
+    except KeyError:
+        pass
+    except AttributeError:
+        # obj2 doesn't have a dtype
+        try:
+            cache_key = (o1_dtype, type(obj2), dsupport)
+            return _COMMON_DTYPE_CACHE[cache_key]
+        except KeyError:
+            pass
+
+    result = _get_common_dtype_base(obj1, obj2, dsupport)
+
+    # we succeeded in constructing the cache key
+    if cache_key is not None:
+        _COMMON_DTYPE_CACHE[cache_key] = result
+
+    return result
 
 
 def _get_truedivide_dtype(obj1, obj2, queue):
-- 
GitLab


From eb47983a0faf579253410e8df9aa1a4d690d6057 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:06:05 -0600
Subject: [PATCH 12/47] Avoid generator expression in checking for negative
 shape entries

---
 pyopencl/array.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 63f85fa3..4d9bc124 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -475,6 +475,9 @@ class Array:
             size = 1
             for dim in shape:
                 size *= dim
+                if dim < 0:
+                    raise ValueError("negative dimensions are not allowed")
+
         except TypeError:
             admissible_types = (int, np.integer)
 
@@ -482,11 +485,10 @@ class Array:
                 raise TypeError("shape must either be iterable or "
                         "castable to an integer")
             size = shape
+            if shape < 0:
+                raise ValueError("negative dimensions are not allowed")
             shape = (shape,)
 
-        if any(dim < 0 for dim in shape):
-            raise ValueError("negative dimensions are not allowed")
-
         if isinstance(size, np.integer):
             size = size.item()
 
-- 
GitLab


From 50705a965eb475d541150bcb3cba3e1fc6cfcc02 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:07:15 -0600
Subject: [PATCH 13/47] Do not recompute flags in arrays made by _new_like_me

---
 pyopencl/array.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 4d9bc124..10547f84 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -33,7 +33,6 @@ from functools import reduce
 import numpy as np
 import pyopencl.elementwise as elementwise
 import pyopencl as cl
-from pytools import memoize_method
 from pyopencl.compyte.array import (
         as_strided as _as_strided,
         f_contiguous_strides as _f_contiguous_strides,
@@ -431,7 +430,7 @@ class Array:
     __array_priority__ = 100
 
     def __init__(self, cq, shape, dtype, order="C", allocator=None,
-            data=None, offset=0, strides=None, events=None):
+            data=None, offset=0, strides=None, events=None, _flags=None):
         # {{{ backward compatibility
 
         if isinstance(cq, cl.CommandQueue):
@@ -556,6 +555,7 @@ class Array:
 
         self.offset = offset
         self.context = context
+        self._flags = _flags
 
     @property
     def ndim(self):
@@ -569,9 +569,11 @@ class Array:
             return self.base_data
 
     @property
-    @memoize_method
     def flags(self):
-        return _ArrayFlags(self)
+        f = self._flags
+        if f is None:
+            self._flags = f = _ArrayFlags(self)
+        return f
 
     def _new_with_changes(self, data, offset, shape=None, dtype=None,
             strides=None, queue=_copy_queue, allocator=None):
@@ -977,19 +979,21 @@ class Array:
 
     def _new_like_me(self, dtype=None, queue=None):
         strides = None
+        flags = None
         if dtype is None:
             dtype = self.dtype
 
         if dtype == self.dtype:
             strides = self.strides
+            flags = self.flags
 
         queue = queue or self.queue
         if queue is not None:
             return self.__class__(queue, self.shape, dtype,
-                    allocator=self.allocator, strides=strides)
+                    allocator=self.allocator, strides=strides, _flags=flags)
         else:
             return self.__class__(self.context, self.shape, dtype,
-                    strides=strides, allocator=self.allocator)
+                    strides=strides, allocator=self.allocator, _flags=flags)
 
     @staticmethod
     @elwise_kernel_runner
-- 
GitLab


From b7c052f9f5829152cb539adf52320b8ca93cf846 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 21:11:09 -0600
Subject: [PATCH 14/47] Update compyte for equal_strides fast path

---
 pyopencl/compyte | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/compyte b/pyopencl/compyte
index fbfe788a..7533db88 160000
--- a/pyopencl/compyte
+++ b/pyopencl/compyte
@@ -1 +1 @@
-Subproject commit fbfe788a2dcb190fd241fd42ad047e33bafd85b8
+Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d
-- 
GitLab


From f1db98f9c1e0b94d5ec6685c5452d1a838835efa Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 22:09:43 -0600
Subject: [PATCH 15/47] Invoker gen: check for _KERNEL_ARG_CLASSES before None

---
 pyopencl/invoker.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index c996768d..f4e97615 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -81,10 +81,10 @@ def generate_generic_arg_handler(gen, arg_idx, arg_var):
     from pytools.py_codegen import Indentation
 
     gen("""
-        if {arg_var} is None:
-            self._set_arg_null({arg_idx})
-        elif isinstance({arg_var}, _KERNEL_ARG_CLASSES):
+        if isinstance({arg_var}, _KERNEL_ARG_CLASSES):
             self.set_arg({arg_idx}, {arg_var})
+        elif {arg_var} is None:
+            self._set_arg_null({arg_idx})
         """
         .format(arg_idx=arg_idx, arg_var=arg_var))
 
@@ -341,7 +341,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v7",
+        "pyopencl-invoker-cache-v11",
         key_builder=_NumpyTypesKeyBuilder())
 
 
-- 
GitLab


From 66d0d25cc43773d08cbb42f30f33e267172f4627 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 22:11:57 -0600
Subject: [PATCH 16/47] Make a kernel-specific class to override __call__ and
 avoid an indirect call on kernel enqueue

---
 pyopencl/__init__.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 6e0268dc..8a21d804 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -847,12 +847,22 @@ def _add_functionality():
         # }}}
 
         from pyopencl.invoker import generate_enqueue_and_set_args
-        self._enqueue, self._set_args = generate_enqueue_and_set_args(
-                self.function_name,
-                len(scalar_arg_dtypes), self.num_args,
-                self._scalar_arg_dtypes,
-                warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug)
+        enqueue, set_args = \
+                generate_enqueue_and_set_args(
+                        self.function_name,
+                        len(scalar_arg_dtypes), self.num_args,
+                        self._scalar_arg_dtypes,
+                        warn_about_arg_count_bug=warn_about_arg_count_bug,
+                        work_around_arg_count_bug=work_around_arg_count_bug)
+
+        # Make ourselves a kernel-specific class, so that we're able to override
+        # __call__. Inspired by https://stackoverflow.com/a/38541437
+        class KernelWithOverriddenCall(type(self)):
+            pass
+
+        self.__class__ = KernelWithOverriddenCall
+        KernelWithOverriddenCall.__call__ = enqueue
+        KernelWithOverriddenCall._set_args = set_args
 
     def kernel_get_work_group_info(self, param, device):
         try:
@@ -871,6 +881,9 @@ def _add_functionality():
     def kernel_call(self, queue, global_size, local_size, *args, **kwargs):
         # __call__ can't be overridden directly, so we need this
         # trampoline hack.
+
+        # Note: This is only used for the generic __call__, before
+        # kernel_set_scalar_arg_dtypes is called.
         return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
 
     def kernel_capture_call(self, filename, queue, global_size, local_size,
-- 
GitLab


From 0e0683f620da3b2ecccbd6381b22a6590df381d8 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 22:24:29 -0600
Subject: [PATCH 17/47] Do not cache result in _get_common_dtype if obj2 is of
 type int

---
 pyopencl/array.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 10547f84..56c055ae 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -59,8 +59,13 @@ def _get_common_dtype(obj1, obj2, queue):
     except AttributeError:
         # obj2 doesn't have a dtype
         try:
-            cache_key = (o1_dtype, type(obj2), dsupport)
-            return _COMMON_DTYPE_CACHE[cache_key]
+            tobj2 = type(obj2)
+            cache_key = (o1_dtype, tobj2, dsupport)
+
+            # Integers are weird, sized, and signed. Don't pretend that 'int'
+            # is enough information to decide what should happen.
+            if tobj2 != int:
+                return _COMMON_DTYPE_CACHE[cache_key]
         except KeyError:
             pass
 
-- 
GitLab


From 6f759230b8c7a4a3e0d8dc98a87d7f55eeeac39c Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 22:47:54 -0600
Subject: [PATCH 18/47] Fix overriding of custom set_args in
 KernelWithOverriddenCall

---
 pyopencl/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 8a21d804..3aeca27f 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -862,7 +862,7 @@ def _add_functionality():
 
         self.__class__ = KernelWithOverriddenCall
         KernelWithOverriddenCall.__call__ = enqueue
-        KernelWithOverriddenCall._set_args = set_args
+        KernelWithOverriddenCall.set_args = set_args
 
     def kernel_get_work_group_info(self, param, device):
         try:
-- 
GitLab


From 75e460b7405aa59560c0e9f2bb2497894cd7a584 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 22:56:45 -0600
Subject: [PATCH 19/47] Streamline invoker code (and its generation)

---
 pyopencl/invoker.py | 59 +++++++++++----------------------------------
 1 file changed, 14 insertions(+), 45 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index f4e97615..6125628b 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -28,6 +28,7 @@ import numpy as np
 from warnings import warn
 import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
+from pytools.py_codegen import Indentation
 from pyopencl.tools import _NumpyTypesKeyBuilder
 
 _PYPY = "__pypy__" in sys.builtin_module_names
@@ -53,8 +54,6 @@ del _size_t_char
 # {{{ individual arg handling
 
 def generate_buffer_arg_setter(gen, arg_idx, buf_var):
-    from pytools.py_codegen import Indentation
-
     if _PYPY:
         # https://github.com/numpy/numpy/issues/5381
         gen(f"if isinstance({buf_var}, np.generic):")
@@ -69,29 +68,6 @@ def generate_buffer_arg_setter(gen, arg_idx, buf_var):
         """
         .format(arg_idx=arg_idx, buf_var=buf_var))
 
-
-def generate_bytes_arg_setter(gen, arg_idx, buf_var):
-    gen("""
-        self._set_arg_buf({arg_idx}, {buf_var})
-        """
-        .format(arg_idx=arg_idx, buf_var=buf_var))
-
-
-def generate_generic_arg_handler(gen, arg_idx, arg_var):
-    from pytools.py_codegen import Indentation
-
-    gen("""
-        if isinstance({arg_var}, _KERNEL_ARG_CLASSES):
-            self.set_arg({arg_idx}, {arg_var})
-        elif {arg_var} is None:
-            self._set_arg_null({arg_idx})
-        """
-        .format(arg_idx=arg_idx, arg_var=arg_var))
-
-    gen("else:")
-    with Indentation(gen):
-        generate_buffer_arg_setter(gen, arg_idx, arg_var)
-
 # }}}
 
 
@@ -108,7 +84,7 @@ def generate_generic_arg_handling_body(num_args):
         gen(f"# process argument {i}")
         gen("")
         gen(f"current_arg = {i}")
-        generate_generic_arg_handler(gen, i, "arg%d" % i)
+        gen(f"self.set_arg({i}, arg{i})")
         gen("")
 
     return gen
@@ -141,7 +117,7 @@ def generate_specific_arg_handling_body(function_name,
         arg_var = "arg%d" % arg_idx
 
         if arg_dtype is None:
-            generate_generic_arg_handler(gen, cl_arg_idx, arg_var)
+            gen(f"self.set_arg({cl_arg_idx}, {arg_var})")
             cl_arg_idx += 1
             gen("")
             continue
@@ -149,7 +125,7 @@ def generate_specific_arg_handling_body(function_name,
         arg_dtype = np.dtype(arg_dtype)
 
         if arg_dtype.char == "V":
-            generate_generic_arg_handler(gen, cl_arg_idx, arg_var)
+            gen(f"self.set_arg({cl_arg_idx}, {arg_var})")
             cl_arg_idx += 1
 
         elif arg_dtype.kind == "c":
@@ -173,13 +149,13 @@ def generate_specific_arg_handling_body(function_name,
                 gen(
                         "buf = pack('{arg_char}', {arg_var}.real)"
                         .format(arg_char=arg_char, arg_var=arg_var))
-                generate_bytes_arg_setter(gen, cl_arg_idx, "buf")
+                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
                 cl_arg_idx += 1
                 gen("current_arg = current_arg + 1000")
                 gen(
                         "buf = pack('{arg_char}', {arg_var}.imag)"
                         .format(arg_char=arg_char, arg_var=arg_var))
-                generate_bytes_arg_setter(gen, cl_arg_idx, "buf")
+                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
                 cl_arg_idx += 1
 
             elif (work_around_arg_count_bug == "apple"
@@ -195,7 +171,7 @@ def generate_specific_arg_handling_body(function_name,
                         "buf = pack('{arg_char}{arg_char}', "
                         "{arg_var}.real, {arg_var}.imag)"
                         .format(arg_char=arg_char, arg_var=arg_var))
-                generate_bytes_arg_setter(gen, cl_arg_idx, "buf")
+                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
                 cl_arg_idx += 1
 
             fp_arg_count += 2
@@ -211,7 +187,7 @@ def generate_specific_arg_handling_body(function_name,
                     .format(
                         arg_char=arg_char,
                         arg_var=arg_var))
-            generate_bytes_arg_setter(gen, cl_arg_idx, "buf")
+            gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
             cl_arg_idx += 1
 
         gen("")
@@ -268,13 +244,6 @@ def wrap_in_error_handler(body, arg_names):
 # }}}
 
 
-def add_local_imports(gen):
-    gen("import numpy as np")
-    gen("import pyopencl._cl as _cl")
-    gen("from pyopencl import _KERNEL_ARG_CLASSES")
-    gen("")
-
-
 def _generate_enqueue_and_set_args_module(function_name,
         num_passed_args, num_cl_args,
         scalar_arg_dtypes,
@@ -292,12 +261,14 @@ def _generate_enqueue_and_set_args_module(function_name,
                 warn_about_arg_count_bug=warn_about_arg_count_bug,
                 work_around_arg_count_bug=work_around_arg_count_bug)
 
-    err_handler = wrap_in_error_handler(body, arg_names)
+    body = wrap_in_error_handler(body, arg_names)
 
     gen = PythonCodeGenerator()
 
     gen("from struct import pack")
     gen("from pyopencl import status_code")
+    gen("import numpy as np")
+    gen("import pyopencl._cl as _cl")
     gen("")
 
     # {{{ generate _enqueue
@@ -314,8 +285,7 @@ def _generate_enqueue_and_set_args_module(function_name,
                         "wait_for=None"])))
 
     with Indentation(gen):
-        add_local_imports(gen)
-        gen.extend(err_handler)
+        gen.extend(body)
 
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
@@ -332,8 +302,7 @@ def _generate_enqueue_and_set_args_module(function_name,
             % (", ".join(["self"] + arg_names)))
 
     with Indentation(gen):
-        add_local_imports(gen)
-        gen.extend(err_handler)
+        gen.extend(body)
 
     # }}}
 
@@ -341,7 +310,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v11",
+        "pyopencl-invoker-cache-v17",
         key_builder=_NumpyTypesKeyBuilder())
 
 
-- 
GitLab


From 03e6e62f03478539ca3f32e0de1e430e2577031d Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 23:04:30 -0600
Subject: [PATCH 20/47] Rename
 KernelWithOverriddenCall->KernelWithCustomEnqueue

---
 pyopencl/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 3aeca27f..7cc3f3af 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -857,12 +857,12 @@ def _add_functionality():
 
         # Make ourselves a kernel-specific class, so that we're able to override
         # __call__. Inspired by https://stackoverflow.com/a/38541437
-        class KernelWithOverriddenCall(type(self)):
+        class KernelWithCustomEnqueue(type(self)):
             pass
 
-        self.__class__ = KernelWithOverriddenCall
-        KernelWithOverriddenCall.__call__ = enqueue
-        KernelWithOverriddenCall.set_args = set_args
+        self.__class__ = KernelWithCustomEnqueue
+        KernelWithCustomEnqueue.__call__ = enqueue
+        KernelWithCustomEnqueue.set_args = set_args
 
     def kernel_get_work_group_info(self, param, device):
         try:
-- 
GitLab


From 7e9df9040c72c0b92bbeb5c398b403da9b24c09a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 23:08:50 -0600
Subject: [PATCH 21/47] Turn off generation of debug helper code in invoker
 with sys.flag.optimize

---
 pyopencl/invoker.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 6125628b..f043ae4b 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -96,7 +96,8 @@ def generate_generic_arg_handling_body(num_args):
 
 def generate_specific_arg_handling_body(function_name,
         num_cl_args, scalar_arg_dtypes,
-        work_around_arg_count_bug, warn_about_arg_count_bug):
+        work_around_arg_count_bug, warn_about_arg_count_bug,
+        include_debug_helpers):
 
     assert work_around_arg_count_bug is not None
     assert warn_about_arg_count_bug is not None
@@ -113,7 +114,8 @@ def generate_specific_arg_handling_body(function_name,
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
         gen(f"# process argument {arg_idx}")
         gen("")
-        gen(f"current_arg = {arg_idx}")
+        if include_debug_helpers:
+            gen(f"current_arg = {arg_idx}")
         arg_var = "arg%d" % arg_idx
 
         if arg_dtype is None:
@@ -151,7 +153,8 @@ def generate_specific_arg_handling_body(function_name,
                         .format(arg_char=arg_char, arg_var=arg_var))
                 gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
                 cl_arg_idx += 1
-                gen("current_arg = current_arg + 1000")
+                if include_debug_helpers:
+                    gen("current_arg = current_arg + 1000")
                 gen(
                         "buf = pack('{arg_char}', {arg_var}.imag)"
                         .format(arg_char=arg_char, arg_var=arg_var))
@@ -247,7 +250,8 @@ def wrap_in_error_handler(body, arg_names):
 def _generate_enqueue_and_set_args_module(function_name,
         num_passed_args, num_cl_args,
         scalar_arg_dtypes,
-        work_around_arg_count_bug, warn_about_arg_count_bug):
+        work_around_arg_count_bug, warn_about_arg_count_bug,
+        include_debug_helpers):
 
     from pytools.py_codegen import PythonCodeGenerator, Indentation
 
@@ -259,9 +263,11 @@ def _generate_enqueue_and_set_args_module(function_name,
         body = generate_specific_arg_handling_body(
                 function_name, num_cl_args, scalar_arg_dtypes,
                 warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug)
+                work_around_arg_count_bug=work_around_arg_count_bug,
+                include_debug_helpers=include_debug_helpers)
 
-    body = wrap_in_error_handler(body, arg_names)
+    if include_debug_helpers:
+        body = wrap_in_error_handler(body, arg_names)
 
     gen = PythonCodeGenerator()
 
@@ -310,7 +316,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v17",
+        "pyopencl-invoker-cache-v20",
         key_builder=_NumpyTypesKeyBuilder())
 
 
@@ -321,7 +327,8 @@ def generate_enqueue_and_set_args(function_name,
 
     cache_key = (function_name, num_passed_args, num_cl_args,
             scalar_arg_dtypes,
-            work_around_arg_count_bug, warn_about_arg_count_bug)
+            work_around_arg_count_bug, warn_about_arg_count_bug,
+            not sys.flags.optimize)
 
     from_cache = False
 
-- 
GitLab


From 95b290f32802f50a8b190cb445baade2853187c9 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 23:48:06 -0600
Subject: [PATCH 22/47] kernel_get_work_group_info: Use Device.int_ptr in cache
 key

---
 pyopencl/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 7cc3f3af..c3c9cf3f 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -865,13 +865,14 @@ def _add_functionality():
         KernelWithCustomEnqueue.set_args = set_args
 
     def kernel_get_work_group_info(self, param, device):
+        cache_key = (param, device.int_ptr)
         try:
-            return self._wg_info_cache[param, device]
+            return self._wg_info_cache[cache_key]
         except KeyError:
             pass
 
         result = kernel_old_get_work_group_info(self, param, device)
-        self._wg_info_cache[param, device] = result
+        self._wg_info_cache[cache_key] = result
         return result
 
     def kernel_set_args(self, *args, **kwargs):
-- 
GitLab


From 17c4795827e4f820c5286de591f701f5cd329f23 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 17 Jan 2021 23:49:31 -0600
Subject: [PATCH 23/47] Rewrite first_arg_dependent_memoize without decorator
 package, drop dep

---
 pyopencl/tools.py | 37 ++++++++++++++++++++-----------------
 setup.py          |  1 -
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 5b814685..ff5c7965 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -32,7 +32,6 @@ from sys import intern
 # Do not add a pyopencl import here: This will add an import cycle.
 
 import numpy as np
-from decorator import decorator
 from pytools import memoize, memoize_method
 from pyopencl._cl import bitlog2  # noqa: F401
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
@@ -73,8 +72,7 @@ from pyopencl._cl import (  # noqa
 _first_arg_dependent_caches = []
 
 
-@decorator
-def first_arg_dependent_memoize(func, cl_object, *args):
+def first_arg_dependent_memoize(func):
     """Provides memoization for a function. Typically used to cache
     things that get created inside a :class:`pyopencl.Context`, e.g. programs
     and kernels. Assumes that the first argument of the decorated function is
@@ -84,21 +82,26 @@ def first_arg_dependent_memoize(func, cl_object, *args):
 
     .. versionadded:: 2011.2
     """
-    try:
-        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
-    except AttributeError:
-        # FIXME: This may keep contexts alive longer than desired.
-        # But I guess since the memory in them is freed, who cares.
-        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
-        _first_arg_dependent_caches.append(ctx_dict)
+    def first_arg_memoized_wrapper(cl_object, *args):
+        try:
+            ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
+        except AttributeError:
+            # FIXME: This may keep contexts alive longer than desired.
+            # But I guess since the memory in them is freed, who cares.
+            ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
+            _first_arg_dependent_caches.append(ctx_dict)
 
-    try:
-        return ctx_dict[cl_object][args]
-    except KeyError:
-        arg_dict = ctx_dict.setdefault(cl_object, {})
-        result = func(cl_object, *args)
-        arg_dict[args] = result
-        return result
+        try:
+            return ctx_dict[cl_object][args]
+        except KeyError:
+            arg_dict = ctx_dict.setdefault(cl_object, {})
+            result = func(cl_object, *args)
+            arg_dict[args] = result
+            return result
+
+    from functools import update_wrapper
+    update_wrapper(first_arg_memoized_wrapper, func)
+    return first_arg_memoized_wrapper
 
 
 context_dependent_memoize = first_arg_dependent_memoize
diff --git a/setup.py b/setup.py
index bf63e71c..ba268a6a 100644
--- a/setup.py
+++ b/setup.py
@@ -247,7 +247,6 @@ def main():
             install_requires=[
                 "numpy",
                 "pytools>=2017.6",
-                "decorator>=3.2.0",
                 "appdirs>=1.4.0",
                 # "Mako>=0.3.6",
                 ],
-- 
GitLab


From 5eb3f3b95f82c8907059ad808c70cf137db89dcd Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 00:01:47 -0600
Subject: [PATCH 24/47] Revert "Rewrite first_arg_dependent_memoize without
 decorator package, drop dep"

This reverts commit 17c4795827e4f820c5286de591f701f5cd329f23.

Turns out decorator performs some magic to support kwargs better than we do.
---
 pyopencl/tools.py | 37 +++++++++++++++++--------------------
 setup.py          |  1 +
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index ff5c7965..5b814685 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -32,6 +32,7 @@ from sys import intern
 # Do not add a pyopencl import here: This will add an import cycle.
 
 import numpy as np
+from decorator import decorator
 from pytools import memoize, memoize_method
 from pyopencl._cl import bitlog2  # noqa: F401
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
@@ -72,7 +73,8 @@ from pyopencl._cl import (  # noqa
 _first_arg_dependent_caches = []
 
 
-def first_arg_dependent_memoize(func):
+@decorator
+def first_arg_dependent_memoize(func, cl_object, *args):
     """Provides memoization for a function. Typically used to cache
     things that get created inside a :class:`pyopencl.Context`, e.g. programs
     and kernels. Assumes that the first argument of the decorated function is
@@ -82,26 +84,21 @@ def first_arg_dependent_memoize(func):
 
     .. versionadded:: 2011.2
     """
-    def first_arg_memoized_wrapper(cl_object, *args):
-        try:
-            ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
-        except AttributeError:
-            # FIXME: This may keep contexts alive longer than desired.
-            # But I guess since the memory in them is freed, who cares.
-            ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
-            _first_arg_dependent_caches.append(ctx_dict)
-
-        try:
-            return ctx_dict[cl_object][args]
-        except KeyError:
-            arg_dict = ctx_dict.setdefault(cl_object, {})
-            result = func(cl_object, *args)
-            arg_dict[args] = result
-            return result
+    try:
+        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic
+    except AttributeError:
+        # FIXME: This may keep contexts alive longer than desired.
+        # But I guess since the memory in them is freed, who cares.
+        ctx_dict = func._pyopencl_first_arg_dep_memoize_dic = {}
+        _first_arg_dependent_caches.append(ctx_dict)
 
-    from functools import update_wrapper
-    update_wrapper(first_arg_memoized_wrapper, func)
-    return first_arg_memoized_wrapper
+    try:
+        return ctx_dict[cl_object][args]
+    except KeyError:
+        arg_dict = ctx_dict.setdefault(cl_object, {})
+        result = func(cl_object, *args)
+        arg_dict[args] = result
+        return result
 
 
 context_dependent_memoize = first_arg_dependent_memoize
diff --git a/setup.py b/setup.py
index ba268a6a..bf63e71c 100644
--- a/setup.py
+++ b/setup.py
@@ -247,6 +247,7 @@ def main():
             install_requires=[
                 "numpy",
                 "pytools>=2017.6",
+                "decorator>=3.2.0",
                 "appdirs>=1.4.0",
                 # "Mako>=0.3.6",
                 ],
-- 
GitLab


From 429ba485ee877f34fe5fe895be1f45ad2a7c1ab1 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 13:05:29 -0600
Subject: [PATCH 25/47] Use positional args in _cl.enqueue_nd_range_kernel in
 invoker to avoid pybind slowness with kwargs

---
 pyopencl/invoker.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index f043ae4b..691a15c0 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -293,10 +293,11 @@ def _generate_enqueue_and_set_args_module(function_name,
     with Indentation(gen):
         gen.extend(body)
 
+        # Using positional args here because pybind is slow with keyword args
         gen("""
             return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                    global_offset, wait_for, g_times_l=g_times_l,
-                    allow_empty_ndrange=allow_empty_ndrange)
+                    global_offset, wait_for, g_times_l,
+                    allow_empty_ndrange)
             """)
 
     # }}}
@@ -316,7 +317,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v20",
+        "pyopencl-invoker-cache-v21",
         key_builder=_NumpyTypesKeyBuilder())
 
 
-- 
GitLab


From b8542b5d6f0bbc58899e7563c0482c6a5b15d851 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 16:17:15 -0600
Subject: [PATCH 26/47] Remove unused generate_buffer_arg_setter

---
 pyopencl/invoker.py | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 691a15c0..c7fe8560 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -51,26 +51,6 @@ del _size_t_char
 # }}}
 
 
-# {{{ individual arg handling
-
-def generate_buffer_arg_setter(gen, arg_idx, buf_var):
-    if _PYPY:
-        # https://github.com/numpy/numpy/issues/5381
-        gen(f"if isinstance({buf_var}, np.generic):")
-        with Indentation(gen):
-            if _PYPY:
-                gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var))
-            else:
-                gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var))
-
-    gen("""
-        self._set_arg_buf({arg_idx}, {buf_var})
-        """
-        .format(arg_idx=arg_idx, buf_var=buf_var))
-
-# }}}
-
-
 # {{{ generic arg handling body
 
 def generate_generic_arg_handling_body(num_args):
-- 
GitLab


From e2ba90140834cea562c55d4d268a213c0756e9c7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 16:18:08 -0600
Subject: [PATCH 27/47] Invoker generator: globalize some py_codegen imports

---
 pyopencl/invoker.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index c7fe8560..3feb7685 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -28,7 +28,7 @@ import numpy as np
 from warnings import warn
 import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
-from pytools.py_codegen import Indentation
+from pytools.py_codegen import Indentation, PythonCodeGenerator
 from pyopencl.tools import _NumpyTypesKeyBuilder
 
 _PYPY = "__pypy__" in sys.builtin_module_names
@@ -54,7 +54,6 @@ del _size_t_char
 # {{{ generic arg handling body
 
 def generate_generic_arg_handling_body(num_args):
-    from pytools.py_codegen import PythonCodeGenerator
     gen = PythonCodeGenerator()
 
     if num_args == 0:
@@ -85,7 +84,6 @@ def generate_specific_arg_handling_body(function_name,
     fp_arg_count = 0
     cl_arg_idx = 0
 
-    from pytools.py_codegen import PythonCodeGenerator
     gen = PythonCodeGenerator()
 
     if not scalar_arg_dtypes:
@@ -189,8 +187,6 @@ def generate_specific_arg_handling_body(function_name,
 # {{{ error handler
 
 def wrap_in_error_handler(body, arg_names):
-    from pytools.py_codegen import PythonCodeGenerator, Indentation
-
     err_gen = PythonCodeGenerator()
 
     def gen_error_handler():
@@ -233,8 +229,6 @@ def _generate_enqueue_and_set_args_module(function_name,
         work_around_arg_count_bug, warn_about_arg_count_bug,
         include_debug_helpers):
 
-    from pytools.py_codegen import PythonCodeGenerator, Indentation
-
     arg_names = ["arg%d" % i for i in range(num_passed_args)]
 
     if scalar_arg_dtypes is None:
-- 
GitLab


From 852e41397ff6e65ab4e1dfbb913b34df0cb0f197 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 16:19:17 -0600
Subject: [PATCH 28/47] Wrapper: switch some C++ args from py::object to
 py::handle

---
 src/wrap_cl.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 20db54ba..7a6f056f 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4371,7 +4371,7 @@ namespace pyopencl
             (m_kernel, arg_index, sizeof(cl_command_queue), &q));
       }
 
-      void set_arg_buf(cl_uint arg_index, py::object py_buffer)
+      void set_arg_buf(cl_uint arg_index, py::handle py_buffer)
       {
         const void *buf;
         PYOPENCL_BUFFER_SIZE_T len;
@@ -4404,7 +4404,7 @@ namespace pyopencl
       }
 #endif
 
-      void set_arg(cl_uint arg_index, py::object arg)
+      void set_arg(cl_uint arg_index, py::handle arg)
       {
         if (arg.ptr() == Py_None)
         {
@@ -4642,10 +4642,10 @@ namespace pyopencl
   event *enqueue_nd_range_kernel(
       command_queue &cq,
       kernel &knl,
-      py::object py_global_work_size,
-      py::object py_local_work_size,
-      py::object py_global_work_offset,
-      py::object py_wait_for,
+      py::handle py_global_work_size,
+      py::handle py_local_work_size,
+      py::handle py_global_work_offset,
+      py::handle py_wait_for,
       bool g_times_l,
       bool allow_empty_ndrange)
   {
-- 
GitLab


From 676e30f54ea4d0e79d8b231751876eb36b098a2e Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 17:44:40 -0600
Subject: [PATCH 29/47] Make creation of KernelWithCustomEnqueue easier to read
 (thanks @kaushikcfd)

---
 pyopencl/__init__.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index c3c9cf3f..775687ce 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -847,7 +847,7 @@ def _add_functionality():
         # }}}
 
         from pyopencl.invoker import generate_enqueue_and_set_args
-        enqueue, set_args = \
+        enqueue, my_set_args = \
                 generate_enqueue_and_set_args(
                         self.function_name,
                         len(scalar_arg_dtypes), self.num_args,
@@ -858,11 +858,10 @@ def _add_functionality():
         # Make ourselves a kernel-specific class, so that we're able to override
         # __call__. Inspired by https://stackoverflow.com/a/38541437
         class KernelWithCustomEnqueue(type(self)):
-            pass
+            __call__ = enqueue
+            set_args = my_set_args
 
         self.__class__ = KernelWithCustomEnqueue
-        KernelWithCustomEnqueue.__call__ = enqueue
-        KernelWithCustomEnqueue.set_args = set_args
 
     def kernel_get_work_group_info(self, param, device):
         cache_key = (param, device.int_ptr)
-- 
GitLab


From 6f6a54a5b028723f580d7b5c3726a4c363dc53c7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 18 Jan 2021 17:52:01 -0600
Subject: [PATCH 30/47] Introduce _set_arg{,_buf}_multi for batched argument
 setting

---
 pyopencl/invoker.py    | 125 ++++++++++++++---------------------------
 src/wrap_cl.hpp        |  54 ++++++++++++++++++
 src/wrap_cl_part_2.cpp |  14 +++++
 3 files changed, 109 insertions(+), 84 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 3feb7685..e452aa95 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -58,13 +58,14 @@ def generate_generic_arg_handling_body(num_args):
 
     if num_args == 0:
         gen("pass")
+    else:
+        gen_arg_indices = list(range(num_args))
+        gen_args = [f"arg{i}" for i in gen_arg_indices]
 
-    for i in range(num_args):
-        gen(f"# process argument {i}")
-        gen("")
-        gen(f"current_arg = {i}")
-        gen(f"self.set_arg({i}, arg{i})")
-        gen("")
+        gen(f"self._set_arg_multi("
+                f"({', '.join(str(i) for i in gen_arg_indices)},), "
+                f"({', '.join(gen_args)},)"
+                ")")
 
     return gen
 
@@ -75,8 +76,7 @@ def generate_generic_arg_handling_body(num_args):
 
 def generate_specific_arg_handling_body(function_name,
         num_cl_args, scalar_arg_dtypes,
-        work_around_arg_count_bug, warn_about_arg_count_bug,
-        include_debug_helpers):
+        work_around_arg_count_bug, warn_about_arg_count_bug):
 
     assert work_around_arg_count_bug is not None
     assert warn_about_arg_count_bug is not None
@@ -89,15 +89,17 @@ def generate_specific_arg_handling_body(function_name,
     if not scalar_arg_dtypes:
         gen("pass")
 
+    gen_arg_indices = []
+    gen_args = []
+    buf_arg_indices = []
+    buf_args = []
+
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
-        gen(f"# process argument {arg_idx}")
-        gen("")
-        if include_debug_helpers:
-            gen(f"current_arg = {arg_idx}")
         arg_var = "arg%d" % arg_idx
 
         if arg_dtype is None:
-            gen(f"self.set_arg({cl_arg_idx}, {arg_var})")
+            gen_arg_indices.append(cl_arg_idx)
+            gen_args.append(arg_var)
             cl_arg_idx += 1
             gen("")
             continue
@@ -105,7 +107,8 @@ def generate_specific_arg_handling_body(function_name,
         arg_dtype = np.dtype(arg_dtype)
 
         if arg_dtype.char == "V":
-            gen(f"self.set_arg({cl_arg_idx}, {arg_var})")
+            gen_arg_indices.append(cl_arg_idx)
+            gen_args.append(arg_var)
             cl_arg_idx += 1
 
         elif arg_dtype.kind == "c":
@@ -126,17 +129,11 @@ def generate_specific_arg_handling_body(function_name,
             if (work_around_arg_count_bug == "pocl"
                     and arg_dtype == np.complex128
                     and fp_arg_count + 2 <= 8):
-                gen(
-                        "buf = pack('{arg_char}', {arg_var}.real)"
-                        .format(arg_char=arg_char, arg_var=arg_var))
-                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
+                buf_arg_indices.append(cl_arg_idx)
+                buf_args.append(f"pack('{arg_char}', {arg_var}.real)")
                 cl_arg_idx += 1
-                if include_debug_helpers:
-                    gen("current_arg = current_arg + 1000")
-                gen(
-                        "buf = pack('{arg_char}', {arg_var}.imag)"
-                        .format(arg_char=arg_char, arg_var=arg_var))
-                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
+                buf_arg_indices.append(cl_arg_idx)
+                buf_args.append(f"pack('{arg_char}', {arg_var}.imag)")
                 cl_arg_idx += 1
 
             elif (work_around_arg_count_bug == "apple"
@@ -148,11 +145,9 @@ def generate_specific_arg_handling_body(function_name,
                         "Cannot pass complex numbers to kernels.")
 
             else:
-                gen(
-                        "buf = pack('{arg_char}{arg_char}', "
-                        "{arg_var}.real, {arg_var}.imag)"
-                        .format(arg_char=arg_char, arg_var=arg_var))
-                gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
+                buf_arg_indices.append(cl_arg_idx)
+                buf_args.append(
+                    f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)")
                 cl_arg_idx += 1
 
             fp_arg_count += 2
@@ -163,16 +158,23 @@ def generate_specific_arg_handling_body(function_name,
 
             arg_char = arg_dtype.char
             arg_char = _type_char_map.get(arg_char, arg_char)
-            gen(
-                    "buf = pack('{arg_char}', {arg_var})"
-                    .format(
-                        arg_char=arg_char,
-                        arg_var=arg_var))
-            gen(f"self._set_arg_buf({cl_arg_idx}, buf)")
+            buf_arg_indices.append(cl_arg_idx)
+            buf_args.append(f"pack('{arg_char}', {arg_var})")
             cl_arg_idx += 1
 
         gen("")
 
+    for arg_kind, indices, args in [
+            ("", gen_arg_indices, gen_args),
+            ("_buf", buf_arg_indices, buf_args)
+            ]:
+        assert len(indices) == len(args)
+        if indices:
+            gen(f"self._set_arg{arg_kind}_multi("
+                    f"({', '.join(str(i) for i in indices)},), "
+                    f"({', '.join(args)},)"
+                    ")")
+
     if cl_arg_idx != num_cl_args:
         raise TypeError(
             "length of argument list (%d) and "
@@ -184,50 +186,10 @@ def generate_specific_arg_handling_body(function_name,
 # }}}
 
 
-# {{{ error handler
-
-def wrap_in_error_handler(body, arg_names):
-    err_gen = PythonCodeGenerator()
-
-    def gen_error_handler():
-        err_gen("""
-            if current_arg is not None:
-                args = [{args}]
-                advice = ""
-                from pyopencl.array import Array
-                if isinstance(args[current_arg], Array):
-                    advice = " (perhaps you meant to pass 'array.data' " \
-                        "instead of the array itself?)"
-
-                raise _cl.LogicError(
-                        "when processing argument #%d (1-based): %s%s"
-                        % (current_arg+1, str(e), advice))
-            else:
-                raise
-            """
-            .format(args=", ".join(arg_names)))
-        err_gen("")
-
-    err_gen("try:")
-    with Indentation(err_gen):
-        err_gen.extend(body)
-    err_gen("except TypeError as e:")
-    with Indentation(err_gen):
-        gen_error_handler()
-    err_gen("except _cl.LogicError as e:")
-    with Indentation(err_gen):
-        gen_error_handler()
-
-    return err_gen
-
-# }}}
-
-
 def _generate_enqueue_and_set_args_module(function_name,
         num_passed_args, num_cl_args,
         scalar_arg_dtypes,
-        work_around_arg_count_bug, warn_about_arg_count_bug,
-        include_debug_helpers):
+        work_around_arg_count_bug, warn_about_arg_count_bug):
 
     arg_names = ["arg%d" % i for i in range(num_passed_args)]
 
@@ -237,11 +199,7 @@ def _generate_enqueue_and_set_args_module(function_name,
         body = generate_specific_arg_handling_body(
                 function_name, num_cl_args, scalar_arg_dtypes,
                 warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug,
-                include_debug_helpers=include_debug_helpers)
-
-    if include_debug_helpers:
-        body = wrap_in_error_handler(body, arg_names)
+                work_around_arg_count_bug=work_around_arg_count_bug)
 
     gen = PythonCodeGenerator()
 
@@ -291,7 +249,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v21",
+        "pyopencl-invoker-cache-v29",
         key_builder=_NumpyTypesKeyBuilder())
 
 
@@ -302,8 +260,7 @@ def generate_enqueue_and_set_args(function_name,
 
     cache_key = (function_name, num_passed_args, num_cl_args,
             scalar_arg_dtypes,
-            work_around_arg_count_bug, warn_about_arg_count_bug,
-            not sys.flags.optimize)
+            work_around_arg_count_bug, warn_about_arg_count_bug)
 
     from_cache = False
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 7a6f056f..3210eb09 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -86,6 +86,7 @@
 
 #endif
 
+#include <functional>
 #include <thread>
 #include <mutex>
 #include <condition_variable>
@@ -4452,6 +4453,59 @@ namespace pyopencl
         set_arg_buf(arg_index, arg);
       }
 
+      static
+      void set_arg_multi(
+          std::function<void(cl_uint, py::handle)> set_arg_func,
+          py::tuple indices,
+          py::tuple args)
+      {
+        // This is an internal interface used by generated invokers.
+        // We can save a tiny bit of time by not checking their work.
+        /*
+        if (indices.size() != args.size())
+            throw error("Kernel.set_arg_multi", CL_INVALID_VALUE,
+                "indices and args arguments do not have the same length");
+        */
+
+        cl_uint arg_index;
+        py::handle arg_value;
+
+        auto indices_it = indices.begin(), args_it = args.begin(),
+          indices_end = indices.end();
+        try
+        {
+          while (indices_it != indices_end)
+          {
+            arg_index = py::cast<cl_uint>(*indices_it++);
+            arg_value = *args_it++;
+            set_arg_func(arg_index, arg_value);
+          }
+        }
+        catch (error &err)
+        {
+          std::string msg(
+              std::string("when processing arg#") + std::to_string(arg_index+1)
+              + std::string(" (1-based): ") + std::string(err.what()));
+
+          auto mod_cl_ary(py::module::import("pyopencl.array"));
+          auto cls_array(mod_cl_ary.attr("Array"));
+          if (arg_value.ptr() && py::isinstance(arg_value, cls_array))
+            msg.append(
+                " (perhaps you meant to pass 'array.data' instead of the array itself?)");
+
+          throw error(err.routine().c_str(), err.code(), msg.c_str());
+        }
+        catch (std::exception &err)
+        {
+          std::string msg(
+              std::string("when processing arg#") + std::to_string(arg_index+1)
+              + std::string(" (1-based): ") + std::string(err.what()));
+
+          throw std::runtime_error(msg.c_str());
+        }
+      }
+
+
       py::object get_info(cl_kernel_info param_name) const
       {
         switch (param_name)
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index e68c7851..3dc46c60 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -470,6 +470,20 @@ void pyopencl_expose_part_2(py::module &m)
 #if PYOPENCL_CL_VERSION >= 0x2000
       .def("_set_arg_svm", &cls::set_arg_svm)
 #endif
+      .def("_set_arg_multi",
+          [](cls &knl, py::tuple indices, py::tuple args)
+          {
+            cls::set_arg_multi(
+                [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); },
+                indices, args);
+          })
+      .def("_set_arg_buf_multi",
+          [](cls &knl, py::tuple indices, py::tuple args)
+          {
+            cls::set_arg_multi(
+                [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); },
+                indices, args);
+          })
       .DEF_SIMPLE_METHOD(set_arg)
 #if PYOPENCL_CL_VERSION >= 0x1020
       .DEF_SIMPLE_METHOD(get_arg_info)
-- 
GitLab


From f80c664fb8fd0a37d94a64bfcef3f9090b7df802 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 12:16:48 -0600
Subject: [PATCH 31/47] Pack indices and args into the same tuple for
 set_arg*multi

---
 pyopencl/invoker.py    | 54 ++++++++++++++++++++----------------------
 src/wrap_cl.hpp        | 12 ++++------
 src/wrap_cl_part_2.cpp |  8 +++----
 3 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index e452aa95..fec2a2ca 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -59,12 +59,13 @@ def generate_generic_arg_handling_body(num_args):
     if num_args == 0:
         gen("pass")
     else:
-        gen_arg_indices = list(range(num_args))
-        gen_args = [f"arg{i}" for i in gen_arg_indices]
+        gen_indices_and_args = []
+        for i in range(num_args):
+            gen_indices_and_args.append(i)
+            gen_indices_and_args.append(f"arg{i}")
 
         gen(f"self._set_arg_multi("
-                f"({', '.join(str(i) for i in gen_arg_indices)},), "
-                f"({', '.join(gen_args)},)"
+                f"({', '.join(str(i) for i in gen_indices_and_args)},), "
                 ")")
 
     return gen
@@ -89,17 +90,15 @@ def generate_specific_arg_handling_body(function_name,
     if not scalar_arg_dtypes:
         gen("pass")
 
-    gen_arg_indices = []
-    gen_args = []
-    buf_arg_indices = []
-    buf_args = []
+    gen_indices_and_args = []
+    buf_args_indices = []
 
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
         arg_var = "arg%d" % arg_idx
 
         if arg_dtype is None:
-            gen_arg_indices.append(cl_arg_idx)
-            gen_args.append(arg_var)
+            gen_indices_and_args.append(cl_arg_idx)
+            gen_indices_and_args.append(arg_var)
             cl_arg_idx += 1
             gen("")
             continue
@@ -107,8 +106,8 @@ def generate_specific_arg_handling_body(function_name,
         arg_dtype = np.dtype(arg_dtype)
 
         if arg_dtype.char == "V":
-            gen_arg_indices.append(cl_arg_idx)
-            gen_args.append(arg_var)
+            gen_indices_and_args.append(cl_arg_idx)
+            gen_indices_and_args.append(arg_var)
             cl_arg_idx += 1
 
         elif arg_dtype.kind == "c":
@@ -129,11 +128,11 @@ def generate_specific_arg_handling_body(function_name,
             if (work_around_arg_count_bug == "pocl"
                     and arg_dtype == np.complex128
                     and fp_arg_count + 2 <= 8):
-                buf_arg_indices.append(cl_arg_idx)
-                buf_args.append(f"pack('{arg_char}', {arg_var}.real)")
+                buf_args_indices.append(cl_arg_idx)
+                buf_args_indices.append(f"pack('{arg_char}', {arg_var}.real)")
                 cl_arg_idx += 1
-                buf_arg_indices.append(cl_arg_idx)
-                buf_args.append(f"pack('{arg_char}', {arg_var}.imag)")
+                buf_args_indices.append(cl_arg_idx)
+                buf_args_indices.append(f"pack('{arg_char}', {arg_var}.imag)")
                 cl_arg_idx += 1
 
             elif (work_around_arg_count_bug == "apple"
@@ -145,8 +144,8 @@ def generate_specific_arg_handling_body(function_name,
                         "Cannot pass complex numbers to kernels.")
 
             else:
-                buf_arg_indices.append(cl_arg_idx)
-                buf_args.append(
+                buf_args_indices.append(cl_arg_idx)
+                buf_args_indices.append(
                     f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)")
                 cl_arg_idx += 1
 
@@ -158,21 +157,20 @@ def generate_specific_arg_handling_body(function_name,
 
             arg_char = arg_dtype.char
             arg_char = _type_char_map.get(arg_char, arg_char)
-            buf_arg_indices.append(cl_arg_idx)
-            buf_args.append(f"pack('{arg_char}', {arg_var})")
+            buf_args_indices.append(cl_arg_idx)
+            buf_args_indices.append(f"pack('{arg_char}', {arg_var})")
             cl_arg_idx += 1
 
         gen("")
 
-    for arg_kind, indices, args in [
-            ("", gen_arg_indices, gen_args),
-            ("_buf", buf_arg_indices, buf_args)
+    for arg_kind, args_and_indices in [
+            ("", gen_indices_and_args),
+            ("_buf", buf_args_indices)
             ]:
-        assert len(indices) == len(args)
-        if indices:
+        assert len(args_and_indices) % 2 == 0
+        if args_and_indices:
             gen(f"self._set_arg{arg_kind}_multi("
-                    f"({', '.join(str(i) for i in indices)},), "
-                    f"({', '.join(args)},)"
+                    f"({', '.join(str(i) for i in args_and_indices)},), "
                     ")")
 
     if cl_arg_idx != num_cl_args:
@@ -249,7 +247,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v29",
+        "pyopencl-invoker-cache-v30",
         key_builder=_NumpyTypesKeyBuilder())
 
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 3210eb09..89afb8d6 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4456,8 +4456,7 @@ namespace pyopencl
       static
       void set_arg_multi(
           std::function<void(cl_uint, py::handle)> set_arg_func,
-          py::tuple indices,
-          py::tuple args)
+          py::tuple args_and_indices)
       {
         // This is an internal interface used by generated invokers.
         // We can save a tiny bit of time by not checking their work.
@@ -4470,14 +4469,13 @@ namespace pyopencl
         cl_uint arg_index;
         py::handle arg_value;
 
-        auto indices_it = indices.begin(), args_it = args.begin(),
-          indices_end = indices.end();
+        auto it = args_and_indices.begin(), end = args_and_indices.end();
         try
         {
-          while (indices_it != indices_end)
+          while (it != end)
           {
-            arg_index = py::cast<cl_uint>(*indices_it++);
-            arg_value = *args_it++;
+            arg_index = py::cast<cl_uint>(*it++);
+            arg_value = *it++;
             set_arg_func(arg_index, arg_value);
           }
         }
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 3dc46c60..d4b60f64 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -471,18 +471,18 @@ void pyopencl_expose_part_2(py::module &m)
       .def("_set_arg_svm", &cls::set_arg_svm)
 #endif
       .def("_set_arg_multi",
-          [](cls &knl, py::tuple indices, py::tuple args)
+          [](cls &knl, py::tuple indices_and_args)
           {
             cls::set_arg_multi(
                 [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); },
-                indices, args);
+                indices_and_args);
           })
       .def("_set_arg_buf_multi",
-          [](cls &knl, py::tuple indices, py::tuple args)
+          [](cls &knl, py::tuple indices_and_args)
           {
             cls::set_arg_multi(
                 [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); },
-                indices, args);
+                indices_and_args);
           })
       .DEF_SIMPLE_METHOD(set_arg)
 #if PYOPENCL_CL_VERSION >= 0x1020
-- 
GitLab


From a80586c8c25d3252b1e98882753a1d1df310e37b Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 18:59:39 -0600
Subject: [PATCH 32/47] Introduce Kernel._set_arg_buf_pack_multi for inlined
 buffer packing+arg setting

---
 pyopencl/invoker.py    |  45 ++++++-----
 src/wrap_cl.hpp        | 164 ++++++++++++++++++++++++++++-------------
 src/wrap_cl_part_2.cpp |  12 ++-
 3 files changed, 149 insertions(+), 72 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index fec2a2ca..52548820 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -22,7 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import sys
 import numpy as np
 
 from warnings import warn
@@ -31,9 +30,6 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from pytools.py_codegen import Indentation, PythonCodeGenerator
 from pyopencl.tools import _NumpyTypesKeyBuilder
 
-_PYPY = "__pypy__" in sys.builtin_module_names
-
-
 # {{{ arg packing helpers
 
 _size_t_char = ({
@@ -75,6 +71,9 @@ def generate_generic_arg_handling_body(num_args):
 
 # {{{ specific arg handling body
 
+BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"]
+
+
 def generate_specific_arg_handling_body(function_name,
         num_cl_args, scalar_arg_dtypes,
         work_around_arg_count_bug, warn_about_arg_count_bug):
@@ -91,7 +90,17 @@ def generate_specific_arg_handling_body(function_name,
         gen("pass")
 
     gen_indices_and_args = []
-    buf_args_indices = []
+    buf_indices_and_args = []
+    buf_pack_indices_and_args = []
+
+    def add_buf_arg(arg_idx, typechar, expr_str):
+        if typechar in BUF_PACK_TYPECHARS:
+            buf_pack_indices_and_args.append(cl_arg_idx)
+            buf_pack_indices_and_args.append(repr(typechar.encode()))
+            buf_pack_indices_and_args.append(expr_str)
+        else:
+            buf_indices_and_args.append(cl_arg_idx)
+            buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")
 
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
         arg_var = "arg%d" % arg_idx
@@ -106,8 +115,8 @@ def generate_specific_arg_handling_body(function_name,
         arg_dtype = np.dtype(arg_dtype)
 
         if arg_dtype.char == "V":
-            gen_indices_and_args.append(cl_arg_idx)
-            gen_indices_and_args.append(arg_var)
+            buf_indices_and_args.append(cl_arg_idx)
+            buf_indices_and_args.append(arg_var)
             cl_arg_idx += 1
 
         elif arg_dtype.kind == "c":
@@ -128,11 +137,9 @@ def generate_specific_arg_handling_body(function_name,
             if (work_around_arg_count_bug == "pocl"
                     and arg_dtype == np.complex128
                     and fp_arg_count + 2 <= 8):
-                buf_args_indices.append(cl_arg_idx)
-                buf_args_indices.append(f"pack('{arg_char}', {arg_var}.real)")
+                add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.real")
                 cl_arg_idx += 1
-                buf_args_indices.append(cl_arg_idx)
-                buf_args_indices.append(f"pack('{arg_char}', {arg_var}.imag)")
+                add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.imag")
                 cl_arg_idx += 1
 
             elif (work_around_arg_count_bug == "apple"
@@ -144,8 +151,8 @@ def generate_specific_arg_handling_body(function_name,
                         "Cannot pass complex numbers to kernels.")
 
             else:
-                buf_args_indices.append(cl_arg_idx)
-                buf_args_indices.append(
+                buf_indices_and_args.append(cl_arg_idx)
+                buf_indices_and_args.append(
                     f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)")
                 cl_arg_idx += 1
 
@@ -157,15 +164,15 @@ def generate_specific_arg_handling_body(function_name,
 
             arg_char = arg_dtype.char
             arg_char = _type_char_map.get(arg_char, arg_char)
-            buf_args_indices.append(cl_arg_idx)
-            buf_args_indices.append(f"pack('{arg_char}', {arg_var})")
+            add_buf_arg(cl_arg_idx, arg_char, arg_var)
             cl_arg_idx += 1
 
         gen("")
 
-    for arg_kind, args_and_indices in [
-            ("", gen_indices_and_args),
-            ("_buf", buf_args_indices)
+    for arg_kind, args_and_indices, entry_length in [
+            ("", gen_indices_and_args, 2),
+            ("_buf", buf_indices_and_args, 2),
+            ("_buf_pack", buf_pack_indices_and_args, 3),
             ]:
         assert len(args_and_indices) % 2 == 0
         if args_and_indices:
@@ -247,7 +254,7 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v30",
+        "pyopencl-invoker-cache-v34",
         key_builder=_NumpyTypesKeyBuilder())
 
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 89afb8d6..58663e0c 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4372,6 +4372,40 @@ namespace pyopencl
             (m_kernel, arg_index, sizeof(cl_command_queue), &q));
       }
 
+      void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj)
+      {
+#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \
+        case TYPECH_VAL: \
+          { \
+            TYPE val = py::cast<TYPE>(obj); \
+            PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \
+            break; \
+          }
+
+        /* This is an internal interface that assumes it gets fed well-formed
+         * data.  No meaningful error checking is being performed on
+         * py_typechar, on purpose.
+         */
+        switch (*PyBytes_AS_STRING(py_typechar.ptr()))
+        {
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float)
+          PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double)
+          default:
+            throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
+                "invalid type char");
+        }
+#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG
+      }
+
       void set_arg_buf(cl_uint arg_index, py::handle py_buffer)
       {
         const void *buf;
@@ -4453,57 +4487,6 @@ namespace pyopencl
         set_arg_buf(arg_index, arg);
       }
 
-      static
-      void set_arg_multi(
-          std::function<void(cl_uint, py::handle)> set_arg_func,
-          py::tuple args_and_indices)
-      {
-        // This is an internal interface used by generated invokers.
-        // We can save a tiny bit of time by not checking their work.
-        /*
-        if (indices.size() != args.size())
-            throw error("Kernel.set_arg_multi", CL_INVALID_VALUE,
-                "indices and args arguments do not have the same length");
-        */
-
-        cl_uint arg_index;
-        py::handle arg_value;
-
-        auto it = args_and_indices.begin(), end = args_and_indices.end();
-        try
-        {
-          while (it != end)
-          {
-            arg_index = py::cast<cl_uint>(*it++);
-            arg_value = *it++;
-            set_arg_func(arg_index, arg_value);
-          }
-        }
-        catch (error &err)
-        {
-          std::string msg(
-              std::string("when processing arg#") + std::to_string(arg_index+1)
-              + std::string(" (1-based): ") + std::string(err.what()));
-
-          auto mod_cl_ary(py::module::import("pyopencl.array"));
-          auto cls_array(mod_cl_ary.attr("Array"));
-          if (arg_value.ptr() && py::isinstance(arg_value, cls_array))
-            msg.append(
-                " (perhaps you meant to pass 'array.data' instead of the array itself?)");
-
-          throw error(err.routine().c_str(), err.code(), msg.c_str());
-        }
-        catch (std::exception &err)
-        {
-          std::string msg(
-              std::string("when processing arg#") + std::to_string(arg_index+1)
-              + std::string(" (1-based): ") + std::string(err.what()));
-
-          throw std::runtime_error(msg.c_str());
-        }
-      }
-
-
       py::object get_info(cl_kernel_info param_name) const
       {
         switch (param_name)
@@ -4668,6 +4651,85 @@ namespace pyopencl
 #endif
   };
 
+#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \
+    catch (error &err) \
+    { \
+      std::string msg( \
+          std::string("when processing arg#") + std::to_string(arg_index+1) \
+          + std::string(" (1-based): ") + std::string(err.what())); \
+      auto mod_cl_ary(py::module::import("pyopencl.array")); \
+      auto cls_array(mod_cl_ary.attr("Array")); \
+      if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \
+        msg.append( \
+            " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \
+      throw error(err.routine().c_str(), err.code(), msg.c_str()); \
+    } \
+    catch (std::exception &err) \
+    { \
+      std::string msg( \
+          std::string("when processing arg#") + std::to_string(arg_index+1) \
+          + std::string(" (1-based): ") + std::string(err.what())); \
+      throw std::runtime_error(msg.c_str()); \
+    }
+
+  inline
+  void set_arg_multi(
+      std::function<void(cl_uint, py::handle)> set_arg_func,
+      py::tuple args_and_indices)
+  {
+    cl_uint arg_index;
+    py::handle arg_value;
+
+    auto it = args_and_indices.begin(), end = args_and_indices.end();
+    try
+    {
+      /* This is an internal interface that assumes it gets fed well-formed
+       * data.  No meaningful error checking is being performed on
+       * off-interval exhaustion of the iterator, on purpose.
+       */
+      while (it != end)
+      {
+        // special value in case integer cast fails
+        arg_index = 9999 - 1;
+
+        arg_index = py::cast<cl_uint>(*it++);
+        arg_value = *it++;
+        set_arg_func(arg_index, arg_value);
+      }
+    }
+    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
+  }
+
+
+  inline
+  void set_arg_multi(
+      std::function<void(cl_uint, py::handle, py::handle)> set_arg_func,
+      py::tuple args_and_indices)
+  {
+    cl_uint arg_index;
+    py::handle arg_descr, arg_value;
+
+    auto it = args_and_indices.begin(), end = args_and_indices.end();
+    try
+    {
+      /* This is an internal interface that assumes it gets fed well-formed
+       * data.  No meaningful error checking is being performed on
+       * off-interval exhaustion of the iterator, on purpose.
+       */
+      while (it != end)
+      {
+        // special value in case integer cast fails
+        arg_index = 9999 - 1;
+
+        arg_index = py::cast<cl_uint>(*it++);
+        arg_descr = *it++;
+        arg_value = *it++;
+        set_arg_func(arg_index, arg_descr, arg_value);
+      }
+    }
+    PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER
+  }
+
 
   inline
   py::list create_kernels_in_program(program &pgm)
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index d4b60f64..205b31ec 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -473,17 +473,25 @@ void pyopencl_expose_part_2(py::module &m)
       .def("_set_arg_multi",
           [](cls &knl, py::tuple indices_and_args)
           {
-            cls::set_arg_multi(
+            set_arg_multi(
                 [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); },
                 indices_and_args);
           })
       .def("_set_arg_buf_multi",
           [](cls &knl, py::tuple indices_and_args)
           {
-            cls::set_arg_multi(
+            set_arg_multi(
                 [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); },
                 indices_and_args);
           })
+      .def("_set_arg_buf_pack_multi",
+          [](cls &knl, py::tuple indices_chars_and_args)
+          {
+            set_arg_multi(
+                [&](cl_uint i, py::handle typechar, py::handle arg)
+                { knl.set_arg_buf_pack(i, typechar, arg); },
+                indices_chars_and_args);
+          })
       .DEF_SIMPLE_METHOD(set_arg)
 #if PYOPENCL_CL_VERSION >= 0x1020
       .DEF_SIMPLE_METHOD(get_arg_info)
-- 
GitLab


From 9ea089aaf7cb6368d5f605b641e67e996f96ecad Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 19:10:50 -0600
Subject: [PATCH 33/47] Tweak examples to emphasize importance of retaining
 kernels

---
 examples/demo.py       | 3 ++-
 examples/demo_array.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/demo.py b/examples/demo.py
index 623660fe..a4a50333 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -23,7 +23,8 @@ __kernel void sum(
 """).build()
 
 res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
-prg.sum(queue, a_np.shape, None, a_g, b_g, res_g)
+knl = prg.sum  # Use this Kernel object for repeated calls
+knl(queue, a_np.shape, None, a_g, b_g, res_g)
 
 res_np = np.empty_like(a_np)
 cl.enqueue_copy(queue, res_np, res_g)
diff --git a/examples/demo_array.py b/examples/demo_array.py
index 41b0f79e..74bb7cfc 100644
--- a/examples/demo_array.py
+++ b/examples/demo_array.py
@@ -22,6 +22,7 @@ prg = cl.Program(ctx, """
     }
     """).build()
 
-prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)
+knl = prg.sum  # Use this Kernel object for repeated calls
+knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)
 
 print(la.norm((dest_dev - (a_dev+b_dev)).get()))
-- 
GitLab


From d6e0e1fbe5f976efa2d6f207bb2f0fb7dfdde519 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 19:15:15 -0600
Subject: [PATCH 34/47] Fix lenght assert on args_and_indices in invoker
 generation

---
 pyopencl/invoker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index 52548820..c8f39f1d 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -174,7 +174,7 @@ def generate_specific_arg_handling_body(function_name,
             ("_buf", buf_indices_and_args, 2),
             ("_buf_pack", buf_pack_indices_and_args, 3),
             ]:
-        assert len(args_and_indices) % 2 == 0
+        assert len(args_and_indices) % entry_length == 0
         if args_and_indices:
             gen(f"self._set_arg{arg_kind}_multi("
                     f"({', '.join(str(i) for i in args_and_indices)},), "
-- 
GitLab


From 60d5f39a5500dfd9b770b15f56b7af3dac83a784 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 19:46:23 -0600
Subject: [PATCH 35/47] Drop absurd non-error-checking for typechar

---
 src/wrap_cl.hpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 58663e0c..f00a6b88 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -4374,6 +4374,13 @@ namespace pyopencl
 
       void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj)
       {
+        std::string typechar_str(py::cast<std::string>(py_typechar));
+        if (typechar_str.size() != 1)
+          throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE,
+              "type char argument must have exactly one character");
+
+        char typechar = typechar_str[0];
+
 #define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \
         case TYPECH_VAL: \
           { \
@@ -4381,12 +4388,7 @@ namespace pyopencl
             PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \
             break; \
           }
-
-        /* This is an internal interface that assumes it gets fed well-formed
-         * data.  No meaningful error checking is being performed on
-         * py_typechar, on purpose.
-         */
-        switch (*PyBytes_AS_STRING(py_typechar.ptr()))
+        switch (typechar)
         {
           PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char)
           PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char)
-- 
GitLab


From 3070eb99c778dba9f372002b5417eac5beb516df Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 19 Jan 2021 23:58:15 -0600
Subject: [PATCH 36/47] Fix scope leakage in add_buf_arg

---
 pyopencl/invoker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index c8f39f1d..ccf423d3 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -95,11 +95,11 @@ def generate_specific_arg_handling_body(function_name,
 
     def add_buf_arg(arg_idx, typechar, expr_str):
         if typechar in BUF_PACK_TYPECHARS:
-            buf_pack_indices_and_args.append(cl_arg_idx)
+            buf_pack_indices_and_args.append(arg_idx)
             buf_pack_indices_and_args.append(repr(typechar.encode()))
             buf_pack_indices_and_args.append(expr_str)
         else:
-            buf_indices_and_args.append(cl_arg_idx)
+            buf_indices_and_args.append(arg_idx)
             buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")
 
     for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
-- 
GitLab


From 89c38431d343f5b1a92dab4b5689c14159f5a865 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 11:54:11 -0600
Subject: [PATCH 37/47] Make VectorArg, ScalarArg comparable and add
 persistent-dict key generation for them

---
 pyopencl/tools.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 5b814685..d78070b0 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -334,10 +334,15 @@ class DtypedArgument(Argument):
                 self.name,
                 self.dtype)
 
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.dtype == other.dtype
+                and self.name == other.name)
+
 
 class VectorArg(DtypedArgument):
     def __init__(self, dtype, name, with_offset=False):
-        DtypedArgument.__init__(self, dtype, name)
+        super().__init__(dtype, name)
         self.with_offset = with_offset
 
     def declarator(self):
@@ -350,6 +355,10 @@ class VectorArg(DtypedArgument):
 
         return result
 
+    def __eq__(self, other):
+        return (super().__eq__(other)
+                and self.with_offset == other.with_offset)
+
 
 class ScalarArg(DtypedArgument):
     def declarator(self):
@@ -1025,6 +1034,11 @@ def is_spirv(s):
 # {{{ numpy key types builder
 
 class _NumpyTypesKeyBuilder(KeyBuilderBase):
+    def update_for_VectorArg(self, key_hash, key):  # noqa: N802
+        self.rec(key_hash, key.dtype)
+        self.update_for_str(key_hash, key.name)
+        self.rec(key_hash, key.with_offset)
+
     def update_for_type(self, key_hash, key):
         if issubclass(key, np.generic):
             self.update_for_str(key_hash, key.__name__)
-- 
GitLab


From 5dee903563ad201a17b70fab51cd0cbb6e1b02cb Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 11:56:32 -0600
Subject: [PATCH 38/47] Incorporate argument-processing bits of kernel_runner
 into generated invoker

---
 pyopencl/__init__.py    | 14 +++++---
 pyopencl/array.py       | 59 +++++-------------------------
 pyopencl/elementwise.py | 19 +++-------
 pyopencl/invoker.py     | 80 ++++++++++++++++++++++++++++++-----------
 pyopencl/tools.py       | 14 ++++++++
 5 files changed, 97 insertions(+), 89 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 775687ce..1d304fd2 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -819,8 +819,8 @@ def _add_functionality():
         self._wg_info_cache = {}
         return self
 
-    def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes):
-        self._scalar_arg_dtypes = tuple(scalar_arg_dtypes)
+    def kernel_set_arg_types(self, arg_types):
+        arg_types = tuple(arg_types)
 
         # {{{ arg counting bug handling
 
@@ -850,8 +850,8 @@ def _add_functionality():
         enqueue, my_set_args = \
                 generate_enqueue_and_set_args(
                         self.function_name,
-                        len(scalar_arg_dtypes), self.num_args,
-                        self._scalar_arg_dtypes,
+                        len(arg_types), self.num_args,
+                        arg_types,
                         warn_about_arg_count_bug=warn_about_arg_count_bug,
                         work_around_arg_count_bug=work_around_arg_count_bug)
 
@@ -903,7 +903,11 @@ def _add_functionality():
     Kernel.__init__ = kernel_init
     Kernel._setup = kernel__setup
     Kernel.get_work_group_info = kernel_get_work_group_info
-    Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes
+
+    # FIXME: Possibly deprecate this version
+    Kernel.set_scalar_arg_dtypes = kernel_set_arg_types
+    Kernel.set_arg_types = kernel_set_arg_types
+
     Kernel.set_args = kernel_set_args
     Kernel.__call__ = kernel_call
     Kernel.capture_call = kernel_capture_call
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 56c055ae..7f65bf6e 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -169,12 +169,6 @@ def elwise_kernel_runner(kernel_getter):
 
         wait_for = kwargs.pop("wait_for", None)
 
-        # wait_for must be a copy, because we modify it in-place below
-        if wait_for is None:
-            wait_for = []
-        else:
-            wait_for = list(wait_for)
-
         knl = kernel_getter(*args, **kwargs)
 
         gs, ls = repr_ary._get_sizes(queue,
@@ -183,31 +177,9 @@ def elwise_kernel_runner(kernel_getter):
                     queue.device))
 
         assert isinstance(repr_ary, Array)
+        args = args + (repr_ary.size,)
 
-        actual_args = []
-        for arg in args:
-            if isinstance(arg, Array):
-                if not arg.flags.forc:
-                    raise RuntimeError("only contiguous arrays may "
-                            "be used as arguments to this operation")
-                actual_args.append(arg.base_data)
-                actual_args.append(arg.offset)
-                wait_for.extend(arg.events)
-
-                if (implicit_queue
-                        and arg.queue is not None
-                        and arg.queue != queue):
-                    from warnings import warn
-
-                    warn("Implicit queue in elementwise operation does not match "
-                            "queue of a provided argument. This will become an "
-                            "error in 2021.",
-                            type=InconsistentOpenCLQueueWarning)
-            else:
-                actual_args.append(arg)
-        actual_args.append(repr_ary.size)
-
-        return knl(queue, gs, ls, *actual_args, wait_for=wait_for)
+        return knl(queue, gs, ls, *args, wait_for=wait_for)
 
     try:
         from functools import update_wrapper
@@ -2403,19 +2375,13 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
 
-        from pytools import flatten
         wait_for_this = (dest_indices.events + src_indices.events
             + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
             + _builtin_sum((o.events for o in out[chunk_slice]), []))
         evt = knl(queue, gs, ls,
-                *([o.data for o in out[chunk_slice]]
-                    + [dest_indices.base_data,
-                        dest_indices.offset,
-                        src_indices.base_data,
-                        src_indices.offset]
-                    + list(flatten(
-                        (i.base_data, i.offset)
-                        for i in arrays[chunk_slice]))
+                *([o for o in out[chunk_slice]]
+                    + [dest_indices, src_indices]
+                    + [i for i in arrays[chunk_slice]]
                     + src_offsets_list[chunk_slice]
                     + [src_indices.size]), wait_for=wait_for_this)
         for o in out[chunk_slice]:
@@ -2486,22 +2452,15 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     cl.kernel_work_group_info.WORK_GROUP_SIZE,
                     queue.device))
 
-        from pytools import flatten
         wait_for_this = (wait_for
             + _builtin_sum((i.events for i in arrays[chunk_slice]), [])
             + _builtin_sum((o.events for o in out[chunk_slice]), []))
         evt = knl(queue, gs, ls,
                 *(
-                    list(flatten(
-                        (o.base_data, o.offset)
-                        for o in out[chunk_slice]))
-                    + [dest_indices.base_data, dest_indices.offset]
-                    + list(flatten(
-                        (i.base_data, i.offset)
-                        for i in arrays[chunk_slice]))
-                    + [use_fill_cla.base_data, use_fill_cla.offset]
-                    + [array_lengths_cla.base_data, array_lengths_cla.offset]
-                    + [dest_indices.size]),
+                    [o for o in out[chunk_slice]]
+                    + [dest_indices]
+                    + [i for i in arrays[chunk_slice]]
+                    + [use_fill_cla, array_lengths_cla, dest_indices.size]),
                 wait_for=wait_for_this)
 
         for o in out[chunk_slice]:
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 039cc7cf..fcb94ee9 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -156,10 +156,10 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         name=name, options=options, preamble=preamble,
         use_range=use_range, loop_prep=loop_prep, **kwargs)
 
-    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
+    from pyopencl.tools import get_arg_list_arg_types
 
     kernel = getattr(prg, name)
-    kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args))
+    kernel.set_scalar_arg_dtypes(get_arg_list_arg_types(parsed_args))
 
     return kernel, parsed_args
 
@@ -260,17 +260,10 @@ class ElementwiseKernel:
         invocation_args = []
         for arg, arg_descr in zip(args, arg_descrs):
             if isinstance(arg_descr, VectorArg):
-                if not arg.flags.forc:
-                    raise RuntimeError("ElementwiseKernel cannot "
-                            "deal with non-contiguous arrays")
-
                 if repr_vec is None:
                     repr_vec = arg
 
-                invocation_args.append(arg.base_data)
-                if arg_descr.with_offset:
-                    invocation_args.append(arg.offset)
-                wait_for.extend(arg.events)
+                invocation_args.append(arg)
             else:
                 invocation_args.append(arg)
 
@@ -317,13 +310,11 @@ class ElementwiseKernel:
 
         if capture_as is not None:
             kernel.set_args(*invocation_args)
-            kernel.capture_call(
+            return kernel.capture_call(
                     capture_as, queue,
                     gs, ls, *invocation_args, wait_for=wait_for)
 
-        kernel.set_args(*invocation_args)
-        return cl.enqueue_nd_range_kernel(queue, kernel,
-                gs, ls, wait_for=wait_for)
+        return kernel(queue, gs, ls, *invocation_args, wait_for=wait_for)
 
 # }}}
 
diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py
index ccf423d3..77f90207 100644
--- a/pyopencl/invoker.py
+++ b/pyopencl/invoker.py
@@ -28,7 +28,8 @@ from warnings import warn
 import pyopencl._cl as _cl
 from pytools.persistent_dict import WriteOncePersistentDict
 from pytools.py_codegen import Indentation, PythonCodeGenerator
-from pyopencl.tools import _NumpyTypesKeyBuilder
+from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg
+
 
 # {{{ arg packing helpers
 
@@ -75,8 +76,9 @@ BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"]
 
 
 def generate_specific_arg_handling_body(function_name,
-        num_cl_args, scalar_arg_dtypes,
-        work_around_arg_count_bug, warn_about_arg_count_bug):
+        num_cl_args, arg_types,
+        work_around_arg_count_bug, warn_about_arg_count_bug,
+        in_enqueue):
 
     assert work_around_arg_count_bug is not None
     assert warn_about_arg_count_bug is not None
@@ -86,7 +88,7 @@ def generate_specific_arg_handling_body(function_name,
 
     gen = PythonCodeGenerator()
 
-    if not scalar_arg_dtypes:
+    if not arg_types:
         gen("pass")
 
     gen_indices_and_args = []
@@ -102,17 +104,53 @@ def generate_specific_arg_handling_body(function_name,
             buf_indices_and_args.append(arg_idx)
             buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")
 
-    for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
+    if in_enqueue and arg_types is not None and \
+            any(isinstance(arg_type, VectorArg) for arg_type in arg_types):
+        # We're about to modify wait_for, make sure it's a copy.
+        gen("""
+            if wait_for is None:
+                wait_for = []
+            else:
+                wait_for = list(wait_for)
+            """)
+        gen("")
+
+    for arg_idx, arg_type in enumerate(arg_types):
         arg_var = "arg%d" % arg_idx
 
-        if arg_dtype is None:
+        if arg_type is None:
             gen_indices_and_args.append(cl_arg_idx)
             gen_indices_and_args.append(arg_var)
             cl_arg_idx += 1
             gen("")
             continue
 
-        arg_dtype = np.dtype(arg_dtype)
+        elif isinstance(arg_type, VectorArg):
+            gen(f"if not {arg_var}.flags.forc:")
+            with Indentation(gen):
+                gen("raise RuntimeError('only contiguous arrays may '")
+                gen("   'be used as arguments to this operation')")
+                gen("")
+
+            if in_enqueue:
+                gen(f"assert {arg_var}.queue is None or {arg_var}.queue == queue, "
+                    "'queues for all arrays must match the queue supplied "
+                    "to enqueue'")
+
+            gen_indices_and_args.append(cl_arg_idx)
+            gen_indices_and_args.append(f"{arg_var}.base_data")
+            cl_arg_idx += 1
+
+            if arg_type.with_offset:
+                add_buf_arg(cl_arg_idx, np.dtype(np.int64).char, f"{arg_var}.offset")
+                cl_arg_idx += 1
+
+            if in_enqueue:
+                gen(f"wait_for.extend({arg_var}.events)")
+
+            continue
+
+        arg_dtype = np.dtype(arg_type)
 
         if arg_dtype.char == "V":
             buf_indices_and_args.append(cl_arg_idx)
@@ -193,18 +231,20 @@ def generate_specific_arg_handling_body(function_name,
 
 def _generate_enqueue_and_set_args_module(function_name,
         num_passed_args, num_cl_args,
-        scalar_arg_dtypes,
+        arg_types,
         work_around_arg_count_bug, warn_about_arg_count_bug):
 
     arg_names = ["arg%d" % i for i in range(num_passed_args)]
 
-    if scalar_arg_dtypes is None:
-        body = generate_generic_arg_handling_body(num_passed_args)
-    else:
-        body = generate_specific_arg_handling_body(
-                function_name, num_cl_args, scalar_arg_dtypes,
-                warn_about_arg_count_bug=warn_about_arg_count_bug,
-                work_around_arg_count_bug=work_around_arg_count_bug)
+    def gen_arg_setting(in_enqueue):
+        if arg_types is None:
+            return generate_generic_arg_handling_body(num_passed_args)
+        else:
+            return generate_specific_arg_handling_body(
+                    function_name, num_cl_args, arg_types,
+                    warn_about_arg_count_bug=warn_about_arg_count_bug,
+                    work_around_arg_count_bug=work_around_arg_count_bug,
+                    in_enqueue=in_enqueue)
 
     gen = PythonCodeGenerator()
 
@@ -228,7 +268,7 @@ def _generate_enqueue_and_set_args_module(function_name,
                         "wait_for=None"])))
 
     with Indentation(gen):
-        gen.extend(body)
+        gen.extend(gen_arg_setting(in_enqueue=True))
 
         # Using positional args here because pybind is slow with keyword args
         gen("""
@@ -246,7 +286,7 @@ def _generate_enqueue_and_set_args_module(function_name,
             % (", ".join(["self"] + arg_names)))
 
     with Indentation(gen):
-        gen.extend(body)
+        gen.extend(gen_arg_setting(in_enqueue=False))
 
     # }}}
 
@@ -254,17 +294,17 @@ def _generate_enqueue_and_set_args_module(function_name,
 
 
 invoker_cache = WriteOncePersistentDict(
-        "pyopencl-invoker-cache-v34",
+        "pyopencl-invoker-cache-v38",
         key_builder=_NumpyTypesKeyBuilder())
 
 
 def generate_enqueue_and_set_args(function_name,
         num_passed_args, num_cl_args,
-        scalar_arg_dtypes,
+        arg_types,
         work_around_arg_count_bug, warn_about_arg_count_bug):
 
     cache_key = (function_name, num_passed_args, num_cl_args,
-            scalar_arg_dtypes,
+            arg_types,
             work_around_arg_count_bug, warn_about_arg_count_bug)
 
     from_cache = False
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index d78070b0..4800b15b 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -411,6 +411,20 @@ def parse_arg_list(arguments, with_offset=False):
     return [parse_single_arg(arg) for arg in arguments]
 
 
+def get_arg_list_arg_types(arg_types):
+    result = []
+
+    for arg_type in arg_types:
+        if isinstance(arg_type, ScalarArg):
+            result.append(arg_type.dtype)
+        elif isinstance(arg_type, VectorArg):
+            result.append(arg_type)
+        else:
+            raise RuntimeError("arg type not understood: %s" % type(arg_type))
+
+    return tuple(result)
+
+
 def get_arg_list_scalar_arg_dtypes(arg_types):
     result = []
 
-- 
GitLab


From f0308dceec423b90ae9e7743cc093f71352cfb8f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 12:01:26 -0600
Subject: [PATCH 39/47] Add POCL Nvidia jobs to Gitlab CI

---
 .gitlab-ci.yml | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index deaf28aa..43149559 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,7 +15,7 @@ Python 3 Intel CPU:
     reports:
       junit: test/pytest.xml
 
-Python 3 Titan X:
+Python 3 Nvidia Titan X:
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=nvi:titan
@@ -31,7 +31,7 @@ Python 3 Titan X:
     reports:
       junit: test/pytest.xml
 
-Python 3 Titan V:
+Python 3 Nvidia Titan V:
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=nvi:titan
@@ -47,7 +47,7 @@ Python 3 Titan V:
     reports:
       junit: test/pytest.xml
 
-Python 3 K40:
+Python 3 Nvidia K40:
   script:
   - export PY_EXE=python3
   - export PYOPENCL_TEST=nvi:k40
@@ -117,16 +117,34 @@ Python 3 POCL CL 1.1:
     reports:
       junit: test/pytest.xml
 
-Python 3 POCL:
+Python 3 POCL K40:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable:pthread
+  - export PYOPENCL_TEST=portable:k40
+  - export EXTRA_INSTALL="pybind11 numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3
+  - pocl
+  - nvidia-k40
+  except:
+  - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
+Python 3 POCL Titan V:
+  script:
+  - export PY_EXE=python3
+  - export PYOPENCL_TEST=portable:titan
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3
   - pocl
+  - nvidia-titan-v
   except:
   - tags
   artifacts:
-- 
GitLab


From 73faf922bf97095fbdb3568ed8df6b04637d01ea Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 12:15:45 -0600
Subject: [PATCH 40/47] Xfail a few tests in test_wrapper on POCL+Nvidia

---
 test/test_wrapper.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index 8aad416e..1068779b 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -46,10 +46,17 @@ else:
     faulthandler.enable()
 
 
-def _skip_if_pocl(plat, up_to_version, msg="unsupported by pocl"):
+def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"):
     if plat.vendor == "The pocl project":
         if up_to_version is None or get_pocl_version(plat) <= up_to_version:
-            pytest.skip(msg)
+            pytest.xfail(msg)
+
+
+def _xfail_if_pocl_gpu(device, what):
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU:
+        pytest.xfail(f"POCL's {what} support don't work right on Nvidia GPUs, "
+                "at least the Titan V, as of pocl 1.6, 2021-01-20")
 
 
 def test_get_info(ctx_factory):
@@ -382,7 +389,7 @@ def test_image_2d(ctx_factory):
     if "Intel" in device.vendor and "31360.31426" in device.version:
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
+    _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
 
     prg = cl.Program(context, """
         __kernel void copy_image(
@@ -454,7 +461,7 @@ def test_image_3d(ctx_factory):
     if device.platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
+    _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP")
 
     prg = cl.Program(context, """
         __kernel void copy_image_plane(
@@ -676,7 +683,7 @@ def test_can_build_and_run_binary(ctx_factory):
 def test_enqueue_barrier_marker(ctx_factory):
     ctx = ctx_factory()
     # Still relevant on pocl 1.0RC1.
-    _skip_if_pocl(
+    _xfail_if_pocl(
             ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier")
 
     queue = cl.CommandQueue(ctx)
@@ -704,7 +711,7 @@ def test_unload_compiler(platform):
             or cl.get_cl_header_version() < (1, 2)):
         from pytest import skip
         skip("clUnloadPlatformCompiler is only available in OpenCL 1.2")
-    _skip_if_pocl(platform, (0, 13), "pocl does not support unloading compiler")
+    _xfail_if_pocl(platform, (0, 13), "pocl does not support unloading compiler")
     if platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("Intel proprietary driver does not support unloading compiler")
@@ -745,7 +752,7 @@ def test_user_event(ctx_factory):
         skip("UserEvent is only available in OpenCL 1.1")
 
     # https://github.com/pocl/pocl/issues/201
-    _skip_if_pocl(ctx.devices[0].platform, (0, 13),
+    _xfail_if_pocl(ctx.devices[0].platform, (0, 13),
             "pocl's user events don't work right")
 
     status = {}
@@ -842,6 +849,8 @@ def test_event_set_callback(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    _xfail_if_pocl_gpu(queue.device, "event callbacks")
+
     if ctx._get_cl_version() < (1, 1):
         pytest.skip("OpenCL 1.1 or newer required for set_callback")
 
@@ -896,6 +905,8 @@ def test_global_offset(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
+    _xfail_if_pocl_gpu(queue.device, "global offset")
+
     prg = cl.Program(context, """
         __kernel void mult(__global float *a)
         { a[get_global_id(0)] *= 2; }
@@ -981,6 +992,8 @@ def test_coarse_grain_svm(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    _xfail_if_pocl_gpu(queue.device, "SVM")
+
     dev = ctx.devices[0]
 
     from pyopencl.characterize import has_coarse_grain_buffer_svm
@@ -1062,6 +1075,8 @@ def test_fine_grain_svm(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    _xfail_if_pocl_gpu(queue.device, "GPU SVM")
+
     from pyopencl.characterize import has_fine_grain_buffer_svm
     from pytest import skip
     if not has_fine_grain_buffer_svm(queue.device):
@@ -1148,6 +1163,8 @@ def test_copy_buffer_rect(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
+    _xfail_if_pocl_gpu(queue.device, "rectangular copies")
+
     arr1 = cl_array.zeros(queue, (2, 3), "f")
     arr2 = cl_array.zeros(queue, (4, 5), "f")
     arr1.fill(1)
-- 
GitLab


From 0abbbaffef8b435bc62a0049a12891bcc9658118 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 12:42:21 -0600
Subject: [PATCH 41/47] Make {Scalar,Vector,Other}Arg hashable

---
 pyopencl/tools.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 4800b15b..3be95ee2 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -339,6 +339,12 @@ class DtypedArgument(Argument):
                 and self.dtype == other.dtype
                 and self.name == other.name)
 
+    def __hash__(self):
+        return (
+                hash(type(self))
+                ^ hash(self.dtype)
+                ^ hash(self.name))
+
 
 class VectorArg(DtypedArgument):
     def __init__(self, dtype, name, with_offset=False):
@@ -359,6 +365,9 @@ class VectorArg(DtypedArgument):
         return (super().__eq__(other)
                 and self.with_offset == other.with_offset)
 
+    def __hash__(self):
+        return super.__hash__() ^ hash(self.with_offset)
+
 
 class ScalarArg(DtypedArgument):
     def declarator(self):
@@ -373,6 +382,17 @@ class OtherArg(Argument):
     def declarator(self):
         return self.decl
 
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.decl == other.decl
+                and self.name == other.name)
+
+    def __hash__(self):
+        return (
+                hash(type(self))
+                ^ hash(self.decl)
+                ^ hash(self.name))
+
 
 def parse_c_arg(c_arg, with_offset=False):
     for aspace in ["__local", "__constant"]:
-- 
GitLab


From 385a8fefde149fe7dce25b2cc28c0c62de1b8781 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 13:47:52 -0600
Subject: [PATCH 42/47] Xfail bitonic argsort on pocl nvidia

---
 test/test_algorithm.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 353af281..676aee37 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -1087,6 +1087,12 @@ def test_bitonic_argsort(ctx_factory, size, dtype):
     ctx = cl.create_some_context()
     queue = cl.CommandQueue(ctx)
 
+    device = queue.device
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU:
+        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
+                "at least the K40, as of pocl 1.6, 2021-01-20")
+
     dev = ctx.devices[0]
     if (dev.platform.name == "Portable Computing Language"
             and sys.platform == "darwin"):
-- 
GitLab


From 50ee7ae805cfffc93222a7a065a73b6430ce45ee Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 15:22:42 -0600
Subject: [PATCH 43/47] Xfail test_array test_get_async on Pocl GPU

---
 test/test_array.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/test_array.py b/test/test_array.py
index b4234a37..2821d923 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1350,6 +1350,12 @@ def test_get_async(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
+    device = queue.device
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU:
+        pytest.xfail("the async get test fails on POCL + Nvidia,"
+                "at least the K40, as of pocl 1.6, 2021-01-20")
+
     a = np.random.rand(10**6).astype(np.dtype("float32"))
     a_gpu = cl_array.to_device(queue, a)
     b = a + a**5 + 1
-- 
GitLab


From 7279388562ab33a7dff99da0477c1ad5eac1b972 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 16:27:31 -0600
Subject: [PATCH 44/47] Xfail ranlux on on POCL + Nvidia

---
 test/test_clrandom.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/test_clrandom.py b/test/test_clrandom.py
index 1ce479b2..1ec53842 100644
--- a/test/test_clrandom.py
+++ b/test/test_clrandom.py
@@ -68,6 +68,13 @@ def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
     size = 10
 
     with cl.CommandQueue(cl_ctx) as queue:
+        device = queue.device
+        if device.platform.vendor == "The pocl project" \
+                and device.type & cl.device_type.GPU \
+                and rng_class is make_ranlux_generator:
+            pytest.xfail("ranlux test fails on POCL + Nvidia,"
+                    "at least the K40, as of pocl 1.6, 2021-01-20")
+
         rng.uniform(queue, size, dtype)
 
         if dtype not in (np.int32, np.int64):
-- 
GitLab


From 67984a7aa0dd8682679badb1c2f53f91ac194a52 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 16:45:08 -0600
Subject: [PATCH 45/47] Xfail rect copies on Pocl GPU

---
 test/test_enqueue_copy.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py
index 55f72d95..402bc8b9 100644
--- a/test/test_enqueue_copy.py
+++ b/test/test_enqueue_copy.py
@@ -48,8 +48,14 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
         # https://github.com/pocl/pocl/issues/353
         pytest.skip("POCL's rectangular copies crash")
 
+    device = queue.device
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU:
+        pytest.xfail("rect copies fail on POCL + Nvidia,"
+                "at least the K40, as of pocl 1.6, 2021-01-20")
+
     if honor_skip and queue.device.platform.name == "Apple":
-        pytest.skip("Apple's CL implementation crashes on this.")
+        pytest.xfail("Apple's CL implementation crashes on this.")
 
     ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
     sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
@@ -138,6 +144,12 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True):
         # https://github.com/pocl/pocl/issues/353
         pytest.skip("POCL's rectangular copies crash")
 
+    device = queue.device
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU:
+        pytest.xfail("rect copies fail on POCL + Nvidia,"
+                "at least the K40, as of pocl 1.6, 2021-01-20")
+
     if honor_skip and queue.device.platform.name == "Apple":
         pytest.skip("Apple's CL implementation crashes on this.")
 
-- 
GitLab


From 2f39f7022f478bb45a6f211f8550a4b275b909f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Wed, 20 Jan 2021 17:25:23 -0600
Subject: [PATCH 46/47] Elementwise capture_as implementation: don't return
 after capture_call

---
 pyopencl/elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index fcb94ee9..dae42b7e 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -310,7 +310,7 @@ class ElementwiseKernel:
 
         if capture_as is not None:
             kernel.set_args(*invocation_args)
-            return kernel.capture_call(
+            kernel.capture_call(
                     capture_as, queue,
                     gs, ls, *invocation_args, wait_for=wait_for)
 
-- 
GitLab


From d280d498d0c4c477ec2851799912b3ecebfa64f9 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 20 Jan 2021 18:13:35 -0600
Subject: [PATCH 47/47] Xfail a few more ranlux tests on pocl+nvidia

---
 test/test_array.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/test_array.py b/test/test_array.py
index 2821d923..899fc238 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -715,6 +715,13 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
+    device = queue.device
+    if device.platform.vendor == "The pocl project" \
+            and device.type & cl.device_type.GPU \
+            and rng_class is RanluxGenerator:
+        pytest.xfail("ranlux test fails on POCL + Nvidia,"
+                "at least the Titan V, as of pocl 1.6, 2021-01-20")
+
     if has_double_support(context.devices[0]):
         dtypes = [np.float32, np.float64]
     else:
@@ -767,6 +774,12 @@ def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
+    if queue.device.platform.vendor == "The pocl project" \
+            and queue.device.type & cl.device_type.GPU \
+            and rng_class is RanluxGenerator:
+        pytest.xfail("ranlux test fails on POCL + Nvidia,"
+                "at least the Titan V, as of pocl 1.6, 2021-01-20")
+
     if rng_class is RanluxGenerator:
         gen = rng_class(queue, 5120)
     else:
-- 
GitLab