diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index deaf28aa64dc2037e0d942fefa2cf9ec125b77b4..431495595c2d0ca4ba65380e5f38b8b6e579dd77 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,7 @@ Python 3 Intel CPU: reports: junit: test/pytest.xml -Python 3 Titan X: +Python 3 Nvidia Titan X: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -31,7 +31,7 @@ Python 3 Titan X: reports: junit: test/pytest.xml -Python 3 Titan V: +Python 3 Nvidia Titan V: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:titan @@ -47,7 +47,7 @@ Python 3 Titan V: reports: junit: test/pytest.xml -Python 3 K40: +Python 3 Nvidia K40: script: - export PY_EXE=python3 - export PYOPENCL_TEST=nvi:k40 @@ -117,16 +117,34 @@ Python 3 POCL CL 1.1: reports: junit: test/pytest.xml -Python 3 POCL: +Python 3 POCL K40: script: - export PY_EXE=python3 - - export PYOPENCL_TEST=portable:pthread + - export PYOPENCL_TEST=portable:k40 + - export EXTRA_INSTALL="pybind11 numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3 + - pocl + - nvidia-k40 + except: + - tags + artifacts: + reports: + junit: test/pytest.xml + +Python 3 POCL Titan V: + script: + - export PY_EXE=python3 + - export PYOPENCL_TEST=portable:titan - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 - pocl + - nvidia-titan-v except: - tags artifacts: diff --git a/examples/demo.py b/examples/demo.py index 623660fee1b20b9ba140504ca594cc648e28bc45..a4a503336e126cf5a392fc32f040166c6d92b939 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -23,7 +23,8 @@ __kernel void sum( """).build() res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes) -prg.sum(queue, a_np.shape, None, a_g, b_g, res_g) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a_np.shape, None, a_g, b_g, res_g) res_np = np.empty_like(a_np) cl.enqueue_copy(queue, res_np, res_g) diff --git a/examples/demo_array.py b/examples/demo_array.py index 41b0f79ef2ccb74a807a8da5aff5eedf6a3bb15f..74bb7cfc6fead21ff0a0bb29266e82541586aa9e 100644 --- a/examples/demo_array.py +++ b/examples/demo_array.py @@ -22,6 +22,7 @@ prg = cl.Program(ctx, """ } """).build() -prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) +knl = prg.sum # Use this Kernel object for repeated calls +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) print(la.norm((dest_dev - (a_dev+b_dev)).get())) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 6e0268dc1f5ba748d85af0586e773cf17182eb38..1d304fd2447425785b99b17adaccf14e832bc3a7 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -819,8 +819,8 @@ def _add_functionality(): self._wg_info_cache = {} return self - def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes): - self._scalar_arg_dtypes = tuple(scalar_arg_dtypes) + def kernel_set_arg_types(self, arg_types): + arg_types = tuple(arg_types) # {{{ arg counting bug handling @@ -847,21 +847,31 @@ def _add_functionality(): # }}} from pyopencl.invoker import generate_enqueue_and_set_args - self._enqueue, self._set_args = generate_enqueue_and_set_args( - self.function_name, - len(scalar_arg_dtypes), self.num_args, - self._scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) + enqueue, my_set_args = \ + 
generate_enqueue_and_set_args( + self.function_name, + len(arg_types), self.num_args, + arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug) + + # Make ourselves a kernel-specific class, so that we're able to override + # __call__. Inspired by https://stackoverflow.com/a/38541437 + class KernelWithCustomEnqueue(type(self)): + __call__ = enqueue + set_args = my_set_args + + self.__class__ = KernelWithCustomEnqueue def kernel_get_work_group_info(self, param, device): + cache_key = (param, device.int_ptr) try: - return self._wg_info_cache[param, device] + return self._wg_info_cache[cache_key] except KeyError: pass result = kernel_old_get_work_group_info(self, param, device) - self._wg_info_cache[param, device] = result + self._wg_info_cache[cache_key] = result return result def kernel_set_args(self, *args, **kwargs): @@ -871,6 +881,9 @@ def _add_functionality(): def kernel_call(self, queue, global_size, local_size, *args, **kwargs): # __call__ can't be overridden directly, so we need this # trampoline hack. + + # Note: This is only used for the generic __call__, before + # kernel_set_scalar_arg_dtypes is called. return self._enqueue(self, queue, global_size, local_size, *args, **kwargs) def kernel_capture_call(self, filename, queue, global_size, local_size, @@ -890,7 +903,11 @@ def _add_functionality(): Kernel.__init__ = kernel_init Kernel._setup = kernel__setup Kernel.get_work_group_info = kernel_get_work_group_info - Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes + + # FIXME: Possibly deprecate this version + Kernel.set_scalar_arg_dtypes = kernel_set_arg_types + Kernel.set_arg_types = kernel_set_arg_types + Kernel.set_args = kernel_set_args Kernel.__call__ = kernel_call Kernel.capture_call = kernel_capture_call diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index c4eb43eaef3b144e8b6217551a1f44ccabf2de3c..446eb9c318d8d241d0ef6fa07fb80eeea8fb57cd 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -1188,8 +1188,8 @@ class ListOfListsBuilder: gsize = (4*queue.device.max_compute_units,) lsize = (1,) else: - from pyopencl.array import splay - gsize, lsize = splay(queue, n_objects) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, n_objects) count_event = count_kernel(queue, gsize, lsize, *(tuple(count_list_args) + data_args + (n_objects,)), diff --git a/pyopencl/array.py b/pyopencl/array.py index da5fdf0c8da6341b95f345cf0feb784b601f1f0d..e5cdda02609923c50cd8d2c8a6d5f4d33d7cbdbe 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -33,7 +33,6 @@ from functools import reduce import numpy as np import pyopencl.elementwise as elementwise import pyopencl as cl -from pytools import memoize_method from pyopencl.compyte.array import ( as_strided as _as_strided, f_contiguous_strides as _f_contiguous_strides, @@ -45,9 +44,38 @@ from pyopencl.characterize import has_double_support from pyopencl import cltypes +_COMMON_DTYPE_CACHE = {} + + def _get_common_dtype(obj1, obj2, queue): - return _get_common_dtype_base(obj1, obj2, - has_double_support(queue.device)) + dsupport = has_double_support(queue.device) + cache_key = None + o1_dtype = obj1.dtype + try: + cache_key = (o1_dtype, obj2.dtype, dsupport) + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + except AttributeError: + # obj2 doesn't have a dtype + try: + tobj2 = type(obj2) + cache_key = (o1_dtype, tobj2, dsupport) + + # Integers are weird, sized, and signed. 
Don't pretend that 'int' + # is enough information to decide what should happen. + if tobj2 != int: + return _COMMON_DTYPE_CACHE[cache_key] + except KeyError: + pass + + result = _get_common_dtype_base(obj1, obj2, dsupport) + + # we succeeded in constructing the cache key + if cache_key is not None: + _COMMON_DTYPE_CACHE[cache_key] = result + + return result def _get_truedivide_dtype(obj1, obj2, queue): @@ -69,19 +97,6 @@ def _get_truedivide_dtype(obj1, obj2, queue): return result -# Work around PyPy not currently supporting the object dtype. -# (Yes, it doesn't even support checking!) -# (as of May 27, 2014 on PyPy 2.3) -try: - np.dtype(object) - - def _dtype_is_object(t): - return t == object -except Exception: - def _dtype_is_object(t): - return False - - class InconsistentOpenCLQueueWarning(UserWarning): pass @@ -103,19 +118,18 @@ class VecLookupWarner: vec = VecLookupWarner() -# {{{ helper functionality +# {{{ helper functionality -def splay(queue, n, kernel_specific_max_wg_size=None): - dev = queue.device - max_work_items = _builtin_min(128, dev.max_work_group_size) +def _splay(device, n, kernel_specific_max_wg_size=None): + max_work_items = _builtin_min(128, device.max_work_group_size) if kernel_specific_max_wg_size is not None: from builtins import min max_work_items = min(max_work_items, kernel_specific_max_wg_size) min_work_items = _builtin_min(32, max_work_items) - max_groups = dev.max_compute_units * 4 * 8 + max_groups = device.max_compute_units * 4 * 8 # 4 to overfill the device # 8 is an Nvidia constant--that's how many # groups fit onto one compute device @@ -159,49 +173,21 @@ def elwise_kernel_runner(kernel_getter): wait_for = kwargs.pop("wait_for", None) - # wait_for must be a copy, because we modify it in-place below - if wait_for is None: - wait_for = [] - else: - wait_for = list(wait_for) - knl = kernel_getter(*args, **kwargs) - gs, ls = repr_ary.get_sizes(queue, + gs, ls = repr_ary._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) assert isinstance(repr_ary, Array) - - actual_args = [] - for arg in args: - if isinstance(arg, Array): - if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - actual_args.append(arg.base_data) - actual_args.append(arg.offset) - wait_for.extend(arg.events) - - if (implicit_queue - and arg.queue is not None - and arg.queue != queue): - from warnings import warn - - warn("Implicit queue in elementwise operation does not match " - "queue of a provided argument. 
This will become an " - "error in 2021.", - type=InconsistentOpenCLQueueWarning) - else: - actual_args.append(arg) - actual_args.append(repr_ary.size) + args = args + (repr_ary.size,) if ARRAY_KERNEL_EXEC_HOOK is not None: return ARRAY_KERNEL_EXEC_HOOK( # pylint: disable=not-callable - knl, queue, gs, ls, *actual_args, wait_for=wait_for) + knl, queue, gs, ls, *args, wait_for=wait_for) else: - return knl(queue, gs, ls, *actual_args, wait_for=wait_for) + return knl(queue, gs, ls, *args, wait_for=wait_for) try: from functools import update_wrapper @@ -220,15 +206,6 @@ class DefaultAllocator(cl.tools.DeferredAllocator): DeprecationWarning, 2) cl.tools.DeferredAllocator.__init__(self, *args, **kwargs) - -def _make_strides(itemsize, shape, order): - if order in "fF": - return _f_contiguous_strides(itemsize, shape) - elif order in "cC": - return _c_contiguous_strides(itemsize, shape) - else: - raise ValueError("invalid order: %s" % order) - # }}} @@ -249,6 +226,9 @@ class _copy_queue: # noqa pass +_ARRAY_GET_SIZES_CACHE = {} + + class Array: """A :class:`numpy.ndarray` work-alike that stores its data and performs its computations on the compute device. *shape* and *dtype* work exactly @@ -435,7 +415,7 @@ class Array: __array_priority__ = 100 def __init__(self, cq, shape, dtype, order="C", allocator=None, - data=None, offset=0, strides=None, events=None): + data=None, offset=0, strides=None, events=None, _flags=None): # {{{ backward compatibility if isinstance(cq, cl.CommandQueue): @@ -479,6 +459,9 @@ class Array: size = 1 for dim in shape: size *= dim + if dim < 0: + raise ValueError("negative dimensions are not allowed") + except TypeError: admissible_types = (int, np.integer) @@ -486,16 +469,27 @@ class Array: raise TypeError("shape must either be iterable or " "castable to an integer") size = shape + if shape < 0: + raise ValueError("negative dimensions are not allowed") shape = (shape,) - if any(dim < 0 for dim in shape): - raise ValueError("negative dimensions are not allowed") - if isinstance(size, np.integer): size = size.item() if strides is None: - strides = _make_strides(dtype.itemsize, shape, order) + if order in "cC": + # inlined from compyte.array.c_contiguous_strides + if shape: + strides = [dtype.itemsize] + for s in shape[:0:-1]: + strides.append(strides[-1]*s) + strides = tuple(strides[::-1]) + else: + strides = () + elif order in "fF": + strides = _f_contiguous_strides(dtype.itemsize, shape) + else: + raise ValueError("invalid order: %s" % order) else: # FIXME: We should possibly perform some plausibility @@ -505,9 +499,8 @@ class Array: # }}} - if _dtype_is_object(dtype): - raise TypeError("object arrays on the compute device are not allowed") - + assert dtype != np.object, \ + "object arrays on the compute device are not allowed" assert isinstance(shape, tuple) assert isinstance(strides, tuple) @@ -547,6 +540,7 @@ class Array: self.offset = offset self.context = context + self._flags = _flags @property def ndim(self): @@ -560,9 +554,11 @@ class Array: return self.base_data @property - @memoize_method def flags(self): - return _ArrayFlags(self) + f = self._flags + if f is None: + self._flags = f = _ArrayFlags(self) + return f def _new_with_changes(self, data, offset, shape=None, dtype=None, strides=None, queue=_copy_queue, allocator=None): @@ -612,12 +608,17 @@ class Array: return self._new_with_changes(self.base_data, self.offset, queue=queue) - #@memoize_method FIXME: reenable - def get_sizes(self, queue, kernel_specific_max_wg_size=None): + def _get_sizes(self, queue, 
kernel_specific_max_wg_size=None): if not self.flags.forc: raise NotImplementedError("cannot operate on non-contiguous array") - return splay(queue, self.size, - kernel_specific_max_wg_size=kernel_specific_max_wg_size) + cache_key = (queue.device.int_ptr, self.size, kernel_specific_max_wg_size) + try: + return _ARRAY_GET_SIZES_CACHE[cache_key] + except KeyError: + sizes = _splay(queue.device, self.size, + kernel_specific_max_wg_size=kernel_specific_max_wg_size) + _ARRAY_GET_SIZES_CACHE[cache_key] = sizes + return sizes def set(self, ary, queue=None, async_=None, **kwargs): """Transfer the contents the :class:`numpy.ndarray` object *ary* @@ -963,19 +964,21 @@ class Array: def _new_like_me(self, dtype=None, queue=None): strides = None + flags = None if dtype is None: dtype = self.dtype if dtype == self.dtype: strides = self.strides + flags = self.flags queue = queue or self.queue if queue is not None: return self.__class__(queue, self.shape, dtype, - allocator=self.allocator, strides=strides) + allocator=self.allocator, strides=strides, _flags=flags) else: return self.__class__(self.context, self.shape, dtype, - strides=strides, allocator=self.allocator) + strides=strides, allocator=self.allocator, _flags=flags) @staticmethod @elwise_kernel_runner @@ -2081,7 +2084,7 @@ def to_device(queue, ary, allocator=None, async_=None, # }}} - if _dtype_is_object(ary.dtype): + if ary.dtype == np.object: raise RuntimeError("to_device does not work on object arrays.") if array_queue is _same_as_transfer: @@ -2297,7 +2300,7 @@ def multi_take(arrays, indices, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = indices.get_sizes(queue, + gs, ls = indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) @@ -2375,24 +2378,18 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = src_indices.get_sizes(queue, + gs, ls = src_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (dest_indices.events + src_indices.events + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, - *([o.data for o in out[chunk_slice]] - + [dest_indices.base_data, - dest_indices.offset, - src_indices.base_data, - src_indices.offset] - + list(flatten( - (i.base_data, i.offset) - for i in arrays[chunk_slice])) + *([o for o in out[chunk_slice]] + + [dest_indices, src_indices] + + [i for i in arrays[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: @@ -2458,27 +2455,20 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = dest_indices.get_sizes(queue, + gs, ls = dest_indices._get_sizes(queue, knl.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) - from pytools import flatten wait_for_this = (wait_for + _builtin_sum((i.events for i in arrays[chunk_slice]), []) + _builtin_sum((o.events for o in out[chunk_slice]), [])) evt = knl(queue, gs, ls, *( - list(flatten( - (o.base_data, o.offset) - for o in out[chunk_slice])) - + [dest_indices.base_data, dest_indices.offset] - + list(flatten( - (i.base_data, i.offset) 
- for i in arrays[chunk_slice])) - + [use_fill_cla.base_data, use_fill_cla.offset] - + [array_lengths_cla.base_data, array_lengths_cla.offset] - + [dest_indices.size]), + [o for o in out[chunk_slice]] + + [dest_indices] + + [i for i in arrays[chunk_slice]] + + [use_fill_cla, array_lengths_cla, dest_indices.size]), wait_for=wait_for_this) for o in out[chunk_slice]: diff --git a/pyopencl/clrandom.py b/pyopencl/clrandom.py index ea3862bc4d999e593e1dda9ad8d549423ffeb7b2..dd6c1276cffe2effb53255e1c2d8bc02ec24a3ac 100644 --- a/pyopencl/clrandom.py +++ b/pyopencl/clrandom.py @@ -662,8 +662,8 @@ class Random123GeneratorBase: scale, shift] n = ary.size - from pyopencl.array import splay - gsize, lsize = splay(queue, ary.size) + from pyopencl.array import _splay + gsize, lsize = _splay(queue.device, ary.size) evt = knl(queue, gsize, lsize, *args) ary.add_event(evt) diff --git a/pyopencl/compyte b/pyopencl/compyte index fbfe788a2dcb190fd241fd42ad047e33bafd85b8..7533db88124045924a47d7392eaf9a078670fc4d 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit fbfe788a2dcb190fd241fd42ad047e33bafd85b8 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index df364eda3c883d378c1e9d25136d8f59f5763f9d..dae42b7e70883c18f88f39461f925ae124926d82 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -156,10 +156,10 @@ def get_elwise_kernel_and_types(context, arguments, operation, name=name, options=options, preamble=preamble, use_range=use_range, loop_prep=loop_prep, **kwargs) - from pyopencl.tools import get_arg_list_scalar_arg_dtypes + from pyopencl.tools import get_arg_list_arg_types kernel = getattr(prg, name) - kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) + kernel.set_scalar_arg_dtypes(get_arg_list_arg_types(parsed_args)) return kernel, parsed_args @@ -260,17 +260,10 @@ class ElementwiseKernel: invocation_args = [] for arg, arg_descr in zip(args, arg_descrs): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("ElementwiseKernel cannot " - "deal with non-contiguous arrays") - if repr_vec is None: repr_vec = arg - invocation_args.append(arg.base_data) - if arg_descr.with_offset: - invocation_args.append(arg.offset) - wait_for.extend(arg.events) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -307,13 +300,13 @@ class ElementwiseKernel: invocation_args.append(step) - from pyopencl.array import splay - gs, ls = splay(queue, + from pyopencl.array import _splay + gs, ls = _splay(queue.device, abs(range_.stop - start)//step, max_wg_size) else: invocation_args.append(repr_vec.size) - gs, ls = repr_vec.get_sizes(queue, max_wg_size) + gs, ls = repr_vec._get_sizes(queue, max_wg_size) if capture_as is not None: kernel.set_args(*invocation_args) @@ -321,9 +314,7 @@ class ElementwiseKernel: capture_as, queue, gs, ls, *invocation_args, wait_for=wait_for) - kernel.set_args(*invocation_args) - return cl.enqueue_nd_range_kernel(queue, kernel, - gs, ls, wait_for=wait_for) + return kernel(queue, gs, ls, *invocation_args, wait_for=wait_for) # }}} diff --git a/pyopencl/invoker.py b/pyopencl/invoker.py index c996768d97d8f9a3e58a99e4839db5f37143128e..77f902071296049d7286cf3996571ce08004f208 100644 --- a/pyopencl/invoker.py +++ b/pyopencl/invoker.py @@ -22,15 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import sys import numpy as np from warnings import warn import pyopencl._cl as _cl from pytools.persistent_dict import WriteOncePersistentDict -from pyopencl.tools import _NumpyTypesKeyBuilder - -_PYPY = "__pypy__" in sys.builtin_module_names +from pytools.py_codegen import Indentation, PythonCodeGenerator +from pyopencl.tools import _NumpyTypesKeyBuilder, VectorArg # {{{ arg packing helpers @@ -50,66 +48,22 @@ del _size_t_char # }}} -# {{{ individual arg handling - -def generate_buffer_arg_setter(gen, arg_idx, buf_var): - from pytools.py_codegen import Indentation - - if _PYPY: - # https://github.com/numpy/numpy/issues/5381 - gen(f"if isinstance({buf_var}, np.generic):") - with Indentation(gen): - if _PYPY: - gen("{buf_var} = np.asarray({buf_var})".format(buf_var=buf_var)) - else: - gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var)) - - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_bytes_arg_setter(gen, arg_idx, buf_var): - gen(""" - self._set_arg_buf({arg_idx}, {buf_var}) - """ - .format(arg_idx=arg_idx, buf_var=buf_var)) - - -def generate_generic_arg_handler(gen, arg_idx, arg_var): - from pytools.py_codegen import Indentation - - gen(""" - if {arg_var} is None: - self._set_arg_null({arg_idx}) - elif isinstance({arg_var}, _KERNEL_ARG_CLASSES): - self.set_arg({arg_idx}, {arg_var}) - """ - .format(arg_idx=arg_idx, arg_var=arg_var)) - - gen("else:") - with Indentation(gen): - generate_buffer_arg_setter(gen, arg_idx, arg_var) - -# }}} - - # {{{ generic arg handling body def generate_generic_arg_handling_body(num_args): - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() if num_args == 0: gen("pass") + else: + gen_indices_and_args = [] + for i in range(num_args): + gen_indices_and_args.append(i) + gen_indices_and_args.append(f"arg{i}") - for i in range(num_args): - gen(f"# process argument {i}") - gen("") - gen(f"current_arg = {i}") - generate_generic_arg_handler(gen, i, "arg%d" % i) - gen("") + gen(f"self._set_arg_multi(" + f"({', '.join(str(i) for i in gen_indices_and_args)},), " + ")") return gen @@ -118,9 +72,13 @@ def generate_generic_arg_handling_body(num_args): # {{{ specific arg handling body +BUF_PACK_TYPECHARS = ["c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d"] + + def generate_specific_arg_handling_body(function_name, - num_cl_args, scalar_arg_dtypes, - work_around_arg_count_bug, warn_about_arg_count_bug): + num_cl_args, arg_types, + work_around_arg_count_bug, warn_about_arg_count_bug, + in_enqueue): assert work_around_arg_count_bug is not None assert warn_about_arg_count_bug is not None @@ -128,28 +86,75 @@ def generate_specific_arg_handling_body(function_name, fp_arg_count = 0 cl_arg_idx = 0 - from pytools.py_codegen import PythonCodeGenerator gen = PythonCodeGenerator() - if not scalar_arg_dtypes: + if not arg_types: gen("pass") - for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): - gen(f"# process argument {arg_idx}") + gen_indices_and_args = [] + buf_indices_and_args = [] + buf_pack_indices_and_args = [] + + def add_buf_arg(arg_idx, typechar, expr_str): + if typechar in BUF_PACK_TYPECHARS: + buf_pack_indices_and_args.append(arg_idx) + buf_pack_indices_and_args.append(repr(typechar.encode())) + buf_pack_indices_and_args.append(expr_str) + else: + buf_indices_and_args.append(arg_idx) + buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") + + if in_enqueue and arg_types is not None and \ + any(isinstance(arg_type, VectorArg) for arg_type 
in arg_types): + # We're about to modify wait_for, make sure it's a copy. + gen(""" + if wait_for is None: + wait_for = [] + else: + wait_for = list(wait_for) + """) gen("") - gen(f"current_arg = {arg_idx}") + + for arg_idx, arg_type in enumerate(arg_types): arg_var = "arg%d" % arg_idx - if arg_dtype is None: - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + if arg_type is None: + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(arg_var) cl_arg_idx += 1 gen("") continue - arg_dtype = np.dtype(arg_dtype) + elif isinstance(arg_type, VectorArg): + gen(f"if not {arg_var}.flags.forc:") + with Indentation(gen): + gen("raise RuntimeError('only contiguous arrays may '") + gen(" 'be used as arguments to this operation')") + gen("") + + if in_enqueue: + gen(f"assert {arg_var}.queue is None or {arg_var}.queue == queue, " + "'queues for all arrays must match the queue supplied " + "to enqueue'") + + gen_indices_and_args.append(cl_arg_idx) + gen_indices_and_args.append(f"{arg_var}.base_data") + cl_arg_idx += 1 + + if arg_type.with_offset: + add_buf_arg(cl_arg_idx, np.dtype(np.int64).char, f"{arg_var}.offset") + cl_arg_idx += 1 + + if in_enqueue: + gen(f"wait_for.extend({arg_var}.events)") + + continue + + arg_dtype = np.dtype(arg_type) if arg_dtype.char == "V": - generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append(arg_var) cl_arg_idx += 1 elif arg_dtype.kind == "c": @@ -170,16 +175,9 @@ def generate_specific_arg_handling_body(function_name, if (work_around_arg_count_bug == "pocl" and arg_dtype == np.complex128 and fp_arg_count + 2 <= 8): - gen( - "buf = pack('{arg_char}', {arg_var}.real)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.real") cl_arg_idx += 1 - gen("current_arg = current_arg + 1000") - gen( - "buf = pack('{arg_char}', {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, f"{arg_var}.imag") cl_arg_idx += 1 elif (work_around_arg_count_bug == "apple" @@ -191,11 +189,9 @@ def generate_specific_arg_handling_body(function_name, "Cannot pass complex numbers to kernels.") else: - gen( - "buf = pack('{arg_char}{arg_char}', " - "{arg_var}.real, {arg_var}.imag)" - .format(arg_char=arg_char, arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + buf_indices_and_args.append(cl_arg_idx) + buf_indices_and_args.append( + f"pack('{arg_char}{arg_char}', {arg_var}.real, {arg_var}.imag)") cl_arg_idx += 1 fp_arg_count += 2 @@ -206,16 +202,22 @@ def generate_specific_arg_handling_body(function_name, arg_char = arg_dtype.char arg_char = _type_char_map.get(arg_char, arg_char) - gen( - "buf = pack('{arg_char}', {arg_var})" - .format( - arg_char=arg_char, - arg_var=arg_var)) - generate_bytes_arg_setter(gen, cl_arg_idx, "buf") + add_buf_arg(cl_arg_idx, arg_char, arg_var) cl_arg_idx += 1 gen("") + for arg_kind, args_and_indices, entry_length in [ + ("", gen_indices_and_args, 2), + ("_buf", buf_indices_and_args, 2), + ("_buf_pack", buf_pack_indices_and_args, 3), + ]: + assert len(args_and_indices) % entry_length == 0 + if args_and_indices: + gen(f"self._set_arg{arg_kind}_multi(" + f"({', '.join(str(i) for i in args_and_indices)},), " + ")") + if cl_arg_idx != num_cl_args: raise TypeError( "length of argument list (%d) and " @@ -227,77 +229,29 @@ def generate_specific_arg_handling_body(function_name, # 
}}} -# {{{ error handler - -def wrap_in_error_handler(body, arg_names): - from pytools.py_codegen import PythonCodeGenerator, Indentation - - err_gen = PythonCodeGenerator() - - def gen_error_handler(): - err_gen(""" - if current_arg is not None: - args = [{args}] - advice = "" - from pyopencl.array import Array - if isinstance(args[current_arg], Array): - advice = " (perhaps you meant to pass 'array.data' " \ - "instead of the array itself?)" - - raise _cl.LogicError( - "when processing argument #%d (1-based): %s%s" - % (current_arg+1, str(e), advice)) - else: - raise - """ - .format(args=", ".join(arg_names))) - err_gen("") - - err_gen("try:") - with Indentation(err_gen): - err_gen.extend(body) - err_gen("except TypeError as e:") - with Indentation(err_gen): - gen_error_handler() - err_gen("except _cl.LogicError as e:") - with Indentation(err_gen): - gen_error_handler() - - return err_gen - -# }}} - - -def add_local_imports(gen): - gen("import numpy as np") - gen("import pyopencl._cl as _cl") - gen("from pyopencl import _KERNEL_ARG_CLASSES") - gen("") - - def _generate_enqueue_and_set_args_module(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): - from pytools.py_codegen import PythonCodeGenerator, Indentation - arg_names = ["arg%d" % i for i in range(num_passed_args)] - if scalar_arg_dtypes is None: - body = generate_generic_arg_handling_body(num_passed_args) - else: - body = generate_specific_arg_handling_body( - function_name, num_cl_args, scalar_arg_dtypes, - warn_about_arg_count_bug=warn_about_arg_count_bug, - work_around_arg_count_bug=work_around_arg_count_bug) - - err_handler = wrap_in_error_handler(body, arg_names) + def gen_arg_setting(in_enqueue): + if arg_types is None: + return generate_generic_arg_handling_body(num_passed_args) + else: + return generate_specific_arg_handling_body( + function_name, num_cl_args, arg_types, + warn_about_arg_count_bug=warn_about_arg_count_bug, + work_around_arg_count_bug=work_around_arg_count_bug, + in_enqueue=in_enqueue) gen = PythonCodeGenerator() gen("from struct import pack") gen("from pyopencl import status_code") + gen("import numpy as np") + gen("import pyopencl._cl as _cl") gen("") # {{{ generate _enqueue @@ -314,13 +268,13 @@ def _generate_enqueue_and_set_args_module(function_name, "wait_for=None"]))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=True)) + # Using positional args here because pybind is slow with keyword args gen(""" return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l, - allow_empty_ndrange=allow_empty_ndrange) + global_offset, wait_for, g_times_l, + allow_empty_ndrange) """) # }}} @@ -332,8 +286,7 @@ def _generate_enqueue_and_set_args_module(function_name, % (", ".join(["self"] + arg_names))) with Indentation(gen): - add_local_imports(gen) - gen.extend(err_handler) + gen.extend(gen_arg_setting(in_enqueue=False)) # }}} @@ -341,17 +294,17 @@ def _generate_enqueue_and_set_args_module(function_name, invoker_cache = WriteOncePersistentDict( - "pyopencl-invoker-cache-v7", + "pyopencl-invoker-cache-v38", key_builder=_NumpyTypesKeyBuilder()) def generate_enqueue_and_set_args(function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, work_around_arg_count_bug, warn_about_arg_count_bug): cache_key = (function_name, num_passed_args, num_cl_args, - scalar_arg_dtypes, + arg_types, 
work_around_arg_count_bug, warn_about_arg_count_bug) from_cache = False diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 5b8146858119e233338b8e23d4414f1005e3cdda..3be95ee2c736da374011002f2076ecfa172d9d35 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -334,10 +334,21 @@ class DtypedArgument(Argument): self.name, self.dtype) + def __eq__(self, other): + return (type(self) == type(other) + and self.dtype == other.dtype + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.dtype) + ^ hash(self.name)) + class VectorArg(DtypedArgument): def __init__(self, dtype, name, with_offset=False): - DtypedArgument.__init__(self, dtype, name) + super().__init__(dtype, name) self.with_offset = with_offset def declarator(self): @@ -350,6 +361,13 @@ class VectorArg(DtypedArgument): return result + def __eq__(self, other): + return (super().__eq__(other) + and self.with_offset == other.with_offset) + + def __hash__(self): + return super.__hash__() ^ hash(self.with_offset) + class ScalarArg(DtypedArgument): def declarator(self): @@ -364,6 +382,17 @@ class OtherArg(Argument): def declarator(self): return self.decl + def __eq__(self, other): + return (type(self) == type(other) + and self.decl == other.decl + and self.name == other.name) + + def __hash__(self): + return ( + hash(type(self)) + ^ hash(self.decl) + ^ hash(self.name)) + def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: @@ -402,6 +431,20 @@ def parse_arg_list(arguments, with_offset=False): return [parse_single_arg(arg) for arg in arguments] +def get_arg_list_arg_types(arg_types): + result = [] + + for arg_type in arg_types: + if isinstance(arg_type, ScalarArg): + result.append(arg_type.dtype) + elif isinstance(arg_type, VectorArg): + result.append(arg_type) + else: + raise RuntimeError("arg type not understood: %s" % type(arg_type)) + + return tuple(result) + + def get_arg_list_scalar_arg_dtypes(arg_types): result = [] @@ -1025,6 +1068,11 @@ def is_spirv(s): # {{{ numpy key types builder class _NumpyTypesKeyBuilder(KeyBuilderBase): + def update_for_VectorArg(self, key_hash, key): # noqa: N802 + self.rec(key_hash, key.dtype) + self.update_for_str(key_hash, key.name) + self.rec(key_hash, key.with_offset) + def update_for_type(self, key_hash, key): if issubclass(key, np.generic): self.update_for_str(key_hash, key.__name__) diff --git a/pyopencl/version.py b/pyopencl/version.py index fee1f80e88ef054d5c33a33622ccff1e41eca20f..e4c47bc9ed6a54934f00ad2e03256e2fa19c60f7 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2020, 3, 1) +VERSION = (2021, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index ad5b7ab80c167ccfadbe3e4d5c7673859528a7b1..f00a6b882d50471efb14940f9aad7f93bc720b59 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -86,6 +86,7 @@ #endif +#include #include #include #include @@ -389,14 +390,15 @@ \ if (py_wait_for.ptr() != Py_None) \ { \ - event_wait_list.resize(len(py_wait_for)); \ for (py::handle evt: py_wait_for) \ - event_wait_list[num_events_in_wait_list++] = \ - evt.cast().data(); \ + { \ + event_wait_list.push_back(evt.cast().data()); \ + ++num_events_in_wait_list; \ + } \ } #define PYOPENCL_WAITLIST_ARGS \ - num_events_in_wait_list, event_wait_list.empty( ) ? nullptr : &event_wait_list.front() + num_events_in_wait_list, (num_events_in_wait_list == 0) ? 
nullptr : &event_wait_list.front() #define PYOPENCL_RETURN_NEW_NANNY_EVENT(evt, obj) \ try \ @@ -4370,7 +4372,43 @@ namespace pyopencl (m_kernel, arg_index, sizeof(cl_command_queue), &q)); } - void set_arg_buf(cl_uint arg_index, py::object py_buffer) + void set_arg_buf_pack(cl_uint arg_index, py::handle py_typechar, py::handle obj) + { + std::string typechar_str(py::cast(py_typechar)); + if (typechar_str.size() != 1) + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "type char argument must have exactly one character"); + + char typechar = typechar_str[0]; + +#define PYOPENCL_KERNEL_PACK_AND_SET_ARG(TYPECH_VAL, TYPE) \ + case TYPECH_VAL: \ + { \ + TYPE val = py::cast(obj); \ + PYOPENCL_CALL_GUARDED(clSetKernelArg, (m_kernel, arg_index, sizeof(val), &val)); \ + break; \ + } + switch (typechar) + { + PYOPENCL_KERNEL_PACK_AND_SET_ARG('c', char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('b', signed char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('B', unsigned char) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('h', short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('H', unsigned short) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('i', int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('I', unsigned int) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('l', long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('L', unsigned long) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('f', float) + PYOPENCL_KERNEL_PACK_AND_SET_ARG('d', double) + default: + throw error("Kernel.set_arg_buf_pack", CL_INVALID_VALUE, + "invalid type char"); + } +#undef PYOPENCL_KERNEL_PACK_AND_SET_ARG + } + + void set_arg_buf(cl_uint arg_index, py::handle py_buffer) { const void *buf; PYOPENCL_BUFFER_SIZE_T len; @@ -4403,7 +4441,7 @@ namespace pyopencl } #endif - void set_arg(cl_uint arg_index, py::object arg) + void set_arg(cl_uint arg_index, py::handle arg) { if (arg.ptr() == Py_None) { @@ -4615,6 +4653,85 @@ namespace pyopencl #endif }; +#define PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER \ + catch (error &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + auto mod_cl_ary(py::module::import("pyopencl.array")); \ + auto cls_array(mod_cl_ary.attr("Array")); \ + if (arg_value.ptr() && py::isinstance(arg_value, cls_array)) \ + msg.append( \ + " (perhaps you meant to pass 'array.data' instead of the array itself?)"); \ + throw error(err.routine().c_str(), err.code(), msg.c_str()); \ + } \ + catch (std::exception &err) \ + { \ + std::string msg( \ + std::string("when processing arg#") + std::to_string(arg_index+1) \ + + std::string(" (1-based): ") + std::string(err.what())); \ + throw std::runtime_error(msg.c_str()); \ + } + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. 
+ */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_value = *it++; + set_arg_func(arg_index, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + + + inline + void set_arg_multi( + std::function set_arg_func, + py::tuple args_and_indices) + { + cl_uint arg_index; + py::handle arg_descr, arg_value; + + auto it = args_and_indices.begin(), end = args_and_indices.end(); + try + { + /* This is an internal interface that assumes it gets fed well-formed + * data. No meaningful error checking is being performed on + * off-interval exhaustion of the iterator, on purpose. + */ + while (it != end) + { + // special value in case integer cast fails + arg_index = 9999 - 1; + + arg_index = py::cast(*it++); + arg_descr = *it++; + arg_value = *it++; + set_arg_func(arg_index, arg_descr, arg_value); + } + } + PYOPENCL_KERNEL_SET_ARG_MULTI_ERROR_HANDLER + } + inline py::list create_kernels_in_program(program &pgm) @@ -4635,70 +4752,73 @@ namespace pyopencl return result; } - +#define MAX_WS_DIM_COUNT 10 inline event *enqueue_nd_range_kernel( command_queue &cq, kernel &knl, - py::object py_global_work_size, - py::object py_local_work_size, - py::object py_global_work_offset, - py::object py_wait_for, + py::handle py_global_work_size, + py::handle py_local_work_size, + py::handle py_global_work_offset, + py::handle py_wait_for, bool g_times_l, bool allow_empty_ndrange) { PYOPENCL_PARSE_WAIT_FOR; - cl_uint work_dim = len(py_global_work_size); + std::array global_work_size; + unsigned gws_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_size, gws_size); + cl_uint work_dim = gws_size; - std::vector global_work_size; - COPY_PY_LIST(size_t, global_work_size); + std::array local_work_size; + unsigned lws_size = 0; + size_t *local_work_size_ptr = nullptr; - size_t *local_work_size_ptr = 0; - std::vector local_work_size; if (py_local_work_size.ptr() != Py_None) { + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, local_work_size, lws_size); + if (g_times_l) - work_dim = std::max(work_dim, unsigned(len(py_local_work_size))); + work_dim = std::max(work_dim, lws_size); else - if (work_dim != unsigned(len(py_local_work_size))) + if (work_dim != lws_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global/local work sizes have differing dimensions"); - COPY_PY_LIST(size_t, local_work_size); - - while (local_work_size.size() < work_dim) - local_work_size.push_back(1); - while (global_work_size.size() < work_dim) - global_work_size.push_back(1); + while (lws_size < work_dim) + local_work_size[lws_size++] = 1; + while (gws_size < work_dim) + global_work_size[gws_size++] = 1; - local_work_size_ptr = local_work_size.empty( ) ? 
nullptr : &local_work_size.front(); + local_work_size_ptr = &local_work_size.front(); } - if (g_times_l && local_work_size_ptr) + if (g_times_l && lws_size) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_size[work_axis] *= local_work_size[work_axis]; } - size_t *global_work_offset_ptr = 0; - std::vector global_work_offset; + size_t *global_work_offset_ptr = nullptr; + std::array global_work_offset; if (py_global_work_offset.ptr() != Py_None) { - if (work_dim != unsigned(len(py_global_work_offset))) + unsigned gwo_size = 0; + COPY_PY_ARRAY("enqueue_nd_range_kernel", size_t, global_work_offset, gwo_size); + + if (work_dim != gwo_size) throw error("enqueue_nd_range_kernel", CL_INVALID_VALUE, "global work size and offset have differing dimensions"); - COPY_PY_LIST(size_t, global_work_offset); - if (g_times_l && local_work_size_ptr) { for (cl_uint work_axis = 0; work_axis < work_dim; ++work_axis) global_work_offset[work_axis] *= local_work_size[work_axis]; } - global_work_offset_ptr = global_work_offset.empty( ) ? nullptr : &global_work_offset.front(); + global_work_offset_ptr = &global_work_offset.front(); } if (allow_empty_ndrange) @@ -4735,7 +4855,7 @@ namespace pyopencl knl.data(), work_dim, global_work_offset_ptr, - global_work_size.empty( ) ? nullptr : &global_work_size.front(), + &global_work_size.front(), local_work_size_ptr, PYOPENCL_WAITLIST_ARGS, &evt )); diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index e68c785128dda3da644a9dbd76c5feb06d7dd591..205b31ec452b388fe1b32f3443e63762b33a10c0 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -470,6 +470,28 @@ void pyopencl_expose_part_2(py::module &m) #if PYOPENCL_CL_VERSION >= 0x2000 .def("_set_arg_svm", &cls::set_arg_svm) #endif + .def("_set_arg_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_multi", + [](cls &knl, py::tuple indices_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle arg) { knl.set_arg_buf(i, arg); }, + indices_and_args); + }) + .def("_set_arg_buf_pack_multi", + [](cls &knl, py::tuple indices_chars_and_args) + { + set_arg_multi( + [&](cl_uint i, py::handle typechar, py::handle arg) + { knl.set_arg_buf_pack(i, typechar, arg); }, + indices_chars_and_args); + }) .DEF_SIMPLE_METHOD(set_arg) #if PYOPENCL_CL_VERSION >= 0x1020 .DEF_SIMPLE_METHOD(get_arg_info) diff --git a/src/wrap_helpers.hpp b/src/wrap_helpers.hpp index 4799244ff72c583a73a9d92d43229a171be4ee9c..cabc012d4a683a20807f066757278e22fafc8268 100644 --- a/src/wrap_helpers.hpp +++ b/src/wrap_helpers.hpp @@ -71,6 +71,18 @@ namespace py = pybind11; NAME.push_back(it.cast()); \ } +#define COPY_PY_ARRAY(FUNC_NAME, TYPE, NAME, COUNTER) \ + { \ + COUNTER = 0; \ + for (auto it: py_##NAME) \ + { \ + if (COUNTER == NAME.size()) \ + throw error(FUNC_NAME, \ + CL_INVALID_VALUE, "too many entries in " #NAME " argument"); \ + NAME[COUNTER++] = it.cast(); \ + } \ + } + #define COPY_PY_COORD_TRIPLE(NAME) \ size_t NAME[3] = {0, 0, 0}; \ { \ diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 353af28173c1dadec40e8c761e4b8df8bd45caaf..676aee379272ede7352d26e3bd81ebfc7f5686f9 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -1087,6 +1087,12 @@ def test_bitonic_argsort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & 
cl.device_type.GPU: + pytest.xfail("bitonic argsort fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + dev = ctx.devices[0] if (dev.platform.name == "Portable Computing Language" and sys.platform == "darwin"): diff --git a/test/test_array.py b/test/test_array.py index b4234a3777059cc44b2f37006c98239e4dbe2ec5..899fc2383c1248efc80dbb7a49f8801146376554 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -715,6 +715,13 @@ def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if has_double_support(context.devices[0]): dtypes = [np.float32, np.float64] else: @@ -767,6 +774,12 @@ def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): context = ctx_factory() queue = cl.CommandQueue(context) + if queue.device.platform.vendor == "The pocl project" \ + and queue.device.type & cl.device_type.GPU \ + and rng_class is RanluxGenerator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the Titan V, as of pocl 1.6, 2021-01-20") + if rng_class is RanluxGenerator: gen = rng_class(queue, 5120) else: @@ -1350,6 +1363,12 @@ def test_get_async(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("the async get test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + a = np.random.rand(10**6).astype(np.dtype("float32")) a_gpu = cl_array.to_device(queue, a) b = a + a**5 + 1 diff --git a/test/test_clrandom.py b/test/test_clrandom.py index 1ce479b2b1f18b099c9457d43a03c4b2327ea77f..1ec53842a036bc659d12fa546425604ac01c4964 100644 --- a/test/test_clrandom.py +++ b/test/test_clrandom.py @@ -68,6 +68,13 @@ def test_clrandom_dtypes(ctx_factory, rng_class, dtype): size = 10 with cl.CommandQueue(cl_ctx) as queue: + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU \ + and rng_class is make_ranlux_generator: + pytest.xfail("ranlux test fails on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + rng.uniform(queue, size, dtype) if dtype not in (np.int32, np.int64): diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py index 55f72d9519f89ed5b17c061559104379a3896e72..402bc8b9d71e47592f67101ff58dfb90a0c56630 100644 --- a/test/test_enqueue_copy.py +++ b/test/test_enqueue_copy.py @@ -48,8 +48,14 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": - pytest.skip("Apple's CL implementation crashes on this.") + pytest.xfail("Apple's CL implementation crashes on this.") ary_in_shp = 256, 128 # Entire array shape from which sub-array copied to device sub_ary_shp = 128, 96 # Sub-array shape to be copied to device @@ -138,6 +144,12 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True): # 
https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") + device = queue.device + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail("rect copies fail on POCL + Nvidia," + "at least the K40, as of pocl 1.6, 2021-01-20") + if honor_skip and queue.device.platform.name == "Apple": pytest.skip("Apple's CL implementation crashes on this.") diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 8aad416e75b9e3e346946cb2b5c5b40836c88ee4..1068779b78d443cd0f3fe012c429025d82057d6d 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -46,10 +46,17 @@ else: faulthandler.enable() -def _skip_if_pocl(plat, up_to_version, msg="unsupported by pocl"): +def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"): if plat.vendor == "The pocl project": if up_to_version is None or get_pocl_version(plat) <= up_to_version: - pytest.skip(msg) + pytest.xfail(msg) + + +def _xfail_if_pocl_gpu(device, what): + if device.platform.vendor == "The pocl project" \ + and device.type & cl.device_type.GPU: + pytest.xfail(f"POCL's {what} support don't work right on Nvidia GPUs, " + "at least the Titan V, as of pocl 1.6, 2021-01-20") def test_get_info(ctx_factory): @@ -382,7 +389,7 @@ def test_image_2d(ctx_factory): if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image( @@ -454,7 +461,7 @@ def test_image_3d(ctx_factory): if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) - _skip_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") + _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image_plane( @@ -676,7 +683,7 @@ def test_can_build_and_run_binary(ctx_factory): def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() # Still relevant on pocl 1.0RC1. 
- _skip_if_pocl( + _xfail_if_pocl( ctx.devices[0].platform, (1, 0), "pocl crashes on enqueue_barrier") queue = cl.CommandQueue(ctx) @@ -704,7 +711,7 @@ def test_unload_compiler(platform): or cl.get_cl_header_version() < (1, 2)): from pytest import skip skip("clUnloadPlatformCompiler is only available in OpenCL 1.2") - _skip_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") + _xfail_if_pocl(platform, (0, 13), "pocl does not support unloading compiler") if platform.vendor == "Intel(R) Corporation": from pytest import skip skip("Intel proprietary driver does not support unloading compiler") @@ -745,7 +752,7 @@ def test_user_event(ctx_factory): skip("UserEvent is only available in OpenCL 1.1") # https://github.com/pocl/pocl/issues/201 - _skip_if_pocl(ctx.devices[0].platform, (0, 13), + _xfail_if_pocl(ctx.devices[0].platform, (0, 13), "pocl's user events don't work right") status = {} @@ -842,6 +849,8 @@ def test_event_set_callback(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "event callbacks") + if ctx._get_cl_version() < (1, 1): pytest.skip("OpenCL 1.1 or newer required for set_callback") @@ -896,6 +905,8 @@ def test_global_offset(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) + _xfail_if_pocl_gpu(queue.device, "global offset") + prg = cl.Program(context, """ __kernel void mult(__global float *a) { a[get_global_id(0)] *= 2; } @@ -981,6 +992,8 @@ def test_coarse_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "SVM") + dev = ctx.devices[0] from pyopencl.characterize import has_coarse_grain_buffer_svm @@ -1062,6 +1075,8 @@ def test_fine_grain_svm(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "GPU SVM") + from pyopencl.characterize import has_fine_grain_buffer_svm from pytest import skip if not has_fine_grain_buffer_svm(queue.device): @@ -1148,6 +1163,8 @@ def test_copy_buffer_rect(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) + _xfail_if_pocl_gpu(queue.device, "rectangular copies") + arr1 = cl_array.zeros(queue, (2, 3), "f") arr2 = cl_array.zeros(queue, (4, 5), "f") arr1.fill(1)
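Usage note (editorial, not part of the patch): below is a minimal sketch of the kernel-reuse idiom that the updated examples/demo.py and examples/demo_array.py adopt, combined with the argument-type declaration this patch renames from set_scalar_arg_dtypes to set_arg_types (the old name is kept as an alias). The kernel name, buffer sizes, and the extra scalar argument are illustrative only and are not taken from the diff.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

a_np = np.random.rand(50000).astype(np.float32)
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)

prg = cl.Program(ctx, """
__kernel void scale(__global const float *a, __global float *res, float factor)
{
    int gid = get_global_id(0);
    res[gid] = factor * a[gid];
}
""").build()

knl = prg.scale  # fetch the Kernel object once, reuse it for repeated calls

# Declare argument types up front; None marks non-scalar arguments such as
# Buffers. set_scalar_arg_dtypes remains available as an alias of the new
# set_arg_types introduced by this patch.
knl.set_arg_types([None, None, np.float32])

for factor in (2.0, 3.0):
    knl(queue, a_np.shape, None, a_g, res_g, factor)

res_np = np.empty_like(a_np)
cl.enqueue_copy(queue, res_np, res_g)
assert np.allclose(res_np, 3.0 * a_np)  # last enqueue used factor=3.0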