diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index aa8abdd74f64ca043bd2f14048c9bb8238c2cd99..0e7412b17d4e3bf6f4b731a90a9782448350c5a5 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -42,6 +42,8 @@ except ImportError:
                 "its source directory. This likely won't work.")
     raise
 
+_CPY2 = _cl._CPY2
+_CPY26 = _cl._CPY2 and sys.version_info < (2, 7)
 
 import numpy as np
 
@@ -167,6 +169,8 @@ CONSTANT_CLASSES = [
         and name[0].islower() and name not in ["zip", "map", "range"]]
 
 
+# {{{ diagnostics
+
 class CompilerWarning(UserWarning):
     pass
 
@@ -185,6 +189,25 @@ def compiler_output(text):
 class _ErrorRecord(_Record):
     pass
 
+# }}}
+
+
+# {{{ arg packing helpers
+
+_size_t_char = ({
+    8: 'Q',
+    4: 'L',
+    2: 'H',
+    1: 'B',
+})[_cl._ffi.sizeof('size_t')]
+_type_char_map = {
+    'n': _size_t_char.lower(),
+    'N': _size_t_char
+}
+del _size_t_char
+
+# }}}
+
 
 # {{{ find pyopencl shipped source code
 
@@ -609,72 +632,260 @@ def _add_functionality():
         kernel_old_init(self, prg, name)
         self._source = getattr(prg, "_source", None)
 
-    def kernel_call(self, queue, global_size, local_size, *args, **kwargs):
-        global_offset = kwargs.pop("global_offset", None)
-        g_times_l = kwargs.pop("g_times_l", False)
-        wait_for = kwargs.pop("wait_for", None)
+        self._generate_naive_call()
 
-        if kwargs:
-            raise TypeError(
-                    "Kernel.__call__ recived unexpected keyword arguments: %s"
-                    % ", ".join(list(kwargs.keys())))
+    # {{{ code generation for __call__, set_args
 
-        self.set_args(*args)
+    def kernel__set_set_args_body(self, body, num_passed_args):
+        """Compile *body* into self._enqueue and self._set_args."""
+        from pytools.py_codegen import (
+                PythonFunctionGenerator,
+                PythonCodeGenerator,
+                Indentation)
 
-        return enqueue_nd_range_kernel(queue, self, global_size, local_size,
-                global_offset, wait_for, g_times_l=g_times_l)
+        arg_names = ["arg%d" % i for i in range(num_passed_args)]
 
-    def kernel_set_scalar_arg_dtypes(self, arg_dtypes):
-        assert len(arg_dtypes) == self.num_args, (
-                "length of argument type array (%d) and "
-                "CL-generated number of arguments (%d) do not agree"
-                % (len(arg_dtypes), self.num_args))
+        # {{{ wrap in error handler
 
-        arg_type_chars = []
+        err_gen = PythonCodeGenerator()
 
-        for arg_dtype in arg_dtypes:
-            if arg_dtype is None:
-                arg_type_chars.append(None)
+        err_gen("try:")
+        with Indentation(err_gen):
+            err_gen("current_arg = None")  # non-empty try body for 0 args
+            err_gen.extend(body)
+        err_gen("except TypeError as e:")
+        with Indentation(err_gen):
+            err_gen("""
+                if current_arg is not None:
+                    args = [{args}]
+                    advice = ""
+                    from pyopencl.array import Array
+                    if isinstance(args[current_arg], Array):
+                        advice = " (perhaps you meant to pass 'array.data' " \
+                            "instead of the array itself?)"
+
+                    raise _cl.LogicError(
+                            "when processing argument #%d (1-based): %s%s"
+                            % (current_arg+1, str(e), advice))
+                else:
+                    raise
+                """
+                .format(args=", ".join(arg_names)))
+            err_gen("")
+
+        # }}}
+
+        def add_preamble(gen):
+            gen.add_to_preamble(
+                "import numpy as np")
+            gen.add_to_preamble(
+                "import pyopencl.cffi_cl as _cl")
+            gen.add_to_preamble(
+                "from pyopencl.cffi_cl import _lib, "
+                "_ffi, _handle_error, _CLKernelArg")
+            gen.add_to_preamble("from pyopencl import status_code")
+            gen.add_to_preamble("from struct import pack")
+            gen.add_to_preamble("")
+
+        # {{{ generate _enqueue
+        gen = PythonFunctionGenerator("enqueue_knl_%s" % self.function_name,
+                ["self", "queue", "global_size", "local_size"]
+                + arg_names
+                + ["global_offset=None", "g_times_l=False", "wait_for=None"])
+
+        add_preamble(gen)
+        gen.extend(err_gen)
+
+        gen("""
+            return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size,
+                    global_offset, wait_for, g_times_l=g_times_l)
+            """)
+
+        self._enqueue = gen.get_function()
+
+        # }}}
+
+        # {{{ generate set_args
+        gen = PythonFunctionGenerator("_set_args", ["self"] + arg_names)
+
+        add_preamble(gen)
+        gen.extend(err_gen)
+
+        self._set_args = gen.get_function()
+
+        # }}}
+
+    def kernel__generate_buffer_arg_setter(self, gen, arg_idx, buf_var):
+        """Emit code setting CL argument *arg_idx* from buffer *buf_var*."""
+        from pytools.py_codegen import Indentation
+        if _CPY2:
+            # https://github.com/numpy/numpy/issues/5381
+            gen("if isinstance({buf_var}, np.generic):".format(buf_var=buf_var))
+            with Indentation(gen):
+                gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var))
+
+        gen("""
+            c_buf, sz, _ = _cl._c_buffer_from_obj({buf_var})
+            status = _lib.kernel__set_arg_buf(self.ptr, {arg_idx}, c_buf, sz)
+            if status != _ffi.NULL:
+                _handle_error(status)
+            """
+            .format(arg_idx=arg_idx, buf_var=buf_var))
+
+    def kernel__generate_generic_arg_handler(self, gen, arg_idx, arg_var):
+        """Emit None/_CLKernelArg/buffer dispatch for argument *arg_idx*."""
+        from pytools.py_codegen import Indentation
+        gen("""
+            if {arg_var} is None:
+                status = _lib.kernel__set_arg_null(self.ptr, {arg_idx})
+                if status != _ffi.NULL:
+                    _handle_error(status)
+            elif isinstance({arg_var}, _CLKernelArg):
+                self.set_arg({arg_idx}, {arg_var})
+            """
+            .format(arg_idx=arg_idx, arg_var=arg_var))
+
+        gen("else:")
+        with Indentation(gen):
+            self._generate_buffer_arg_setter(gen, arg_idx, arg_var)
+
+    def kernel__generate_naive_call(self):
+        """Generate setters that treat every argument generically."""
+        num_args = self.num_args
+        from pytools.py_codegen import PythonCodeGenerator
+        gen = PythonCodeGenerator()
+
+        for i in range(num_args):
+            gen("# process argument {arg_idx}".format(arg_idx=i))
+            gen("")
+            gen("current_arg = {arg_idx}".format(arg_idx=i))
+            self._generate_generic_arg_handler(gen, i, "arg%d" % i)
+            gen("")
+
+        self._set_set_args_body(gen, num_args)
+
+    def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes):
+        """Generate setters that pack scalar args per dtype (None: generic)."""
+        # {{{ arg counting bug handling
+        # For example:
+        # https://github.com/pocl/pocl/issues/197
+        # (but Apple CPU has a similar bug)
+
+        work_around_arg_count_bug = False
+        warn_about_arg_count_bug = False
+
+        from pyopencl.characterize import has_struct_arg_count_bug
+
+        count_bug_per_dev = [
+                has_struct_arg_count_bug(dev)
+                for dev in self.context.devices]
+
+        if any(count_bug_per_dev):
+            if all(count_bug_per_dev):
+                work_around_arg_count_bug = True
             else:
-                arg_type_chars.append(np.dtype(arg_dtype).char)
+                warn_about_arg_count_bug = True
+
+        # }}}
+
+        cl_arg_idx = 0
+
+        from pytools.py_codegen import PythonCodeGenerator
+        gen = PythonCodeGenerator()
+
+        for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes):
+            gen("# process argument {arg_idx}".format(arg_idx=arg_idx))
+            gen("")
+            gen("current_arg = {arg_idx}".format(arg_idx=arg_idx))
+            arg_var = "arg%d" % arg_idx
+
+            if arg_dtype is None:
+                self._generate_generic_arg_handler(gen, cl_arg_idx, arg_var)
+                cl_arg_idx += 1
+                gen("")
+                continue
+
+            arg_dtype = np.dtype(arg_dtype)
 
-        self._arg_type_chars = arg_type_chars
+            if arg_dtype.char == "V":
+                self._generate_generic_arg_handler(gen, cl_arg_idx, arg_var)
+                cl_arg_idx += 1
 
-    def kernel_set_args(self, *args):
-        assert len(args) == self.num_args, (
+            elif arg_dtype.kind == "c":
+                if warn_about_arg_count_bug:
+                    warn("{knl_name}: arguments include complex numbers, and "
+                            "some (but not all) of the target devices mishandle "
+                            "struct kernel arguments (hence the workaround is "
+                            "disabled)".format(knl_name=self.function_name),
+                            stacklevel=2)
+
+                if arg_dtype == np.complex64:
+                    arg_char = "f"
+                elif arg_dtype == np.complex128:
+                    arg_char = "d"
+                else:
+                    raise TypeError("unexpected complex type: %s" % arg_dtype)
+
+                if work_around_arg_count_bug and arg_dtype == np.complex128:
+                    gen(
+                            "buf = pack('{arg_char}', {arg_var}.real)"
+                            .format(arg_char=arg_char, arg_var=arg_var))
+                    self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf")
+                    cl_arg_idx += 1
+                    gen(
+                            "buf = pack('{arg_char}', {arg_var}.imag)"
+                            .format(arg_char=arg_char, arg_var=arg_var))
+                    self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf")
+                    cl_arg_idx += 1
+                else:
+                    gen(
+                            "buf = pack('{arg_char}{arg_char}', "
+                            "{arg_var}.real, {arg_var}.imag)"
+                            .format(arg_char=arg_char, arg_var=arg_var))
+                    self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf")
+                    cl_arg_idx += 1
+
+            elif arg_dtype.char in "IL" and _CPY26:
+                # Prevent SystemError: ../Objects/longobject.c:336: bad
+                # argument to internal function
+
+                gen(
+                        "buf = pack('{arg_char}', long({arg_var}))"
+                        .format(arg_char=arg_dtype.char, arg_var=arg_var))
+                self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf")
+                cl_arg_idx += 1
+
+            else:
+                arg_char = arg_dtype.char
+                arg_char = _type_char_map.get(arg_char, arg_char)
+                gen(
+                        "buf = pack('{arg_char}', {arg_var})"
+                        .format(
+                            arg_char=arg_char,
+                            arg_var=arg_var))
+                self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf")
+                cl_arg_idx += 1
+
+            gen("")
+
+        if cl_arg_idx != self.num_args:
+            raise TypeError(
                 "length of argument list (%d) and "
                 "CL-generated number of arguments (%d) do not agree"
-                % (len(args), self.num_args))
+                % (cl_arg_idx, self.num_args))
 
-        i = None
-        try:
-            try:
-                arg_type_chars = self.__dict__["_arg_type_chars"]
-            except KeyError:
-                for i, arg in enumerate(args):
-                    self.set_arg(i, arg)
-            else:
-                from pyopencl._pvt_struct import pack
+        self._set_set_args_body(gen, len(scalar_arg_dtypes))
 
-                for i, (arg, arg_type_char) in enumerate(
-                        zip(args, arg_type_chars)):
-                    if arg_type_char and arg_type_char != "V":
-                        self.set_arg(i, pack(arg_type_char, arg))
-                    else:
-                        self.set_arg(i, arg)
-        except TypeError as e:
-            if i is not None:
-                advice = ""
-                from pyopencl.array import Array
-                if isinstance(args[i], Array):
-                    advice = " (perhaps you meant to pass 'array.data' " \
-                        "instead of the array itself?)"
-
-                raise LogicError(
-                        "when processing argument #%d (1-based): %s%s"
-                        % (i+1, str(e), advice))
-            else:
-                raise
+    # }}}
+
+    def kernel_set_args(self, *args, **kwargs):
+        # Need to duplicate the 'self' argument for dynamically generated method
+        return self._set_args(self, *args, **kwargs)
+
+    def kernel_call(self, queue, global_size, local_size, *args, **kwargs):
+        # Python looks up __call__ on the type, not the instance, so this
+        # class-level trampoline forwards to the generated self._enqueue.
+        return self._enqueue(self, queue, global_size, local_size, *args, **kwargs)
 
     def kernel_capture_call(self, filename, queue, global_size, local_size,
             *args, **kwargs):
@@ -683,9 +894,13 @@ def _add_functionality():
                 *args, **kwargs)
 
     Kernel.__init__ = kernel_init
-    Kernel.__call__ = kernel_call
+    Kernel._set_set_args_body = kernel__set_set_args_body
+    Kernel._generate_buffer_arg_setter = kernel__generate_buffer_arg_setter
+    Kernel._generate_generic_arg_handler = kernel__generate_generic_arg_handler
+    Kernel._generate_naive_call = kernel__generate_naive_call
     Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes
     Kernel.set_args = kernel_set_args
+    Kernel.__call__ = kernel_call
     Kernel.capture_call = kernel_capture_call
 
     # }}}
@@ -842,11 +1057,20 @@ def _add_functionality():
         except AttributeError:
             return str(val)
         else:
-            result = "%s failed: %s" % (val.routine(),
-                    status_code.to_string(val.code(), "<unknown error %d>")
-                    .lower().replace("_", " "))
-            if val.what():
-                result += " - " + val.what()
+            result = ""
+            if val.code() != status_code.SUCCESS:
+                result = status_code.to_string(
+                        val.code(), "<unknown error %d>")
+            routine = val.routine()
+            if routine:
+                result = "%s failed: %s" % (
+                    routine.lower().replace("_", " "),
+                    result)
+            what = val.what()
+            if what:
+                if result:
+                    result += " - "
+                result += what
             return result
 
     def error_code(self):
diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py
index 2f92250fc94ff80cf1ca43f4ca4daa60ea91fa2e..fbd2a0cc6ba3e69399210bccb7a004336e26f4ce 100644
--- a/pyopencl/cffi_cl.py
+++ b/pyopencl/cffi_cl.py
@@ -37,6 +37,11 @@ from .compyte.array import f_contiguous_strides, c_contiguous_strides
 
 _lib = _ffi.dlopen(None)
 
+
+class _CLKernelArg(object):
+    pass
+
+
 # {{{ hook up connections between the wrapper and the interperter
 
 import gc
@@ -713,7 +718,7 @@ class cffi_array(np.ndarray):  # noqa
         return self.__base
 
 
-class LocalMemory(object):
+class LocalMemory(_CLKernelArg):
     __slots__ = ('_size',)
 
     def __init__(self, size):
@@ -724,7 +729,7 @@ class LocalMemory(object):
         return self._size
 
 
-class MemoryObjectHolder(_Common):
+class MemoryObjectHolder(_Common, _CLKernelArg):
     def get_host_array(self, shape, dtype, order="C"):
         dtype, shape, strides = _norm_shape_dtype(
             shape, dtype, order, None, 'MemoryObjectHolder.get_host_array')
@@ -1030,17 +1035,20 @@ class Kernel(_Common):
         self.ptr = ptr_kernel[0]
 
     def set_arg(self, arg_index, arg):
+        # If you change this, also change the kernel call generation logic.
         if arg is None:
             _handle_error(_lib.kernel__set_arg_null(self.ptr, arg_index))
-        elif isinstance(arg, MemoryObjectHolder):
-            _handle_error(_lib.kernel__set_arg_mem(self.ptr, arg_index, arg.ptr))
-        elif isinstance(arg, Sampler):
-            _handle_error(_lib.kernel__set_arg_sampler(self.ptr, arg_index,
-                                                       arg.ptr))
-        elif isinstance(arg, LocalMemory):
-            _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
-                                                   _ffi.NULL, arg.size))
+        elif isinstance(arg, _CLKernelArg):
+            if isinstance(arg, MemoryObjectHolder):
+                _handle_error(_lib.kernel__set_arg_mem(self.ptr, arg_index, arg.ptr))
+            elif isinstance(arg, Sampler):
+                _handle_error(_lib.kernel__set_arg_sampler(self.ptr, arg_index,
+                                                           arg.ptr))
+            elif isinstance(arg, LocalMemory):
+                _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
+                                                       _ffi.NULL, arg.size))
         elif _CPY2 and isinstance(arg, np.generic):
+            # https://github.com/numpy/numpy/issues/5381
             c_buf, size, _ = _c_buffer_from_obj(np.getbuffer(arg))
             _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index,
                                                    c_buf, size))
@@ -1869,7 +1877,7 @@ class Image(MemoryObject):
 
 # {{{ Sampler
 
-class Sampler(_Common):
+class Sampler(_Common, _CLKernelArg):
     _id = 'sampler'
 
     def __init__(self, context, normalized_coords, addressing_mode, filter_mode):
diff --git a/setup.py b/setup.py
index 0c19b0f7e4bc4e5286b0ad1b705f466bacfc0b0d..4ae6a0526a2c00e9b2ef7ab92e2f99bff0db452f 100644
--- a/setup.py
+++ b/setup.py
@@ -207,7 +207,7 @@ def main():
 
             install_requires=[
                 "numpy",
-                "pytools>=2014.2",
+                "pytools>=2015.1.1",
                 "pytest>=2",
                 "decorator>=3.2.0",
                 "cffi>=1.1.0",
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 5518d508e01910126143602f34fca40dcc954485..b55c850e6681bd738b45378817a28e8c90ac1eb4 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -328,9 +328,6 @@ def test_dot(ctx_factory):
     queue = cl.CommandQueue(context)
 
     dev = context.devices[0]
-    from pyopencl.characterize import has_struct_arg_count_bug
-    if has_struct_arg_count_bug(dev):
-        pytest.xfail("device has struct arg counting bug")
 
     dtypes = [np.float32, np.complex64]
     if has_double_support(dev):
diff --git a/test/test_array.py b/test/test_array.py
index adb2f74482f7d86023c6d69a1abb28c4648da5f2..ecfd3ba97a027278ffb4c317c5180d49cd25695e 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -95,11 +95,6 @@ def test_mix_complex(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    dev = context.devices[0]
-    from pyopencl.characterize import has_struct_arg_count_bug
-    if has_struct_arg_count_bug(dev):
-        pytest.xfail("device has struct arg counting bug")
-
     size = 10
 
     dtypes = [
@@ -174,9 +169,6 @@ def test_pow_neg1_vs_inv(ctx_factory):
     if not has_double_support(device):
         from pytest import skip
         skip("double precision not supported on %s" % device)
-    from pyopencl.characterize import has_struct_arg_count_bug
-    if has_struct_arg_count_bug(device):
-        pytest.xfail("device has struct arg counting bug")
 
     a_dev = make_random_array(queue, np.complex128, 20000)
 
diff --git a/test/test_clmath.py b/test/test_clmath.py
index e0e07764c9a324916183f0cd8423b3a1a1795aac..4dea8f91963e59f5a2ca2e3018cd73bf9287296f 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -76,11 +76,6 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False
         gpu_func = getattr(clmath, name)
         cpu_func = getattr(np, numpy_func_names.get(name, name))
 
-        dev = context.devices[0]
-        from pyopencl.characterize import has_struct_arg_count_bug
-        if use_complex and has_struct_arg_count_bug(dev):
-            pytest.xfail("device has struct arg counting bug")
-
         if has_double_support(context.devices[0]):
             if use_complex:
                 dtypes = [np.float32, np.float64, np.complex64, np.complex128]
@@ -132,8 +127,8 @@ if have_cl():
             (-math.pi/2 + 0.1, math.pi/2 - 0.1), 4e-5, use_complex=True)
     test_atan = make_unary_function_test("atan", (-10, 10), 2e-7)
 
-    test_sinh = make_unary_function_test("sinh", (-3, 3), 2e-6, use_complex=2e-3)
-    test_cosh = make_unary_function_test("cosh", (-3, 3), 2e-6, use_complex=2e-3)
+    test_sinh = make_unary_function_test("sinh", (-3, 3), 3e-6, use_complex=2e-3)
+    test_cosh = make_unary_function_test("cosh", (-3, 3), 3e-6, use_complex=2e-3)
     test_tanh = make_unary_function_test("tanh", (-3, 3), 2e-6, use_complex=True)
 
 
@@ -228,10 +223,6 @@ def test_frexp(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    if context.devices[0].platform.name == "Portable Computing Language":
-        # https://github.com/pocl/pocl/issues/202
-        pytest.xfail("POCL's frexp seems to have issues")
-
     for s in sizes:
         a = cl_array.arange(queue, s, dtype=np.float32)/10
         significands, exponents = clmath.frexp(a)