diff --git a/doc/source/index.rst b/doc/source/index.rst
index 9fd85142d25562240029b1354b6de8a53ed91efc..38456e96a14eabd00176b328c123d2a72df1aa51 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -53,7 +53,7 @@ Here's an example, to give you an impression::
         }
         """).build()
 
-    prg.sum(queue, a.shape, a_buf, b_buf, dest_buf)
+    prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
 
     a_plus_b = numpy.empty_like(a)
     cl.enqueue_read_buffer(queue, dest_buf, a_plus_b).wait()
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index 7504bf8f326acbb8b384c5b4ce87df5bbbed76b3..8deba3b084ec46c024b50bc393ce5637ac565cd5 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -78,6 +78,8 @@ Version 0.92
   `cl_khr_gl_sharing <ghttp://www.khronos.org/registry/cl/extensions/khr/cl_khr_gl_sharing.txt>`_
   extension, leading to working GL interoperability.
 * Add :meth:`pyopencl.Kernel.set_args`.
+* The call signature of :meth:`pyopencl.Kernel.__call__` changed to
+  emphasize the importance of *local_size*.
 
 Version 0.91.5
 --------------
diff --git a/doc/source/runtime.rst b/doc/source/runtime.rst
index 11ddf72c08c15d3ef5d379aa01af6b6e3bd25b84..7fa2b130ed861818322825228a7893a025b979bf 100644
--- a/doc/source/runtime.rst
+++ b/doc/source/runtime.rst
@@ -592,13 +592,26 @@ Programs and Kernels
 
         Invoke :meth:`set_arg` on each element of *args* in turn.
 
-    .. method:: __call__(queue, global_size, *args, global_offset=None, local_size=None, wait_for=None)
+    .. method:: __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None)
 
         Use :func:`enqueue_nd_range_kernel` to enqueue a kernel execution, after using
-        :meth:`set_arg` to set each argument in turn. See the documentation for 
+        :meth:`set_args` to set each argument in turn. See the documentation for 
         :meth:`set_arg` to see what argument types are allowed.
         |std-enqueue-blurb|
 
+        *None* may be passed for *local_size*.
+
+        .. versionchanged:: 0.92
+            *local_size* was promoted to third positional argument from being a
+            keyword argument. The old keyword argument usage will continue to
+            be accepted with a warning throughout the 0.92 release cycle. 
+            This is a backward-compatible change (just barely!) because
+            *local_size* as third positional argument can only be a
+            :class:`tuple` or *None*.  :class:`tuple` instances are never valid
+            :class:`Kernel` arguments, and *None* is valid as an argument, but
+            its treatment in the wrapper had a bug (now fixed) that prevented
+            it from working.
+
     |comparable|
 
 .. class:: LocalMemory(size)
diff --git a/examples/benchmark-all.py b/examples/benchmark-all.py
index 03553f94af4bbbdae217b0b8b4c9de4fa1aa56d0..0d08979b76e89ee1b75e8b3a07d93e86beab23f0 100644
--- a/examples/benchmark-all.py
+++ b/examples/benchmark-all.py
@@ -60,7 +60,7 @@ for platform in cl.get_platforms():
                 }
                 """).build()
 
-        exec_evt = prg.sum(queue, a.shape, a_buf, b_buf, dest_buf)
+        exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
         exec_evt.wait()
         elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
 
diff --git a/examples/demo.py b/examples/demo.py
index 71f146eb87581e145f909f872e5d9c07a02b46b6..44bc0a58e24142f6263c70c5e77849960d9c4da2 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -22,7 +22,7 @@ prg = cl.Program(ctx, """
     }
     """).build()
 
-prg.sum(queue, a.shape, a_buf, b_buf, dest_buf)
+prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
 
 a_plus_b = numpy.empty_like(a)
 cl.enqueue_read_buffer(queue, dest_buf, a_plus_b).wait()
diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py
index 8c2a827560e77e72d0dad71eb4d406cb1107fe29..c080109b9dcfe45c16525db2eaa7709f9250b3a9 100644
--- a/examples/demo_meta_codepy.py
+++ b/examples/demo_meta_codepy.py
@@ -46,9 +46,8 @@ mod = Module([
 
 knl = cl.Program(ctx, str(mod)).build().add
 
-knl(queue, (local_size*macroblock_count,), 
-        c_buf, a_buf, b_buf, 
-        local_size=(local_size,))
+knl(queue, (local_size*macroblock_count,), (local_size,),
+        c_buf, a_buf, b_buf)
 
 c = numpy.empty_like(a)
 cl.enqueue_read_buffer(queue, c_buf, c).wait()
diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py
index fe05b1eeabb2fac27b1716e34daf3d1ee44d3930..e6d9315dc308936e8f9bed888e108c4bf609e200 100644
--- a/examples/demo_meta_template.py
+++ b/examples/demo_meta_template.py
@@ -44,9 +44,8 @@ rendered_tpl = tpl.render(type_name="float",
 
 knl = cl.Program(ctx, str(rendered_tpl)).build().add
 
-knl(queue, (local_size*macroblock_count,), 
-        c_buf, a_buf, b_buf, 
-        local_size=(local_size,))
+knl(queue, (local_size*macroblock_count,), (local_size,),
+        c_buf, a_buf, b_buf)
 
 c = numpy.empty_like(a)
 cl.enqueue_read_buffer(queue, c_buf, c).wait()
diff --git a/examples/gl_interop_demo.py b/examples/gl_interop_demo.py
index e773db83a8dd587d72fbc40437a59fef2cd5e7be..dd4bf3e3abe962d37947a6ebfd88e1d40a6aefeb 100644
--- a/examples/gl_interop_demo.py
+++ b/examples/gl_interop_demo.py
@@ -32,7 +32,7 @@ def initialize():
     plats = cl.get_platforms()
     ctx_props = cl.context_properties
     props = [(ctx_props.PLATFORM, plats[0]), (ctx_props.GL_CONTEXT_KHR,
-	GetCurrentContext()), (ctx_props.GLX_DISPLAY_KHR, GetCurrentDisplay())]
+        GetCurrentContext()), (ctx_props.GLX_DISPLAY_KHR, GetCurrentDisplay())]
     ctx = cl.Context(properties=props)
     glClearColor(1, 1, 1, 1)
     glColor(0, 0, 1)
@@ -45,7 +45,7 @@ def initialize():
     prog = cl.Program(ctx, src).build()
     queue = cl.CommandQueue(ctx)
     cl.enqueue_acquire_gl_objects(queue, [coords_dev])
-    prog.generate_sin(queue, (n_vertices,), coords_dev)
+    prog.generate_sin(queue, (n_vertices,), None, coords_dev)
     cl.enqueue_release_gl_objects(queue, [coords_dev])
     queue.finish()
     glFlush()
@@ -65,7 +65,7 @@ if __name__ == '__main__':
     import sys
     glutInit(sys.argv)
     if len(sys.argv) > 1:
-	n_vertices = int(sys.argv[1])
+        n_vertices = int(sys.argv[1])
     glutInitWindowSize(800, 160)
     glutInitWindowPosition(0, 0)
     glutCreateWindow('OpenCL/OpenGL Interop Tutorial: Sin Generator')
diff --git a/examples/matrix-multiply.py b/examples/matrix-multiply.py
index 7e26f8c2aa50b4bc0842d16b400f5a635da9786c..142591f14e7bbeb568c9558386c87a64fd7bf10a 100644
--- a/examples/matrix-multiply.py
+++ b/examples/matrix-multiply.py
@@ -183,9 +183,9 @@ push_time = time()-t1
 
 # warmup ----------------------------------------------------------------------
 for i in range(5):
-    event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf, 
-            local_size=(block_size, block_size))
-event.wait()
+    event = kernel(queue, h_c.shape, (block_size, block_size), 
+            d_c_buf, d_a_buf, d_b_buf)
+    event.wait()
 
 queue.finish()
 
@@ -194,8 +194,8 @@ t1 = time()
 
 count = 20
 for i in range(count):
-    event = kernel(queue, h_c.shape, d_c_buf, d_a_buf, d_b_buf, 
-            local_size=(block_size, block_size))
+    event = kernel(queue, h_c.shape, (block_size, block_size),
+            d_c_buf, d_a_buf, d_b_buf)
 
 event.wait()
 
diff --git a/examples/narray.py b/examples/narray.py
index 667f6717411e80e76670a951af55298b70b1ad47..17450d73c0977f597c59947ccfb70c9f5b3a79ff 100644
--- a/examples/narray.py
+++ b/examples/narray.py
@@ -28,7 +28,7 @@ except:
     print prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG)
     raise
 
-prg.demo(queue, (500,), demo_buf)
+prg.demo(queue, (500,), None, demo_buf)
 cl.enqueue_read_buffer(queue, demo_buf, demo_r).wait()
 
 for res in demo_r:
diff --git a/examples/transpose.py b/examples/transpose.py
index ad86de3b7112d107173d78e6d52222b70e38e469..7c571fdf07d6d6a27cc4d49afdaef81b5b5f2c85 100644
--- a/examples/transpose.py
+++ b/examples/transpose.py
@@ -35,9 +35,8 @@ class NaiveTranspose:
         assert w % block_size == 0
         assert h % block_size == 0
 
-        return self.kernel(queue, (w, h),
-            tgt, src, numpy.uint32(w), numpy.uint32(h),
-            local_size=(block_size, block_size))
+        return self.kernel(queue, (w, h), (block_size, block_size),
+            tgt, src, numpy.uint32(w), numpy.uint32(h))
 
 
 
@@ -48,7 +47,7 @@ class SillyTranspose(NaiveTranspose):
         assert w % block_size == 0
         assert h % block_size == 0
 
-        return self.kernel(queue, (w, h),
+        return self.kernel(queue, (w, h), None,
             tgt, src, numpy.uint32(w), numpy.uint32(h))
 
 
@@ -90,10 +89,9 @@ class TransposeWithLocal:
         assert w % block_size == 0
         assert h % block_size == 0
 
-        return self.kernel(queue, (w, h),
+        return self.kernel(queue, (w, h), (block_size, block_size),
             tgt, src, numpy.uint32(w), numpy.uint32(h),
-            cl.LocalMemory(4*block_size*(block_size+1)),
-            local_size=(block_size, block_size))
+            cl.LocalMemory(4*block_size*(block_size+1)))
 
 
 
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index f8083bc4ec68016f626eb66cfefc879e898f3156..e9c4474502922b81699422f02768bace3aa3d1b5 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -52,7 +52,7 @@ def _add_functionality():
     for cls in CONSTANT_CLASSES:
         cls.to_string = classmethod(to_string)
 
-    # get_info attributes -----------------------------------------------------
+    # {{{ get_info attributes -------------------------------------------------
     def make_getattr(info_classes):
         name_to_info = dict(
                 (intern(info_name.lower()), (info_method, info_value))
@@ -76,26 +76,34 @@ def _add_functionality():
     for cls, info_classes in cls_to_info_cls.iteritems():
         cls.__getattr__ = make_getattr(info_classes)
 
-    # Platform ----------------------------------------------------------------
+    # }}}
+
+    # {{{ Platform
     def platform_repr(self):
         return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.obj_ptr)
 
     Platform.__repr__ = platform_repr
 
-    # Device ------------------------------------------------------------------
+    # }}}
+
+    # {{{ Device
     def device_repr(self):
         return "<pyopencl.Device '%s' at 0x%x>" % (self.name, self.obj_ptr)
 
     Device.__repr__ = device_repr
 
-    # Context -----------------------------------------------------------------
+    # }}}
+
+    # {{{ Context
     def context_repr(self):
         return "<pyopencl.Context at 0x%x on %s>" % (self.obj_ptr,
                 ", ".join(repr(dev) for dev in self.devices))
 
     Context.__repr__ = context_repr
 
-    # Program -----------------------------------------------------------------
+    # }}}
+
+    # {{{ Program
     def program_getattr(self, attr):
         try:
             pi_attr = getattr(_cl.program_info, attr.upper())
@@ -134,7 +142,9 @@ def _add_functionality():
     Program.__getattr__ = program_getattr
     Program.build = program_build
 
-    # Event -------------------------------------------------------------------
+    # }}}
+
+    # {{{ Event
     class ProfilingInfoGetter:
         def __init__(self, event):
             self.event = event
@@ -152,12 +162,12 @@ def _add_functionality():
 
     _cl.Event.profile = property(ProfilingInfoGetter)
 
-    # Kernel ------------------------------------------------------------------
-    def kernel_call(self, queue, global_size, *args, **kwargs):
-        for i, arg in enumerate(args):
-            self.set_arg(i, arg)
+    # }}}
 
+    # {{{ Kernel
+    def kernel_call(self, queue, global_size, *args, **kwargs):
         global_offset = kwargs.pop("global_offset", None)
+        had_local_size = "local_size" in kwargs
         local_size = kwargs.pop("local_size", None)
         wait_for = kwargs.pop("wait_for", None)
 
@@ -166,25 +176,77 @@ def _add_functionality():
                     "Kernel.__call__ recived unexpected keyword arguments: %s"
                     % ", ".join(kwargs.keys()))
 
+        if had_local_size:
+            from warnings import warn
+            warn("The local_size keyword argument is deprecated and will be "
+                    "removed in pyopencl 0.94. Pass the local "
+                    "size as the third positional argument instead.",
+                    DeprecationWarning, stacklevel=2)
+
+        # *args* is empty when an argument-less kernel is called the old way.
+        new_style = bool(args) and isinstance(args[0], (type(None), tuple))
+        if new_style and not had_local_size:
+            local_size, args = args[0], args[1:]
+        elif not had_local_size:
+            from warnings import warn
+            warn("PyOpenCL Warning: There was an API change "
+                    "in Kernel.__call__() in pyopencl 0.92. "
+                    "local_size was moved from keyword argument to third "
+                    "positional argument in pyopencl 0.92. "
+                    "You didn't pass local_size, but you still need to insert "
+                    "'None' as a third argument. "
+                    "Your present usage is deprecated and will stop "
+                    "working in pyopencl 0.94.",
+                    DeprecationWarning, stacklevel=2)
+
+        self.set_args(*args)
+
         return enqueue_nd_range_kernel(queue, self, global_size, local_size,
                 global_offset, wait_for)
 
+    def kernel_set_scalar_arg_dtypes(self, arg_dtypes):
+        arg_type_chars = []
+        # None entries are kept as None; others become numpy.dtype(...).char.
+        for arg_dtype in arg_dtypes:
+            if arg_dtype is None:
+                arg_type_chars.append(None)
+            else:
+                import numpy
+                arg_type_chars.append(numpy.dtype(arg_dtype).char)
+        # Stored on the instance; kernel_set_args reads it to pack() arguments.
+        self._arg_type_chars = arg_type_chars
+
     def kernel_set_args(self, *args):
-        for i, arg in enumerate(args):
-            self.set_arg(i, arg)
+        try:
+            arg_type_chars = self.__dict__["_arg_type_chars"]
+        except KeyError:  # no dtypes registered via set_scalar_arg_dtypes
+            for i, arg in enumerate(args):
+                self.set_arg(i, arg)
+        else:
+            from struct import pack
+            for i, (arg, arg_type_char) in enumerate(
+                    zip(args, arg_type_chars)):  # NOTE(review): zip() stops at the shorter sequence
+                if arg_type_char:  # truthy type char: pass struct.pack() output
+                    self.set_arg(i, pack(arg_type_char, arg))
+                else:  # None (or empty) entry: hand the argument over unchanged
+                    self.set_arg(i, arg)
 
     Kernel.__call__ = kernel_call
+    Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes
     Kernel.set_args = kernel_set_args
 
-    # ImageFormat -------------------------------------------------------------
+    # }}}
+
+    # {{{ ImageFormat
     def image_format_repr(self):
         return "ImageFormat(%s, %s)" % (
                 channel_order.to_string(self.channel_order),
                 channel_type.to_string(self.channel_data_type))
 
     ImageFormat.__repr__ = image_format_repr
+    # }}}
 
-    # Image -------------------------------------------------------------------
+    # {{{ Image
     class ImageInfoGetter:
         def __init__(self, event):
             from warnings import warn
@@ -213,13 +275,17 @@ def _add_functionality():
     _cl.Image.image = property(ImageInfoGetter)
     _cl.Image.shape = property(image_shape)
 
-    # Event -------------------------------------------------------------------
+    # }}}
+
+    # {{{ Event
     def event_wait(self):
         wait_for_events([self])
         return self
 
     Event.wait = event_wait
 
+    # }}}
+
     if _cl.have_gl():
         def gl_object_get_gl_object(self):
             return self.get_gl_object_info()[1]
@@ -232,7 +298,7 @@ _add_functionality()
 
 
 
-# convenience -----------------------------------------------------------------
+# {{{ convenience -------------------------------------------------------------
 def create_some_context(interactive=True):
     try:
         import sys
@@ -248,7 +314,7 @@ def create_some_context(interactive=True):
     elif len(platforms) == 1 or not interactive:
         platform = platforms[0]
     else:
-        print "Choose platform from these choices:"
+        print "Choose platform:"
         for i, pf in enumerate(platforms):
             print "[%d] %s" % (i, pf)
 
@@ -267,7 +333,7 @@ def create_some_context(interactive=True):
     elif len(devices) == 1 or not interactive:
         pass
     else:
-        print "Choose device(s) from these choices:"
+        print "Choose device(s):"
         for i, dev in enumerate(devices):
             print "[%d] %s" % (i, dev)
 
@@ -279,6 +345,7 @@ def create_some_context(interactive=True):
 
     return Context(devices)
 
+# }}}
 
 
 # vim: foldmethod=marker
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index ed683120f87f6be784c0c882e69a2d675224e32d..7bd9e26801f6f40016e4632a98df79087f7f3309 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -26,7 +26,7 @@ class TestCL:
 
     @pytools.test.mark_test.opencl
     def test_get_info(self, platform, device):
-        had_failures = [False]
+        failure_count = [0]
 
         CRASH_QUIRKS = [
                 (("NVIDIA Corporation", "NVIDIA CUDA", 
@@ -69,13 +69,12 @@ class TestCL:
                     try:
                         func(info)
                     except:
-                        print "failed get_info", type(cl_obj), info_name
+                        msg = "failed get_info", type(cl_obj), info_name
 
                         if find_quirk(QUIRKS, cl_obj, info):
-                            print "(known quirk for %s)" % platform.name
+                            msg += ("(known quirk for %s)" % platform.name,)  # keep msg a tuple
                         else:
-                            had_failures[0] = True
-                            raise
+                            failure_count[0] += 1
 
                     if try_attr_form:
                         try:
@@ -86,8 +85,7 @@ class TestCL:
                             if find_quirk(QUIRKS, cl_obj, info):
                                 print "(known quirk for %s)" % platform.name
                             else:
-                                had_failures[0] = True
-                                raise
+                                failure_count[0] += 1
 
         do_test(platform, cl.platform_info)
 
@@ -125,7 +123,7 @@ class TestCL:
         kernel = prg.sum
         do_test(kernel, cl.kernel_info)
 
-        evt = kernel(queue, (n,), a_buf)
+        evt = kernel(queue, (n,), None, a_buf)
         do_test(evt, cl.event_info)
 
         if profiling:
@@ -151,8 +149,12 @@ class TestCL:
             do_test(img, cl.image_info,
                     lambda info: img.get_image_info(info))
 
-        if had_failures[0]:
-            raise RuntimeError("get_info testing had errors")
+        if failure_count[0]:
+            raise RuntimeError(
+                    "get_info testing had %d errors "
+                    "(If you compiled against OpenCL 1.1 but are testing a 1.0 "
+                    "implementation, you can safely ignore this.)"
+                    % failure_count[0])
 
     @pytools.test.mark_test.opencl
     def test_invalid_kernel_names_cause_failures(self):
@@ -201,18 +203,18 @@ class TestCL:
         a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
 
         try:
-            prg.mult(queue, a.shape, a_buf, 2, 3)
+            prg.mult(queue, a.shape, None, a_buf, 2, 3)
             assert False, "PyOpenCL should not accept bare Python types as arguments"
-        except TypeError:
+        except cl.LogicError:
             pass
 
         try:
-            prg.mult(queue, a.shape, a_buf, float(2), 3)
+            prg.mult(queue, a.shape, None, a_buf, float(2), 3)
             assert False, "PyOpenCL should not accept bare Python types as arguments"
-        except TypeError:
+        except cl.LogicError:
             pass
 
-        prg.mult(queue, a.shape, a_buf, numpy.float32(2), numpy.int32(3))
+        prg.mult(queue, a.shape, None, a_buf, numpy.float32(2), numpy.int32(3))
 
         a_result = numpy.empty_like(a)
         cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
@@ -254,7 +256,7 @@ class TestCL:
         samp = cl.Sampler(context, False,
                 cl.addressing_mode.CLAMP,
                 cl.filter_mode.NEAREST)
-        prg.copy_image(queue, a.shape, a_dest, a_img, samp, numpy.int32(a.shape[0]))
+        prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, numpy.int32(a.shape[0]))
 
         a_result = numpy.empty_like(a)
         cl.enqueue_read_buffer(queue, a_dest, a_result, is_blocking=True)