diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 015c82dd1fa5f81665f062f974149c2e93a324a9..0ce3db81761cdc9b468bc7b964da484058807705 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -80,7 +80,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
     import pyopencl.array as cl_array
 
     from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \
-            TemporaryVariable, ConstantArg
+            TemporaryVariable, ConstantArg, AddressSpace
 
     from pymbolic import evaluate
 
@@ -110,6 +110,15 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
 
         elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \
                 or arg.arg_class is ConstantArg:
+
+            if arg.address_space == AddressSpace.LOCAL:
+                # generally local kernel arguments are used as dynamically sized
+                # memory but you can't pass data from the host to the local arg,
+                # so there are no "reference" local arguments
+                ref_args[arg.name] = None
+                ref_arg_data.append(None)
+                continue
+
             if arg.shape is None or any(saxis is None for saxis in arg.shape):
                 raise LoopyError("array '%s' needs known shape to use automatic "
                         "testing" % arg.name)
@@ -197,7 +206,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
     import pyopencl.array as cl_array
 
     from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\
-            TemporaryVariable, ConstantArg
+            TemporaryVariable, ConstantArg, AddressSpace
 
     from pymbolic import evaluate
 
@@ -232,6 +241,11 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
 
         elif arg.arg_class is ArrayArg or\
                 arg.arg_class is ConstantArg:
+
+            if arg.address_space == AddressSpace.LOCAL:
+                # no host data: the invoker allocates the LocalMemory
+                continue
+
             shape = evaluate(arg.unvec_shape, parameters)
             strides = evaluate(arg.unvec_strides, parameters)
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..ebb43520b08842c0c9df5bf8a6b3eb4150584a2e 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -51,6 +51,10 @@ class ImplementedDataInfo(ImmutableRecord):
 
     .. attribute:: arg_class
 
+    .. attribute:: address_space
+
+        The address space of the array. May be *None* for non-array arguments.
+
     .. attribute:: base_name
 
         The user-facing name of the underlying array.
@@ -86,7 +90,7 @@ class ImplementedDataInfo(ImmutableRecord):
             unvec_shape=None, unvec_strides=None,
             offset_for_name=None, stride_for_name_and_axis=None,
             allows_offset=None,
-            is_written=None):
+            is_written=None, address_space=None):
 
         from loopy.types import LoopyType
         assert isinstance(dtype, LoopyType)
@@ -103,7 +107,8 @@ class ImplementedDataInfo(ImmutableRecord):
                 offset_for_name=offset_for_name,
                 stride_for_name_and_axis=stride_for_name_and_axis,
                 allows_offset=allows_offset,
-                is_written=is_written)
+                is_written=is_written,
+                address_space=address_space)
 
 # }}}
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 6b0033808c616829e60615b92849fa6353751a82..d97d0fc73a24fe2f83c0c90b1536402d924a3323 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -978,7 +978,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     @memoize_method
     def global_var_names(self):
         from loopy.kernel.data import AddressSpace
-
         from loopy.kernel.data import ArrayArg
         return (
                 set(
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 186597c64734b54b8d08f0db43b57826d79f9567..372bf6eca36d04c1c44b391a41742e2863d62f7a 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1058,11 +1058,11 @@ class ArrayBase(ImmutableRecord):
                                     full_name, stride_impl_axis),
                                 is_written=False))
 
+                space = getattr(self, 'address_space', None)
                 yield ImplementedDataInfo(
                             target=target,
                             name=full_name,
                             base_name=self.name,
-
                             arg_class=type(self),
                             dtype=dtype,
                             shape=shape,
@@ -1070,8 +1070,8 @@ class ArrayBase(ImmutableRecord):
                             unvec_shape=unvec_shape,
                             unvec_strides=tuple(unvec_strides),
                             allows_offset=bool(self.offset),
-
-                            is_written=is_written)
+                            is_written=is_written,
+                            address_space=space)
 
                 import loopy as lp
 
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 3cdf20577bf995b351f90615dd18f7bd0681be0b..7aade780c725d5e2c45a6f64047f8c7fecb89b54 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -364,7 +364,7 @@ class ExecutionWrapperGeneratorBase(object):
             self, gen, kernel, implemented_data_info, options):
         import loopy as lp
 
-        from loopy.kernel.data import KernelArgument
+        from loopy.kernel.data import KernelArgument, AddressSpace
         from loopy.kernel.array import ArrayBase
         from loopy.symbolic import StringifyMapper
         from loopy.types import NumpyType
@@ -463,104 +463,121 @@ class ExecutionWrapperGeneratorBase(object):
                     gen("if True:")
 
                 with Indentation(gen):
-                    gen("if %s.dtype != %s:"
-                            % (arg.name, self.python_dtype_str(
-                                kernel_arg.dtype.numpy_dtype)))
-                    with Indentation(gen):
-                        gen("raise TypeError(\"dtype mismatch on argument '%s' "
-                                "(got: %%s, expected: %s)\" %% %s.dtype)"
-                                % (arg.name, arg.dtype, arg.name))
-
-                    # {{{ generate shape checking code
-
-                    def strify_allowing_none(shape_axis):
-                        if shape_axis is None:
-                            return "None"
-                        else:
-                            return strify(shape_axis)
-
-                    def strify_tuple(t):
-                        if len(t) == 0:
-                            return "()"
-                        else:
-                            return "(%s,)" % ", ".join(
-                                    strify_allowing_none(sa)
-                                    for sa in t)
-
-                    shape_mismatch_msg = (
-                            "raise TypeError(\"shape mismatch on argument '%s' "
-                            "(got: %%s, expected: %%s)\" "
-                            "%% (%s.shape, %s))"
-                            % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
-
-                    if kernel_arg.shape is None:
-                        pass
-
-                    elif any(shape_axis is None for shape_axis in kernel_arg.shape):
-                        gen("if len(%s.shape) != %s:"
-                                % (arg.name, len(arg.unvec_shape)))
+                    # check for local memory
+                    if arg.address_space == AddressSpace.LOCAL:
+                        from numpy import prod
+                        # simply check that the argument size is sufficient
+                        expected_size = prod(arg.shape) * arg.dtype.itemsize
+                        gen("if %s.size < %d:" % (arg.name, expected_size))
+                        with Indentation(gen):
+                            gen("raise TypeError(\"size mismatch on local argument "
+                                "'%s' (got: %%d bytes, expected: %d bytes)\" %% "
+                                "%s.size)" % (
+                                    arg.name, expected_size, arg.name))
+                    else:
+
+                        gen("if %s.dtype != %s:"
+                                % (arg.name, self.python_dtype_str(
+                                    kernel_arg.dtype.numpy_dtype)))
                         with Indentation(gen):
-                            gen(shape_mismatch_msg)
+                            gen("raise TypeError(\"dtype mismatch on argument '%s' "
+                                    "(got: %%s, expected: %s)\" %% %s.dtype)"
+                                    % (arg.name, arg.dtype, arg.name))
 
-                        for i, shape_axis in enumerate(arg.unvec_shape):
+                        # {{{ generate shape checking code
+
+                        def strify_allowing_none(shape_axis):
                             if shape_axis is None:
-                                continue
+                                return "None"
+                            else:
+                                return strify(shape_axis)
 
-                            gen("if %s.shape[%d] != %s:"
-                                    % (arg.name, i, strify(shape_axis)))
+                        def strify_tuple(t):
+                            if len(t) == 0:
+                                return "()"
+                            else:
+                                return "(%s,)" % ", ".join(
+                                        strify_allowing_none(sa)
+                                        for sa in t)
+
+                        shape_mismatch_msg = (
+                                "raise TypeError(\"shape mismatch on argument '%s' "
+                                "(got: %%s, expected: %%s)\" "
+                                "%% (%s.shape, %s))"
+                                % (arg.name, arg.name, strify_tuple(
+                                    arg.unvec_shape)))
+
+                        if kernel_arg.shape is None:
+                            pass
+
+                        elif any(shape_axis is None
+                                for shape_axis in kernel_arg.shape):
+                            gen("if len(%s.shape) != %s:"
+                                    % (arg.name, len(arg.unvec_shape)))
                             with Indentation(gen):
                                 gen(shape_mismatch_msg)
 
-                    else:  # not None, no Nones in tuple
-                        gen("if %s.shape != %s:"
-                                % (arg.name, strify(arg.unvec_shape)))
-                        with Indentation(gen):
-                            gen(shape_mismatch_msg)
+                            for i, shape_axis in enumerate(arg.unvec_shape):
+                                if shape_axis is None:
+                                    continue
 
-                    # }}}
+                                gen("if %s.shape[%d] != %s:"
+                                        % (arg.name, i, strify(shape_axis)))
+                                with Indentation(gen):
+                                    gen(shape_mismatch_msg)
 
-                    if arg.unvec_strides and kernel_arg.dim_tags:
-                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
-                        sym_strides = tuple(
-                                itemsize*s_i for s_i in arg.unvec_strides)
+                        else:  # not None, no Nones in tuple
+                            gen("if %s.shape != %s:"
+                                    % (arg.name, strify(arg.unvec_shape)))
+                            with Indentation(gen):
+                                gen(shape_mismatch_msg)
 
-                        ndim = len(arg.unvec_shape)
-                        shape = ["_lpy_shape_%d" % i for i in range(ndim)]
-                        strides = ["_lpy_stride_%d" % i for i in range(ndim)]
+                        # }}}
 
-                        gen("(%s,) = %s.shape" % (", ".join(shape), arg.name))
-                        gen("(%s,) = %s.strides" % (", ".join(strides), arg.name))
+                        if arg.unvec_strides and kernel_arg.dim_tags:
+                            itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+                            sym_strides = tuple(
+                                    itemsize*s_i for s_i in arg.unvec_strides)
 
-                        gen("if not %s:"
-                                % self.get_strides_check_expr(
-                                    shape, strides,
-                                    (strify(s) for s in sym_strides)))
-                        with Indentation(gen):
-                            gen("_lpy_got = tuple(stride "
-                                    "for (dim, stride) in zip(%s.shape, %s.strides) "
-                                    "if dim > 1)"
-                                    % (arg.name, arg.name))
-                            gen("_lpy_expected = tuple(stride "
-                                    "for (dim, stride) in zip(%s.shape, %s) "
-                                    "if dim > 1)"
-                                    % (arg.name, strify_tuple(sym_strides)))
-
-                            gen("raise TypeError(\"strides mismatch on "
-                                    "argument '%s' "
-                                    "(after removing unit length dims, "
-                                    "got: %%s, expected: %%s)\" "
-                                    "%% (_lpy_got, _lpy_expected))"
-                                    % arg.name)
-
-                    if not arg.allows_offset:
-                        gen("if hasattr(%s, 'offset') and %s.offset:" % (
-                                arg.name, arg.name))
-                        with Indentation(gen):
-                            gen("raise ValueError(\"Argument '%s' does not "
-                                    "allow arrays with offsets. Try passing "
-                                    "default_offset=loopy.auto to make_kernel()."
-                                    "\")" % arg.name)
-                            gen("")
+                            ndim = len(arg.unvec_shape)
+                            shape = ["_lpy_shape_%d" % i for i in range(ndim)]
+                            strides = ["_lpy_stride_%d" % i for i in range(ndim)]
+
+                            gen("(%s,) = %s.shape" % (", ".join(shape), arg.name))
+                            gen("(%s,) = %s.strides" % (
+                                ", ".join(strides), arg.name))
+
+                            gen("if not %s:"
+                                    % self.get_strides_check_expr(
+                                        shape, strides,
+                                        (strify(s) for s in sym_strides)))
+                            with Indentation(gen):
+                                gen("_lpy_got = tuple(stride "
+                                        "for (dim, stride) in "
+                                        "zip(%s.shape, %s.strides) "
+                                        "if dim > 1)"
+                                        % (arg.name, arg.name))
+                                gen("_lpy_expected = tuple(stride "
+                                        "for (dim, stride) in zip(%s.shape, %s) "
+                                        "if dim > 1)"
+                                        % (arg.name, strify_tuple(sym_strides)))
+
+                                gen("raise TypeError(\"strides mismatch on "
+                                        "argument '%s' "
+                                        "(after removing unit length dims, "
+                                        "got: %%s, expected: %%s)\" "
+                                        "%% (_lpy_got, _lpy_expected))"
+                                        % arg.name)
+
+                        if not arg.allows_offset:
+                            gen("if hasattr(%s, 'offset') and %s.offset:" % (
+                                    arg.name, arg.name))
+                            with Indentation(gen):
+                                gen("raise ValueError(\"Argument '%s' does not "
+                                        "allow arrays with offsets. Try passing "
+                                        "default_offset=loopy.auto to make_kernel()."
+                                        "\")" % arg.name)
+                                gen("")
 
             # }}}
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 27be619870de6939cf3de06751dea65a3fd558c0..9a0f859417ecaed20a4f013934311ba1a4197663 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -63,8 +63,14 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     # {{{ handle non-numpy args
 
     def handle_non_numpy_arg(self, gen, arg):
+        from loopy.kernel.data import AddressSpace
+        is_local = arg.address_space == AddressSpace.LOCAL
         gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name)
         with Indentation(gen):
+            if is_local:
+                gen("raise Exception('Cannot pass numpy data directly to a "
+                    "__local argument.')")
+
             gen("# synchronous, nothing to worry about")
             gen("%s = _lpy_cl_array.to_device("
                     "queue, %s, allocator=allocator)"
@@ -72,7 +78,12 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
             gen("_lpy_encountered_numpy = True")
         gen("elif %s is not None:" % arg.name)
         with Indentation(gen):
-            gen("_lpy_encountered_dev = True")
+            if is_local:
+                gen("assert isinstance(%s, _lpy_cl.LocalMemory), 'Arguments with "
+                    "local scope must either be None or an instance of a "
+                    "pyopencl.LocalMemory object.'" % arg.name)
+            else:
+                gen("_lpy_encountered_dev = True")
 
         gen("")
 
@@ -85,6 +96,15 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         Handle allocation of non-specified arguements for pyopencl execution
         """
         from pymbolic import var
+        from loopy.kernel.data import AddressSpace
+
+        if arg.address_space == AddressSpace.LOCAL:
+            # handle local argument allocations
+            from numpy import prod
+            gen('# create a properly sized LocalMemory object')
+            gen('%s = _lpy_cl.LocalMemory(%d)' % (
+                arg.name, prod(arg.shape) * arg.dtype.itemsize))
+            return
 
         num_axes = len(arg.strides)
         for i in range(num_axes):
@@ -192,7 +212,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     def generate_output_handler(
             self, gen, options, kernel, implemented_data_info):
 
-        from loopy.kernel.data import KernelArgument
+        from loopy.kernel.data import KernelArgument, AddressSpace
 
         if not options.no_numpy:
             gen("if out_host is None and (_lpy_encountered_numpy "
@@ -206,6 +226,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                 for arg in implemented_data_info:
                     if not issubclass(arg.arg_class, KernelArgument):
                         continue
+                    elif arg.address_space == AddressSpace.LOCAL:
+                        # local memory doesn't have a .get()
+                        continue
 
                     is_written = arg.base_name in kernel.get_written_variables()
                     if is_written:
@@ -236,7 +259,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
         gen.add_to_preamble(codegen_result.host_code())
 
     def get_arg_pass(self, arg):
-        return "%s.base_data" % arg.name
+        from loopy.kernel.data import AddressSpace
+        is_local = arg.address_space == AddressSpace.LOCAL
+        return "%s%s" % (arg.name, '.base_data' if not is_local else '')
 
 # }}}
 
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index d4dcb3701f4f23a5b1c66b1559bf6c4879425902..b695c1327278e2e8663e16b6bd8a2a72d62bf63c 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -336,9 +336,10 @@ class DifferentiationContext(object):
 
         if var_name in self.kernel.arg_dict:
             self.new_args.append(
-                lp.GlobalArg(
+                lp.ArrayArg(
                     new_var_name,
                     arg.dtype,
+                    address_space=self.kernel.arg_dict[var_name].address_space,
                     shape=shape,
                     dim_tags=dim_tags,
                 ))
diff --git a/test/test_loopy.py b/test/test_loopy.py
index accf9c1dff5a1f660871dd63d6af3337aced6490..3a49429d3f3bd9903157efee4398c4f6608431b6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2872,6 +2872,66 @@ def test_half_complex_conditional(ctx_factory):
     knl(queue)
 
 
+def test_local_arg_execution(ctx_factory):
+    ctx = ctx_factory()
+    from loopy.kernel.data import AddressSpace
+    local = AddressSpace.LOCAL
+
+    # simple case: 'tmp' is a caller-supplied workspace array in local memory
+    knl = lp.make_kernel(
+           "{[i,i0]: 0 <= i,i0 < 10}",
+           """
+                tmp[i] = i {id=init, dep=*}
+                ... lbarrier {id=barrier, mem_kind=local, dep=init}
+                out[i0] = tmp[(i0 + 1) % 10] {id=set, dep=barrier, nosync=init}
+           """,
+           [lp.ArrayArg('tmp', address_space=local, shape=(10,),
+                        dtype=np.int32),
+            lp.GlobalArg('out', shape=(10,), dtype=np.int32)]
+           )
+
+    # keep the unsplit kernel as reference; split 'i' by 4, inner tag 'l.0'
+    ref_knl = knl
+    knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0')
+    lp.auto_test_vs_ref(ref_knl, ctx, knl)
+
+    # call the kernel directly, passing a pyopencl.LocalMemory for 'tmp'
+    from pytools import product
+    nbytes = product(si for si in knl.arg_dict['tmp'].shape) * \
+        knl.arg_dict['tmp'].dtype.itemsize
+    from pyopencl import LocalMemory
+    queue = cl.CommandQueue(ctx)
+    tmp = LocalMemory(nbytes)
+    knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32))
+    # a LocalMemory one byte short of nbytes must be rejected with TypeError
+    tmp = LocalMemory(nbytes - 1)
+    with pytest.raises(TypeError):
+        knl(queue, tmp=tmp, out=np.zeros(10, dtype=np.int32))
+
+    # repeat the reference comparison with two local-memory arguments
+    knl = lp.make_kernel(
+            "{[i,i0]: 0 <= i,i0 < 10}",
+            """
+            for i
+                tmp[i] = i {id=init, dep=*}
+                tmp2[i] = i + 1 {id=init2, dep=*}
+            end
+            ... lbarrier {id=barrier, mem_kind=local, dep=init*}
+            for i0
+                out[i0] = tmp[tmp2[i0] % 10] {id=set, dep=barrier, nosync=init*}
+            end
+            """,
+            [lp.ArrayArg('tmp', shape=(10,), dtype=np.int32, address_space=local),
+             lp.ArrayArg('tmp2', shape=(10,), dtype=np.int32, address_space=local),
+             lp.GlobalArg('out', shape=(10,), dtype=np.int32)]
+           )
+
+    # keep the unsplit kernel as reference; split 'i' by 4, inner tag 'l.0'
+    ref_knl = knl
+    knl = lp.split_iname(knl, 'i', 4, inner_tag='l.0')
+    lp.auto_test_vs_ref(ref_knl, ctx, knl)
+
+
 def test_dep_cycle_printing_and_error():
     # https://gitlab.tiker.net/inducer/loopy/issues/140
     # This kernel has two dep cycles.