diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index b317135dd2db7c186d658b695d8b78f02e305b12..9977417198b8e3c7b1069716b02ac74410400df1 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -93,6 +93,20 @@ def get_common_hw_inames(kernel, insn_ids):
         set.intersection,
         (get_hw_inames(kernel, id_to_insn[id]) for id in insn_ids))
 
+
+def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel):
+    """Return a copy of *kernel* whose schedule omits every EnterLoop and
+    LeaveLoop item whose iname carries a HardwareParallelTag."""
+    from loopy.kernel.data import HardwareParallelTag
+    def is_hw_loop_item(sched_item):
+        if not isinstance(sched_item, (EnterLoop, LeaveLoop)):
+            return False
+        iname_tag = kernel.iname_to_tag.get(sched_item.iname)
+        return isinstance(iname_tag, HardwareParallelTag)
+
+    return kernel.copy(
+        schedule=[it for it in kernel.schedule if not is_hw_loop_item(it)])
+
 # }}}
 
 
@@ -421,11 +435,15 @@ def augment_domain_for_temporary_promotion(
         new_iname = name_gen("{name}_{mode}_dim_{dim}".
             format(name=orig_temporary.name,
                    mode=mode,
-                   dim=orig_dim + t_idx))
+                   dim=t_idx))
         domain = domain.set_dim_name(
             isl.dim_type.set, orig_dim + t_idx, new_iname)
-        #from loopy.kernel.data import auto
-        #iname_to_tag[new_iname] = auto
+        if orig_temporary.is_local:
+            # If the temporary is tagged local, then loads / stores can be done
+            # in parallel.
+            from loopy.kernel.data import AutoFitLocalIndexTag
+            iname_to_tag[new_iname] = AutoFitLocalIndexTag()
+
         dim_inames.append(new_iname)
 
         # Add size information.
@@ -433,7 +451,7 @@ def augment_domain_for_temporary_promotion(
         domain &= aff[0].le_set(aff[new_iname])
         size = orig_temporary.shape[t_idx]
         from loopy.symbolic import aff_from_expr
-        domain &= aff[new_iname].le_set(aff_from_expr(domain.space, size))
+        domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size))
 
     hw_inames = []
 
@@ -632,17 +650,25 @@ def restore_and_save_temporaries(kernel):
     # }}}
 
     new_iname_to_tag.update(kernel.iname_to_tag)
-    new_temporary_variables = dict(
+    updated_temporary_variables = dict(
         (t.name, t.as_variable()) for t in new_temporaries.values())
-    new_temporary_variables.update(kernel.temporary_variables)
+    updated_temporary_variables.update(kernel.temporary_variables)
 
     kernel = kernel.copy(
         iname_to_tag=new_iname_to_tag,
-        temporary_variables=new_temporary_variables,
+        temporary_variables=updated_temporary_variables,
         instructions=kernel.instructions + new_instructions,
         schedule=new_schedule
         )
 
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
+    # Once assign_automatic_axes() does its job, loops in the schedule
+    # for newly hardware-tagged inames are no longer necessary (and in
+    # fact illegal), so remove them.
+    kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel)
+
     return kernel
 
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index b570cf4ab7e4ed543213b73716b6271e61e6b69b..428f333f660c1d0965a50f0af1d7c2cd907395a2 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -615,7 +615,9 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         from genpy import Assign, Comment, Line
 
         def alloc_nbytes(tv):
-            return tv.dtype.numpy_dtype.itemsize
+            from six.moves import reduce
+            from operator import mul
+            return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1)
 
         from loopy.kernel.data import temp_var_scope
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 26f4f2faf3b83a24932cad0ac3941696ed3e2636..4b6dc898f086ac6f33da60d367c3b333dfa1806d 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1136,6 +1136,46 @@ def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
+def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
+    """Generate code for a kernel with a local-scope temporary, expect two
+    device programs, and compare results against the untransformed kernel."""
+    from loopy.preprocess import preprocess_kernel
+    from loopy.schedule import get_one_scheduled_kernel
+
+    ctx = ctx_factory()
+
+    ref_knl = lp.make_kernel(
+            "{ [i,k]: 0<=i<n and 0<=k<3 }",
+            """
+            <> t_local[i % 8,k] = i % 8
+            c[k,i] = a[k,i+1]
+            out[k,i] = c[k,i] + t_local[i % 8,k]
+            """)
+    ref_knl = lp.add_and_infer_dtypes(
+            ref_knl,
+            dict(a=np.float32, c=np.float32, out=np.float32, n=np.int32))
+    ref_knl = lp.set_temporary_scope(ref_knl, "t_local", "local")
+
+    # Split i by 8, tagging the outer/inner inames "g.0"/"l.0"; ref_knl
+    # stays as the kernel before this transformation.
+    knl = lp.split_iname(ref_knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
+
+    # Preprocess and schedule explicitly, then print the scheduled kernel.
+    knl = get_one_scheduled_kernel(preprocess_kernel(knl))
+    print(knl)
+
+    codegen_result = lp.generate_code_v2(knl)
+    assert len(codegen_result.device_programs) == 2
+
+    # Show the generated device and host code in the test output.
+    print(codegen_result.device_code())
+    print(codegen_result.host_code())
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters={"n": 8})
+
+
 def test_global_temporary(ctx_factory):
     ctx = ctx_factory()