diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index 5609fc25328ed5b0a243eaa8d33c4d1aa6b3eddc..4a07b63330747aa69d7ed498e004d60b7c312a7b 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -114,6 +114,8 @@ Finishing up
 
 .. autofunction:: get_one_scheduled_kernel
 
+.. autofunction:: save_and_reload_temporaries
+
 .. autoclass:: GeneratedProgram
 .. autoclass:: CodeGenerationResult
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index ec9a09ae6a0766081db51587f83e95a7a2d992fd..0024b19153e27ca4331edd742554441107d175d9 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -922,6 +922,8 @@ expression being assigned.
     ...     """)
     >>> evt, (out1, out2) = knl(queue, a=x_vec_dev)
 
+.. _local_temporaries:
+
 Temporaries in local memory
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1064,6 +1066,170 @@ Generic Precomputation
 
 .. }}}
 
+
+.. _synchronization:
+
+Synchronization
+---------------
+
+.. {{{
+
+In OpenCL, writes are not generally guaranteed to be immediately visible to
+other work items. In order to ensure that memory is consistent across work
+items, some sort of synchronization operation is used.
+
+:mod:`loopy` supports synchronization in the form of *barriers* or *atomic
+operations*.
+
+Barriers
+~~~~~~~~
+
+Prior to code generation, :mod:`loopy` performs a check to see that every memory
+access is free of dependencies requiring a barrier. A memory access dependency
+that exists across multiple work items requires a barrier if it involves at
+least one write operation.
+
+:mod:`loopy` supports two kinds of barriers:
+
+* *Local barriers* ensure consistency of local memory accesses to items within
+  *the same* work group. As in OpenCL, all work items in the group are required
+  to wait until everyone has reached the barrier instruction before continuing.
+
+* *Global barriers* ensure consistency of *global* memory accesses across *all*
+  work groups. Note that there is no exact equivalent in OpenCL. All work items
+  across all work groups are required to wait until everyone has reached the
+  barrier instruction before continuing.
+
+By default, :mod:`loopy` inserts local barriers between two instructions when it
+detects that a dependency involving local memory may occur across work items. To
+see this in action, take a look at the section on :ref:`local_temporaries`.
+
+In contrast, :mod:`loopy` will *not* insert global barriers
+automatically. Consider the following kernel, which attempts to rotate its input
+to the right by 1:
+
+.. doctest::
+
+   >>> knl = lp.make_kernel(
+   ...     "[n] -> {[i] : 0<=i<n}",
+   ...     """
+   ...     for i
+   ...        <>tmp = arr[i] {id=maketmp,dep=*}
+   ...        arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp}
+   ...     end
+   ...     """,
+   ...      [
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+   ...          "...",
+   ...      ],
+   ...     name="rotate_v1",
+   ...     assumptions="n mod 16 = 0")
+   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> cgr = lp.generate_code_v2(knl)
+   Traceback (most recent call last):
+   ...
+   MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
+
+Because of the write-after-read dependency in global memory, a global barrier
+needs to be inserted. This can be accomplished with a ``... gbarrier``
+instruction. Note that :mod:`loopy` implements global barriers by splitting the
+kernel into multiple device-side kernels, so that the resulting code will
+contain more than one kernel.
+
+.. doctest::
+
+   >>> knl = lp.make_kernel(
+   ...     "[n] -> {[i] : 0<=i<n}",
+   ...     """
+   ...     for i
+   ...        <>tmp = arr[i] {id=maketmp,dep=*}
+   ...        ... gbarrier {id=bar,dep=*maketmp}
+   ...        arr[(i + 1) % n] = tmp {id=rotate,dep=*bar}
+   ...     end
+   ...     """,
+   ...      [
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
+   ...          "...",
+   ...      ],
+   ...     name="rotate_v2",
+   ...     assumptions="n mod 16 = 0")
+   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> cgr = lp.generate_code_v2(knl)
+   >>> print(cgr.device_code())
+   #define lid(N) ((int) get_local_id(N))
+   #define gid(N) ((int) get_group_id(N))
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = arr[16 * gid(0) + lid(0)];
+   }
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n)
+   {
+     int tmp;
+   <BLANKLINE>
+     arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+   }
+
+Note that we are not done yet. The problem is that while ``tmp`` is assigned in
+the first kernel, the assignment of ``tmp`` is not saved for the second
+kernel. :mod:`loopy` provides a function called
+:func:`loopy.save_and_reload_temporaries` for the purpose of handling the
+situation of saving and restoring temporary values across global barriers. In
+order to use this function the kernel must be preprocessed and scheduled first,
+the latter of which is handled by :func:`loopy.get_one_scheduled_kernel`.
+
+.. doctest::
+
+   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+   >>> knl = lp.save_and_reload_temporaries(knl)
+   >>> knl = lp.get_one_scheduled_kernel(knl)
+   >>> cgr = lp.generate_code_v2(knl)
+   >>> print(cgr.device_code())
+   #define lid(N) ((int) get_local_id(N))
+   #define gid(N) ((int) get_group_id(N))
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = arr[16 * gid(0) + lid(0)];
+     tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
+   }
+   <BLANKLINE>
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   {
+     int tmp;
+   <BLANKLINE>
+     tmp = tmp_save_slot[16 * gid(0) + lid(0)];
+     arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
+   }
+   >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True)
+   >>> print(out)
+   [15  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
+
+Atomic operations
+~~~~~~~~~~~~~~~~~
+
+:mod:`loopy` supports atomic operations. To use them, both the data on which the
+atomic operations work as well as the operations themselves must be suitably
+tagged, as in the following example::
+
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
+            [
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                lp.GlobalArg("a", dtype, shape=lp.auto),
+                "..."
+                ],
+            assumptions="n>0")
+
+.. }}}
+
 .. _more-complicated-programs:
 
 More complicated programs
@@ -1105,24 +1271,6 @@ Conditionals
 Snippets of C
 ~~~~~~~~~~~~~
 
-Atomic operations
-~~~~~~~~~~~~~~~~~
-
-Loopy supports atomic operations. To use them, both the data on which the
-atomic operations work as well as the operations themselves must be
-suitably tagged, as in the following example::
-
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
-            [
-                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
-                lp.GlobalArg("a", dtype, shape=lp.auto),
-                "..."
-                ],
-            assumptions="n>0")
-
 .. }}}
 
 Common Problems
@@ -1328,7 +1476,7 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
     >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
     >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
     >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
-    >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
+    >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
     524288
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 110652cf75d467ceb473d4997142f4dabe3e763b..6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -106,6 +106,7 @@ from loopy.transform.padding import (
 from loopy.transform.ilp import realize_ilp
 from loopy.transform.batch import to_batched
 from loopy.transform.parameter import assume, fix_parameters
+from loopy.transform.save import save_and_reload_temporaries
 
 # }}}
 
@@ -206,6 +207,8 @@ __all__ = [
 
         "assume", "fix_parameters",
 
+        "save_and_reload_temporaries",
+
         # }}}
 
         "get_dot_dependency_graph",
@@ -258,7 +261,6 @@ __all__ = [
         # }}}
         ]
 
-
 # }}}
 
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index e75a95dec781317389b06aeaf2fbdbcbcc7f7bf6..55d4560be0849511b0e60ae7382053ce7a97912d 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -150,8 +150,15 @@ def generate_code_for_sched_index(codegen_state, sched_index):
         return func(codegen_state, sched_index)
 
     elif isinstance(sched_item, Barrier):
-        return codegen_state.ast_builder.emit_barrier(
-                sched_item.kind, sched_item.comment)
+        if codegen_state.is_generating_device_code:
+            return codegen_state.ast_builder.emit_barrier(
+                    sched_item.kind, sched_item.comment)
+        from loopy.codegen.result import CodeGenerationResult
+        return CodeGenerationResult(
+                host_program=None,
+                device_programs=[],
+                implemented_domains={},
+                implemented_data_info=codegen_state.implemented_data_info)
 
     elif isinstance(sched_item, RunInstruction):
         insn = kernel.id_to_insn[sched_item.insn_id]
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 5701bf77254277501e86e32cdef0e01edaa90b29..e71a88886cd2c244211f8f39dca3745c9c6ebc80 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1056,6 +1056,19 @@ class LoopKernel(RecordWithoutPickling):
 
     # }}}
 
+    # {{{ nosync sets
+
+    @memoize_method
+    def get_nosync_set(self, insn_id, scope):
+        assert scope in ("local", "global")
+
+        return frozenset(
+            insn_id
+            for insn_id, nosync_scope in self.id_to_insn[insn_id].no_sync_with
+            if nosync_scope == scope or nosync_scope == "any")
+
+    # }}}
+
     # {{{ pretty-printing
 
     def stringify(self, what=None, with_dependencies=False):
@@ -1213,7 +1226,9 @@ class LoopKernel(RecordWithoutPickling):
                     options.append(
                             "conflicts=%s" % ":".join(insn.conflicts_with_groups))
                 if insn.no_sync_with:
-                    options.append("no_sync_with=%s" % ":".join(insn.no_sync_with))
+                    # FIXME: Find a syntax to express scopes.
+                    options.append("no_sync_with=%s" % ":".join(id for id, _ in
+                                                                insn.no_sync_with))
 
                 if lhs:
                     core = "%s <- %s" % (
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index ab3035be0388877af12d71dd15b7dbb522c7b84e..6c5491384d4fc37dc48604aa52753d11ac10fc55 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -235,9 +235,11 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
                 raise LoopyError("'nosync' option may not be specified "
                         "in a 'with' block")
 
+            # TODO: Come up with a syntax that allows the user to express
+            # different synchronization scopes.
             result["no_sync_with"] = result["no_sync_with"].union(frozenset(
-                    intern(dep.strip()) for dep in opt_value.split(":")
-                    if dep.strip()))
+                    (intern(dep.strip()), "any")
+                    for dep in opt_value.split(":") if dep.strip()))
 
         elif opt_key == "nosync_query" and opt_value is not None:
             if is_with_block:
@@ -246,8 +248,10 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
 
             from loopy.match import parse_match
             match = parse_match(opt_value)
+            # TODO: Come up with a syntax that allows the user to express
+            # different synchronization scopes.
             result["no_sync_with"] = result["no_sync_with"].union(
-                    frozenset([match]))
+                    frozenset([(match, "any")]))
 
         elif opt_key == "groups" and opt_value is not None:
             result["groups"] = frozenset(
@@ -1462,8 +1466,11 @@ def resolve_dependencies(knl):
     for insn in knl.instructions:
         new_insns.append(insn.copy(
                     depends_on=_resolve_dependencies(knl, insn, insn.depends_on),
-                    no_sync_with=_resolve_dependencies(
-                        knl, insn, insn.no_sync_with),
+                    no_sync_with=frozenset(
+                        (resolved_insn_id, nosync_scope)
+                        for nosync_dep, nosync_scope in insn.no_sync_with
+                        for resolved_insn_id in
+                        _resolve_dependencies(knl, insn, nosync_dep)),
                     ))
 
     return knl.copy(instructions=new_insns)
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index b306e6e6e25fbd3f661a4f271345129a82ea93f1..417ff9dd37c682e599e0586ec61f02169e9fe7aa 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -91,9 +91,17 @@ class InstructionBase(Record):
 
     .. attribute:: no_sync_with
 
-        a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
-        with which no barrier synchronization is necessary, even given the existence
-        of a dependency chain and apparently conflicting access.
+        a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where
+        `insn_id` refers to :attr:`id` of :class:`Instruction` instances
+        and `scope` is one of the following strings:
+
+           - `"local"`
+           - `"global"`
+           - `"any"`.
+
+        This indicates no barrier synchronization is necessary with the given
+        instruction using barriers of type `scope`, even given the existence of
+        a dependency chain and apparently conflicting access.
 
         Note, that :attr:`no_sync_with` allows instruction matching through wildcards
         and match expression, just like :attr:`depends_on`.
@@ -380,7 +388,10 @@ class InstructionBase(Record):
         if self.depends_on:
             result.append("dep="+":".join(self.depends_on))
         if self.no_sync_with:
-            result.append("nosync="+":".join(self.no_sync_with))
+            # TODO: Come up with a syntax to express different kinds of
+            # synchronization scopes.
+            result.append("nosync="+":".join(
+                    insn_id for insn_id, _ in self.no_sync_with))
         if self.groups:
             result.append("groups=%s" % ":".join(self.groups))
         if self.conflicts_with_groups:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f93ea891fd7f1aa420c8c109c31b50fd8305a9e3..6b5488a20bc9d714fb5fde908b559ddebf4b9591 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -459,7 +459,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset([init_id]) | insn.depends_on,
-                no_sync_with=frozenset([init_id]))
+                no_sync_with=frozenset([(init_id, "any")]))
         generated_insns.append(transfer_insn)
 
         def _strip_if_scalar(c):
@@ -517,7 +517,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             istage += 1
 
         new_insn_add_depends_on.add(prev_id)
-        new_insn_add_no_sync_with.add(prev_id)
+        new_insn_add_no_sync_with.add((prev_id, "any"))
         new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)
 
         if nresults == 1:
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 9110c4ac079c614ce6a078bbe986888db3c21422..694312df9768f8a6cf2f91b7d5d85dddbac71063 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -399,6 +399,17 @@ def get_priority_tiers(wanted, priorities):
     for tier in get_priority_tiers(wanted, priorities):
         yield tier
 
+
+def sched_item_to_insn_id(sched_item):
+    # Helper for use in generator expressions, i.e.
+    # (... for insn_id in sched_item_to_insn_id(item) ...)
+    if isinstance(sched_item, RunInstruction):
+        yield sched_item.insn_id
+    elif isinstance(sched_item, Barrier):
+        if (hasattr(sched_item, "originating_insn_id")
+                and sched_item.originating_insn_id is not None):
+            yield sched_item.originating_insn_id
+
 # }}}
 
 
@@ -572,12 +583,37 @@ class SchedulerState(Record):
 
         A :class:`frozenset` of all inames ever entered.
 
+    .. attribute:: enclosing_subkernel_inames
+
+        The inames of the last entered subkernel
+
     .. attribute:: schedule
 
     .. attribute:: scheduled_insn_ids
 
     .. attribute:: unscheduled_insn_ids
 
+    .. attribute:: preschedule
+
+        A sequence of schedule items that must be inserted into the
+        schedule, maintaining the same ordering
+
+    .. attribute:: prescheduled_insn_ids
+
+        A :class:`frozenset` of the ids of instructions that were prescheduled
+
+    .. attribute:: prescheduled_inames
+
+        A :class:`frozenset` of the inames that were prescheduled
+
+    .. attribute:: may_schedule_global_barriers
+
+        Whether global barrier scheduling is allowed
+
+    .. attribute:: within_subkernel
+
+        Whether the scheduler is inside a subkernel
+
     .. attribute:: group_insn_counts
 
         A mapping from instruction group names to the number of instructions
@@ -619,6 +655,11 @@ def generate_loop_schedules_internal(
 
     active_inames_set = frozenset(sched_state.active_inames)
 
+    next_preschedule_item = (
+        sched_state.preschedule[0]
+        if len(sched_state.preschedule) > 0
+        else None)
+
     # {{{ decide about debug mode
 
     debug_mode = False
@@ -637,6 +678,10 @@ def generate_loop_schedules_internal(
         print(75*"=")
         print("CURRENT SCHEDULE:")
         print(dump_schedule(sched_state.kernel, sched_state.schedule))
+        if sched_state.preschedule:
+            print(75*"=")
+            print("PRESCHEDULED ITEMS AWAITING SCHEDULING:")
+            print(dump_schedule(sched_state.kernel, sched_state.preschedule))
         #print("boost allowed:", allow_boost)
         print(75*"=")
         print("LOOP NEST MAP (inner: outer):")
@@ -652,6 +697,54 @@ def generate_loop_schedules_internal(
 
     # }}}
 
+    # {{{ see if we have reached the start/end of kernel in the preschedule
+
+    if isinstance(next_preschedule_item, CallKernel):
+        assert sched_state.within_subkernel is False
+        for result in generate_loop_schedules_internal(
+                sched_state.copy(
+                    schedule=sched_state.schedule + (next_preschedule_item,),
+                    preschedule=sched_state.preschedule[1:],
+                    within_subkernel=True,
+                    may_schedule_global_barriers=False,
+                    enclosing_subkernel_inames=sched_state.active_inames),
+                allow_boost=rec_allow_boost,
+                debug=debug):
+            yield result
+
+    if isinstance(next_preschedule_item, ReturnFromKernel):
+        assert sched_state.within_subkernel is True
+        # Make sure all subkernel inames have finished.
+        if sched_state.active_inames == sched_state.enclosing_subkernel_inames:
+            for result in generate_loop_schedules_internal(
+                    sched_state.copy(
+                        schedule=sched_state.schedule + (next_preschedule_item,),
+                        preschedule=sched_state.preschedule[1:],
+                        within_subkernel=False,
+                        may_schedule_global_barriers=True),
+                    allow_boost=rec_allow_boost,
+                    debug=debug):
+                yield result
+
+    # }}}
+
+    # {{{ see if there are pending local barriers in the preschedule
+
+    # Local barriers do not have associated instructions, so they need to
+    # be handled separately from instructions.
+    if (
+            isinstance(next_preschedule_item, Barrier)
+            and next_preschedule_item.kind == "local"):
+        for result in generate_loop_schedules_internal(
+                    sched_state.copy(
+                        schedule=sched_state.schedule + (next_preschedule_item,),
+                        preschedule=sched_state.preschedule[1:]),
+                    allow_boost=rec_allow_boost,
+                    debug=debug):
+                yield result
+
+    # }}}
+
     # {{{ see if any insns are ready to be scheduled now
 
     # Also take note of insns that have a chance of being schedulable inside
@@ -667,9 +760,16 @@ def generate_loop_schedules_internal(
         # schedule generation order.
         return (insn.priority, len(active_groups & insn.groups), insn.id)
 
-    insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids,
+    insn_ids_to_try = sorted(
+            # Non-prescheduled instructions go first.
+            sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids,
             key=insn_sort_key, reverse=True)
 
+    insn_ids_to_try.extend(
+        insn_id
+        for item in sched_state.preschedule
+        for insn_id in sched_item_to_insn_id(item))
+
     for insn_id in insn_ids_to_try:
         insn = kernel.id_to_insn[insn_id]
 
@@ -705,6 +805,46 @@ def generate_loop_schedules_internal(
                     print("instruction '%s' won't work under inames '%s'"
                             % (format_insn(kernel, insn.id), ",".join(have-want)))
 
+        # {{{ check if scheduling this insn is compatible with preschedule
+
+        if insn_id in sched_state.prescheduled_insn_ids:
+            if isinstance(next_preschedule_item, RunInstruction):
+                next_preschedule_insn_id = next_preschedule_item.insn_id
+            elif (
+                    isinstance(next_preschedule_item, Barrier)
+                    and next_preschedule_item.kind == "global"):
+                assert hasattr(next_preschedule_item, "originating_insn_id")
+                assert next_preschedule_item.originating_insn_id is not None
+                next_preschedule_insn_id = next_preschedule_item.originating_insn_id
+            else:
+                next_preschedule_insn_id = None
+
+            if next_preschedule_insn_id != insn_id:
+                if debug_mode:
+                    print("can't schedule '%s' because another preschedule "
+                          "instruction precedes it" % format_insn(kernel, insn.id))
+                is_ready = False
+
+        # }}}
+
+        # {{{ check if scheduler state allows insn scheduling
+
+        from loopy.kernel.instruction import BarrierInstruction
+        if isinstance(insn, BarrierInstruction) and insn.kind == "global":
+            if not sched_state.may_schedule_global_barriers:
+                if debug_mode:
+                    print("can't schedule '%s' because global barriers are "
+                          "not currently allowed" % format_insn(kernel, insn.id))
+                is_ready = False
+        else:
+            if not sched_state.within_subkernel:
+                if debug_mode:
+                    print("can't schedule '%s' because not within subkernel"
+                          % format_insn(kernel, insn.id))
+                is_ready = False
+
+        # }}}
+
         # {{{ determine group-based readiness
 
         if insn.conflicts_with_groups & active_groups:
@@ -761,6 +901,10 @@ def generate_loop_schedules_internal(
                     unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set,
                     schedule=(
                         sched_state.schedule + (RunInstruction(insn_id=insn.id),)),
+                    preschedule=(
+                        sched_state.preschedule
+                        if insn_id not in sched_state.prescheduled_insn_ids
+                        else sched_state.preschedule[1:]),
                     active_group_counts=new_active_group_counts,
                     uses_of_boostability=(
                         sched_state.uses_of_boostability
@@ -790,7 +934,17 @@ def generate_loop_schedules_internal(
     if last_entered_loop is not None:
         can_leave = True
 
-        if last_entered_loop not in sched_state.breakable_inames:
+        if (
+                last_entered_loop in sched_state.prescheduled_inames
+                and not (
+                    isinstance(next_preschedule_item, LeaveLoop)
+                    and next_preschedule_item.iname == last_entered_loop)):
+            # A prescheduled loop can only be left if the preschedule agrees.
+            if debug_mode:
+                print("cannot leave '%s' because of preschedule constraints"
+                      % last_entered_loop)
+            can_leave = False
+        elif last_entered_loop not in sched_state.breakable_inames:
             # If the iname is not breakable, then check that we've
             # scheduled all the instructions that require it.
 
@@ -857,12 +1011,19 @@ def generate_loop_schedules_internal(
                         break
 
             if can_leave and not debug_mode:
+
                 for sub_sched in generate_loop_schedules_internal(
                         sched_state.copy(
                             schedule=(
                                 sched_state.schedule
                                 + (LeaveLoop(iname=last_entered_loop),)),
-                            active_inames=sched_state.active_inames[:-1]),
+                            active_inames=sched_state.active_inames[:-1],
+                            preschedule=(
+                                sched_state.preschedule
+                                if last_entered_loop
+                                not in sched_state.prescheduled_inames
+                                else sched_state.preschedule[1:]),
+                        ),
                         allow_boost=rec_allow_boost, debug=debug):
                     yield sub_sched
 
@@ -902,6 +1063,38 @@ def generate_loop_schedules_internal(
 
             # {{{ check if scheduling this iname now is allowed/plausible
 
+            if (
+                    iname in sched_state.prescheduled_inames
+                    and not (
+                        isinstance(next_preschedule_item, EnterLoop)
+                        and next_preschedule_item.iname == iname)):
+                if debug_mode:
+                    print("scheduling %s prohibited by preschedule constraints"
+                          % iname)
+                continue
+
+            if (
+                    not sched_state.within_subkernel
+                    and iname not in sched_state.prescheduled_inames):
+                # Avoid messing up some orderings such as picking:
+                #
+                # EnterLoop(temporary.reload)
+                # CallKernel
+                # ...
+                #
+                # instead of
+                #
+                # CallKernel
+                # EnterLoop(temporary.reload)
+                # ...
+                #
+                # This serves a heuristic to catch some bad decisions early, the
+                # scheduler will not allow the first variant regardless.
+                if debug_mode:
+                    print("scheduling '%s' prohibited because we are outside "
+                          "a subkernel" % iname)
+                continue
+
             currently_accessible_inames = (
                     active_inames_set | sched_state.parallel_inames)
             if (
@@ -1063,6 +1256,10 @@ def generate_loop_schedules_internal(
                                 entered_inames=(
                                     sched_state.entered_inames
                                     | frozenset((iname,))),
+                                preschedule=(
+                                    sched_state.preschedule
+                                    if iname not in sched_state.prescheduled_inames
+                                    else sched_state.preschedule[1:]),
                                 ),
                             allow_boost=rec_allow_boost,
                             debug=debug):
@@ -1082,7 +1279,10 @@ def generate_loop_schedules_internal(
         if inp:
             raise ScheduleDebugInput(inp)
 
-    if not sched_state.active_inames and not sched_state.unscheduled_insn_ids:
+    if (
+            not sched_state.active_inames
+            and not sched_state.unscheduled_insn_ids
+            and not sched_state.preschedule):
         # if done, yield result
         debug.log_success(sched_state.schedule)
 
@@ -1210,8 +1410,8 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     if reverse:
         source, target = target, source
 
-    if source.id in target.no_sync_with:
-        return None
+    if source.id in kernel.get_nosync_set(target.id, var_kind):
+        return
 
     # {{{ check that a dependency exists
 
@@ -1310,6 +1510,9 @@ def get_tail_starting_at_last_barrier(schedule, kind):
         elif isinstance(sched_item, (EnterLoop, LeaveLoop)):
             pass
 
+        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+            pass
+
         else:
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
@@ -1323,7 +1526,8 @@ def insn_ids_from_schedule(schedule):
         if isinstance(sched_item, RunInstruction):
             result.append(sched_item.insn_id)
 
-        elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier)):
+        elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel,
+                                     ReturnFromKernel)):
             pass
 
         else:
@@ -1456,8 +1660,22 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
                             source=dep_src_insn_id,
                             reverse=reverse, var_kind=kind)
                     if dep:
-                        issue_barrier(dep=dep)
-                        break
+                        if verify_only:
+                            from loopy.diagnostic import MissingBarrierError
+                            raise MissingBarrierError(
+                                    "Dependency '%s' (for variable '%s') "
+                                    "requires synchronization "
+                                    "by a %s barrier (add a 'no_sync_with' "
+                                    "instruction option to state that no "
+                                    "synchronization is needed)"
+                                    % (
+                                        dep.dep_descr.format(
+                                            tgt=dep.target.id, src=dep.source.id),
+                                        dep.variable,
+                                        kind))
+                        else:
+                            issue_barrier(dep=dep)
+                            break
 
             # }}}
 
@@ -1516,6 +1734,10 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
             result.append(sched_item)
             candidates.add(sched_item.insn_id)
 
+        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+            result.append(sched_item)
+            i += 1
+
         else:
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
@@ -1537,7 +1759,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
 
 def generate_loop_schedules(kernel, debug_args={}):
     from loopy.kernel import kernel_state
-    if kernel.state != kernel_state.PREPROCESSED:
+    if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
@@ -1548,6 +1770,18 @@ def generate_loop_schedules(kernel, debug_args={}):
 
     debug = ScheduleDebugger(**debug_args)
 
+    preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else ()
+
+    prescheduled_inames = set(
+            insn.iname
+            for insn in preschedule
+            if isinstance(insn, EnterLoop))
+
+    prescheduled_insn_ids = set(
+        insn_id
+        for item in preschedule
+        for insn_id in sched_item_to_insn_id(item))
+
     from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag
     ilp_inames = set(
             iname
@@ -1574,14 +1808,22 @@ def generate_loop_schedules(kernel, debug_args={}):
             ilp_inames=ilp_inames,
             vec_inames=vec_inames,
 
+            prescheduled_inames=prescheduled_inames,
+            prescheduled_insn_ids=prescheduled_insn_ids,
+
             # time-varying part
             active_inames=(),
             entered_inames=frozenset(),
+            enclosing_subkernel_inames=(),
 
             schedule=(),
 
             unscheduled_insn_ids=set(insn.id for insn in kernel.instructions),
             scheduled_insn_ids=frozenset(),
+            within_subkernel=kernel.state != kernel_state.SCHEDULED,
+            may_schedule_global_barriers=True,
+
+            preschedule=preschedule,
 
             # ilp and vec are not parallel for the purposes of the scheduler
             parallel_inames=parallel_inames - ilp_inames - vec_inames,
@@ -1639,18 +1881,15 @@ def generate_loop_schedules(kernel, debug_args={}):
 
                 gsize, lsize = kernel.get_grid_size_upper_bounds()
 
-                if gsize or lsize:
+                if (gsize or lsize):
                     if not kernel.options.disable_global_barriers:
                         logger.info("%s: barrier insertion: global" % kernel.name)
-
                         gen_sched = insert_barriers(kernel, gen_sched,
                                 reverse=False, kind="global", verify_only=True)
 
                     logger.info("%s: barrier insertion: local" % kernel.name)
-
                     gen_sched = insert_barriers(kernel, gen_sched,
                             reverse=False, kind="local", verify_only=False)
-
                     logger.info("%s: barrier insertion: done" % kernel.name)
 
                 new_kernel = kernel.copy(
@@ -1659,7 +1898,12 @@ def generate_loop_schedules(kernel, debug_args={}):
 
                 from loopy.schedule.device_mapping import \
                         map_schedule_onto_host_or_device
-                new_kernel = map_schedule_onto_host_or_device(new_kernel)
+                if kernel.state != kernel_state.SCHEDULED:
+                    # Device mapper only gets run once.
+                    new_kernel = map_schedule_onto_host_or_device(new_kernel)
+
+                from loopy.schedule.tools import add_extra_args_to_schedule
+                new_kernel = add_extra_args_to_schedule(new_kernel)
                 yield new_kernel
 
                 debug.start()
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index ca782a3d8ca85ea6250f7c9317ca0947db28d5e8..1a0789c2f61e21e4a0371e2a73195c9771245527 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -23,14 +23,13 @@ THE SOFTWARE.
 """
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel.data import TemporaryVariable, temp_var_scope
-from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem,
-                            EnterLoop, LeaveLoop, ReturnFromKernel,
-                            RunInstruction)
-from pytools import Record, memoize_method
+from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop,
+                            ReturnFromKernel, RunInstruction)
+from loopy.schedule.tools import get_block_boundaries
 
 
 def map_schedule_onto_host_or_device(kernel):
+    # FIXME: Should be idempotent.
     from loopy.kernel import kernel_state
     assert kernel.state == kernel_state.SCHEDULED
 
@@ -53,659 +52,14 @@ def map_schedule_onto_host_or_device(kernel):
         kernel = map_schedule_onto_host_or_device_impl(
                 kernel, device_prog_name_gen)
 
-    return restore_and_save_temporaries(
-        add_extra_args_to_schedule(kernel))
-
-
-# {{{ Schedule / instruction utilities
-
-def get_block_boundaries(schedule):
-    """
-    Return a dictionary mapping indices of
-    :class:`loopy.schedule.BlockBeginItem`s to
-    :class:`loopy.schedule.BlockEndItem`s and vice versa.
-    """
-    block_bounds = {}
-    active_blocks = []
-    for idx, sched_item in enumerate(schedule):
-        if isinstance(sched_item, BeginBlockItem):
-            active_blocks.append(idx)
-        elif isinstance(sched_item, EndBlockItem):
-            start = active_blocks.pop()
-            block_bounds[start] = idx
-            block_bounds[idx] = start
-    return block_bounds
-
-
-def get_hw_inames(kernel, insn):
-    """
-    Return the inames that insn runs in and that are tagged as hardware
-    parallel.
-    """
-    from loopy.kernel.data import HardwareParallelTag
-    return set(iname for iname in kernel.insn_inames(insn)
-        if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag))
-
-
-def get_common_hw_inames(kernel, insn_ids):
-    """
-    Return the common set of hardware parallel tagged inames among
-    the list of instructions.
-    """
-    # Get the list of hardware inames in which the temporary is defined.
-    if len(insn_ids) == 0:
-        return set()
-    return set.intersection(
-        *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids))
-
-
-def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel):
-    from loopy.kernel.data import HardwareParallelTag
-    new_schedule = []
-
-    for item in kernel.schedule:
-        if isinstance(item, (EnterLoop, LeaveLoop)):
-            tag = kernel.iname_to_tag.get(item.iname)
-            if isinstance(tag, HardwareParallelTag):
-                continue
-        new_schedule.append(item)
-
-    return kernel.copy(schedule=new_schedule)
-
-# }}}
-
-
-# {{{ Use / def utilities
-
-def filter_out_subscripts(exprs):
-    """
-    Remove subscripts from expressions in `exprs`.
-    """
-    result = set()
-    from pymbolic.primitives import Subscript
-    for expr in exprs:
-        if isinstance(expr, Subscript):
-            expr = expr.aggregate
-        result.add(expr)
-    return result
-
-
-def filter_items_by_varname(pred, kernel, items):
-    """
-    Keep only the values in `items` whose variable names satisfy `pred`.
-    """
-    from pymbolic.primitives import Subscript, Variable
-    result = set()
-    for item in items:
-        base = item
-        if isinstance(base, Subscript):
-            base = base.aggregate
-        if isinstance(base, Variable):
-            base = base.name
-        if pred(kernel, base):
-            result.add(item)
-    return result
-
-
-from functools import partial
-
-filter_temporaries = partial(filter_items_by_varname,
-    lambda kernel, name: name in kernel.temporary_variables)
-
-filter_scalar_temporaries = partial(filter_items_by_varname,
-    lambda kernel, name: name in kernel.temporary_variables and
-        len(kernel.temporary_variables[name].shape) == 0)
-
-
-def get_use_set(insn, include_subscripts=True):
-    """
-    Return the use-set of the instruction, for liveness analysis.
-    """
-    result = insn.read_dependency_names()
-    if not include_subscripts:
-        result = filter_out_subscripts(result)
-    return result
-
-
-def get_def_set(insn, include_subscripts=True):
-    """
-    Return the def-set of the instruction, for liveness analysis.
-    """
-    result = insn.write_dependency_names()
-    if not include_subscripts:
-        result = filter_out_subscripts(result)
-    return result
-
-
-def get_def_and_use_lists_for_all_temporaries(kernel):
-    """
-    Return a pair `def_lists`, `use_lists` which map temporary variable
-    names to lists of instructions where they are defined or used.
-    """
-    def_lists = dict((t, []) for t in kernel.temporary_variables)
-    use_lists = dict((t, []) for t in kernel.temporary_variables)
-
-    for insn in kernel.instructions:
-        assignees = get_def_set(insn, include_subscripts=False)
-        dependencies = get_use_set(insn, include_subscripts=False)
-
-        from pymbolic.primitives import Variable
-
-        for assignee in assignees:
-            if isinstance(assignee, Variable):
-                assignee = assignee.name
-            if assignee in kernel.temporary_variables:
-                def_lists[assignee].append(insn.id)
-
-        for dep in dependencies:
-            if isinstance(dep, Variable):
-                dep = dep.name
-            if dep in kernel.temporary_variables:
-                use_lists[dep].append(insn.id)
-
-    return def_lists, use_lists
-
-
-def get_temporaries_defined_and_used_in_subrange(
-        kernel, schedule, start_idx, end_idx):
-    defs = set()
-    uses = set()
-
-    for idx in range(start_idx, end_idx + 1):
-        sched_item = schedule[idx]
-        if isinstance(sched_item, RunInstruction):
-            insn = kernel.id_to_insn[sched_item.insn_id]
-            defs.update(
-                filter_temporaries(
-                    kernel, get_def_set(insn)))
-            uses.update(
-                filter_temporaries(
-                    kernel, get_use_set(insn)))
-
-    return defs, uses
-
-# }}}
-
-
-# {{{ Liveness analysis
-
-def compute_live_temporaries(kernel, schedule):
-    """
-    Compute live-in and live-out sets for temporary variables.
-    """
-    live_in = [set() for i in range(len(schedule) + 1)]
-    live_out = [set() for i in range(len(schedule))]
-
-    id_to_insn = kernel.id_to_insn
-    block_bounds = get_block_boundaries(schedule)
-
-    # {{{ Liveness analysis implementation
-
-    def compute_subrange_liveness(start_idx, end_idx):
-        idx = end_idx
-        while start_idx <= idx:
-            sched_item = schedule[idx]
-            if isinstance(sched_item, LeaveLoop):
-                start = block_bounds[idx]
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                compute_subrange_liveness(start + 1, idx - 1)
-                prev_live_in = live_in[start].copy()
-                live_in[start] = live_out[start] = live_in[start + 1]
-                # Propagate live values through the loop.
-                if live_in[start] != prev_live_in:
-                    live_out[idx] |= live_in[start]
-                    live_in[idx] = live_out[idx]
-                    compute_subrange_liveness(start + 1, idx - 1)
-                idx = start - 1
-
-            elif isinstance(sched_item, ReturnFromKernel):
-                start = block_bounds[idx]
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                compute_subrange_liveness(start + 1, idx - 1)
-                live_in[start] = live_out[start] = live_in[start + 1]
-                idx = start - 1
-
-            elif isinstance(sched_item, RunInstruction):
-                live_out[idx] = live_in[idx + 1]
-                insn = id_to_insn[sched_item.insn_id]
-                defs = filter_scalar_temporaries(kernel,
-                    get_def_set(insn, include_subscripts=False))
-                uses = filter_temporaries(kernel,
-                    get_use_set(insn, include_subscripts=False))
-                live_in[idx] = (live_out[idx] - defs) | uses
-                idx -= 1
-
-            elif isinstance(sched_item, Barrier):
-                live_in[idx] = live_out[idx] = live_in[idx + 1]
-                idx -= 1
-            else:
-                raise LoopyError("unexpected type of schedule item: %s"
-                        % type(sched_item).__name__)
-
-    # }}}
-
-    # Compute live variables
-    compute_subrange_liveness(0, len(schedule) - 1)
-    live_in = live_in[:-1]
-
-    if 0:
-        print(kernel)
-        print("Live-in values:")
-        for i, li in enumerate(live_in):
-            print("{}: {}".format(i, ", ".join(li)))
-        print("Live-out values:")
-        for i, lo in enumerate(live_out):
-            print("{}: {}".format(i, ", ".join(lo)))
-
-    # Strip off subscripts.
-    live_in = [filter_out_subscripts(li) for li in live_in]
-    live_out = [filter_out_subscripts(lo) for lo in live_out]
-
-    return live_in, live_out
-
-# }}}
-
-
-# {{{ Temporary promotion
-
-class PromotedTemporary(Record):
-    """
-    .. attribute:: name
-
-        The name of the new temporary.
-
-    .. attribute:: orig_temporary
-
-        The original temporary variable object.
-
-    .. attribute:: hw_inames
-
-        The common list of hw axes that define the original object.
-
-    .. attribute:: shape_prefix
-
-        A list of expressions, to be added in front of the shape
-        of the promoted temporary value
-    """
-
-    @memoize_method
-    def as_variable(self):
-        temporary = self.orig_temporary
-        return TemporaryVariable(
-            name=self.name,
-            dtype=temporary.dtype,
-            scope=temp_var_scope.GLOBAL,
-            shape=self.new_shape)
-
-    @property
-    def new_shape(self):
-        return self.shape_prefix + self.orig_temporary.shape
-
-
-def determine_temporaries_to_promote(kernel, temporaries, name_gen):
-    """
-    For each temporary in the passed list of temporaries, construct a
-    :class:`PromotedTemporary` which describes how the temporary should
-    get promoted into global storage.
-
-    :returns: A :class:`dict` mapping temporary names from `temporaries` to
-              :class:`PromotedTemporary` objects
-    """
-    new_temporaries = {}
-
-    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)
-
-    from loopy.kernel.data import LocalIndexTag
-
-    for temporary in temporaries:
-        temporary = kernel.temporary_variables[temporary]
-        if temporary.scope == temp_var_scope.GLOBAL:
-            # Nothing to be done for global temporaries (I hope)
-            continue
-
-        assert temporary.base_storage is None, \
-            "Cannot promote temporaries with base_storage to global"
-
-        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
-        # is associated with. This is used for determining the shape of the
-        # global storage needed for saving and restoring the temporary across
-        # kernel calls.
-        #
-        # TODO: Make a policy decision about which dimensions to use. Currently,
-        # the code looks at each instruction that defines or uses the temporary,
-        # and takes the common set of hw-parallel tagged inames associated with
-        # these instructions.
-        #
-        # Furthermore, in the case of local temporaries, inames that are tagged
-        # hw-local do not contribute to the global storage shape.
-        hw_inames = get_common_hw_inames(kernel,
-            def_lists[temporary.name] + use_lists[temporary.name])
-
-        # This takes advantage of the fact that g < l in the alphabet :)
-        hw_inames = sorted(hw_inames,
-            key=lambda iname: str(kernel.iname_to_tag[iname]))
-
-        # Calculate the sizes of the dimensions that get added in front for
-        # the global storage of the temporary.
-        shape_prefix = []
-
-        backing_hw_inames = []
-        for iname in hw_inames:
-            tag = kernel.iname_to_tag[iname]
-            is_local_iname = isinstance(tag, LocalIndexTag)
-            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
-                # Restrict shape to that of group inames for locals.
-                continue
-            backing_hw_inames.append(iname)
-            from loopy.isl_helpers import static_max_of_pw_aff
-            from loopy.symbolic import aff_to_expr
-            shape_prefix.append(
-                aff_to_expr(
-                    static_max_of_pw_aff(
-                        kernel.get_iname_bounds(iname).size, False)))
-
-        backing_temporary = PromotedTemporary(
-            name=name_gen(temporary.name),
-            orig_temporary=temporary,
-            shape_prefix=tuple(shape_prefix),
-            hw_inames=backing_hw_inames)
-        new_temporaries[temporary.name] = backing_temporary
-
-    return new_temporaries
-
-# }}}
-
-
-# {{{ Domain augmentation
-
-def augment_domain_for_temporary_promotion(
-        kernel, domain, promoted_temporary, mode, name_gen):
-    """
-    Add new axes to the domain corresponding to the dimensions of
-    `promoted_temporary`.
-    """
-    import islpy as isl
-
-    orig_temporary = promoted_temporary.orig_temporary
-    orig_dim = domain.dim(isl.dim_type.set)
-    dims_to_insert = len(orig_temporary.shape)
-
-    iname_to_tag = {}
-
-    # Add dimension-dependent inames.
-    dim_inames = []
-
-    domain = domain.add(isl.dim_type.set, dims_to_insert)
-    for t_idx in range(len(orig_temporary.shape)):
-        new_iname = name_gen("{name}_{mode}_dim_{dim}".
-            format(name=orig_temporary.name,
-                   mode=mode,
-                   dim=t_idx))
-        domain = domain.set_dim_name(
-            isl.dim_type.set, orig_dim + t_idx, new_iname)
-        if orig_temporary.is_local:
-            # If the temporary is has local scope, then loads / stores can be
-            # done in parallel.
-            from loopy.kernel.data import AutoFitLocalIndexTag
-            iname_to_tag[new_iname] = AutoFitLocalIndexTag()
-
-        dim_inames.append(new_iname)
-
-        # Add size information.
-        aff = isl.affs_from_space(domain.space)
-        domain &= aff[0].le_set(aff[new_iname])
-        size = orig_temporary.shape[t_idx]
-        from loopy.symbolic import aff_from_expr
-        domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size))
-
-    hw_inames = []
-
-    # Add hardware inames duplicates.
-    for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
-        new_iname = name_gen("{name}_{mode}_hw_dim_{dim}".
-            format(name=orig_temporary.name,
-                   mode=mode,
-                   dim=t_idx))
-        hw_inames.append(new_iname)
-        iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname]
-
-    from loopy.isl_helpers import duplicate_axes
-    domain = duplicate_axes(
-        domain, promoted_temporary.hw_inames, hw_inames)
-
-    # The operations on the domain above return a Set object, but the
-    # underlying domain should be expressible as a single BasicSet.
-    domain_list = domain.get_basic_set_list()
-    assert domain_list.n_basic_set() == 1
-    domain = domain_list.get_basic_set(0)
-    return domain, hw_inames, dim_inames, iname_to_tag
-
-# }}}
-
-
-def restore_and_save_temporaries(kernel):
-    """
-    Add code that loads / spills the temporaries in the kernel which are
-    live across sub-kernel calls.
-    """
-    # Compute live temporaries.
-    live_in, live_out = compute_live_temporaries(kernel, kernel.schedule)
-
-    # Create kernel variables based on live temporaries.
-    inter_kernel_temporaries = set()
-
-    call_count = 0
-    for idx, sched_item in enumerate(kernel.schedule):
-        if isinstance(sched_item, CallKernel):
-            inter_kernel_temporaries |= filter_out_subscripts(live_in[idx])
-            call_count += 1
-
-    if call_count == 1:
-        # Single call corresponds to a kernel which has not been split -
-        # no need for restores / spills of temporaries.
-        return kernel
-
-    name_gen = kernel.get_var_name_generator()
-    new_temporaries = determine_temporaries_to_promote(
-        kernel, inter_kernel_temporaries, name_gen)
-
-    # {{{ Insert loads and spills of new temporaries
-
-    new_schedule = []
-    new_instructions = []
-    new_iname_to_tag = {}
-
-    idx = 0
-    schedule = kernel.schedule
-    while idx < len(schedule):
-        sched_item = schedule[idx]
-
-        if not isinstance(sched_item, CallKernel):
-            new_schedule.append(sched_item)
-            idx += 1
-            continue
-
-        subkernel_prolog = []
-        subkernel_epilog = []
-        subkernel_schedule = []
-
-        start_idx = idx
-        idx += 1
-        while not isinstance(schedule[idx], ReturnFromKernel):
-            subkernel_schedule.append(schedule[idx])
-            idx += 1
-
-        subkernel_defs, subkernel_uses = \
-            get_temporaries_defined_and_used_in_subrange(
-                kernel, schedule, start_idx + 1, idx - 1)
-
-        from loopy.kernel.data import temp_var_scope
-        # Filter out temporaries that are global.
-        subkernel_globals = set(
-            tval for tval in subkernel_defs | subkernel_uses
-            if kernel.temporary_variables[tval].scope == temp_var_scope.GLOBAL)
-
-        tvals_to_spill = (subkernel_defs - subkernel_globals) & live_out[idx]
-        # Need to load tvals_to_spill, to avoid overwriting entries that the
-        # code doesn't touch when doing the spill.
-        tvals_to_load = ((subkernel_uses - subkernel_globals)
-            | tvals_to_spill) & live_in[start_idx]
-
-        # Add new arguments.
-        sched_item = sched_item.copy(
-            extra_args=sched_item.extra_args
-            + sorted(new_temporaries[tv].name
-                     for tv in tvals_to_load | tvals_to_spill))
-
-        # {{{ Add all the loads and spills.
-
-        def insert_loads_or_spills(tvals, mode):
-            assert mode in ["load", "spill"]
-            local_temporaries = set()
-
-            code_block = \
-                subkernel_prolog if mode == "load" else subkernel_epilog
-
-            new_kernel = kernel
-
-            for tval in tvals:
-                from loopy.kernel.tools import DomainChanger
-                tval_hw_inames = new_temporaries[tval].hw_inames
-                dchg = DomainChanger(new_kernel,
-                    frozenset(sched_item.extra_inames + tval_hw_inames))
-                domain = dchg.domain
-
-                domain, hw_inames, dim_inames, itt = \
-                    augment_domain_for_temporary_promotion(
-                        new_kernel, domain, new_temporaries[tval], mode,
-                        name_gen)
-                new_iname_to_tag.update(itt)
-
-                new_kernel = dchg.get_kernel_with(domain)
-
-                # Add the load / spill instruction.
-                insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode))
-
-                def subscript_or_var(agg, subscript):
-                    from pymbolic.primitives import Subscript, Variable
-                    if len(subscript) == 0:
-                        return Variable(agg)
-                    else:
-                        return Subscript(
-                            Variable(agg),
-                            tuple(map(Variable, subscript)))
-
-                args = (
-                    subscript_or_var(
-                        tval, dim_inames),
-                    subscript_or_var(
-                        new_temporaries[tval].name, hw_inames + dim_inames))
-
-                if mode == "spill":
-                    args = reversed(args)
-
-                from loopy.kernel.data import Assignment
-                new_insn = Assignment(*args, id=insn_id,
-                    within_inames=frozenset(hw_inames + dim_inames),
-                    within_inames_is_final=True)
-
-                new_instructions.append(new_insn)
-
-                loop_begin = [EnterLoop(iname=iname) for iname in dim_inames]
-                loop_end = list(reversed([
-                    LeaveLoop(iname=iname) for iname in dim_inames]))
-                code_block.extend(
-                    loop_begin +
-                    [RunInstruction(insn_id=insn_id)] +
-                    loop_end)
-                if new_temporaries[tval].orig_temporary.is_local:
-                    local_temporaries.add(new_temporaries[tval].name)
-
-            # After loading / before spilling local temporaries, we need to
-            # insert a barrier.
-            if local_temporaries:
-                if mode == "load":
-                    subkernel_prolog.append(
-                        Barrier(kind="local",
-                                comment="for loads of {0}".format(
-                                    ", ".join(sorted(local_temporaries)))))
-                else:
-                    subkernel_epilog.insert(0,
-                        Barrier(kind="local",
-                                comment="for spills of {0}".format(
-                                    ", ".join(sorted(local_temporaries)))))
-            return new_kernel
-
-        kernel = insert_loads_or_spills(tvals_to_load, "load")
-        kernel = insert_loads_or_spills(tvals_to_spill, "spill")
-
-        # }}}
-
-        new_schedule.extend(
-            [sched_item] +
-            subkernel_prolog +
-            subkernel_schedule +
-            subkernel_epilog +
-            # ReturnFromKernel
-            [schedule[idx]])
-
-        # ReturnFromKernel
-        idx += 1
-
-    # }}}
-
-    new_iname_to_tag.update(kernel.iname_to_tag)
-    updated_temporary_variables = dict(
-        (t.name, t.as_variable()) for t in new_temporaries.values())
-    updated_temporary_variables.update(kernel.temporary_variables)
-
-    kernel = kernel.copy(
-        iname_to_tag=new_iname_to_tag,
-        temporary_variables=updated_temporary_variables,
-        instructions=kernel.instructions + new_instructions,
-        schedule=new_schedule
-        )
-
-    from loopy.kernel.tools import assign_automatic_axes
-    kernel = assign_automatic_axes(kernel)
-
-    # Once assign_automatic_axes() does its job, loops in the schedule
-    # for newly hardware-tagged inames are no longer necessary (and in
-    # fact illegal), so remove them.
-    kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel)
-
     return kernel
 
 
-def add_extra_args_to_schedule(kernel):
-    """
-    Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel`
-    instructions in the schedule with global temporaries.
-    """
-    new_schedule = []
-
-    block_bounds = get_block_boundaries(kernel.schedule)
-    for idx, sched_item in enumerate(kernel.schedule):
-        if isinstance(sched_item, CallKernel):
-            defs, uses = get_temporaries_defined_and_used_in_subrange(
-                   kernel, kernel.schedule, idx + 1, block_bounds[idx] - 1)
-            # Filter out temporaries that are global.
-            extra_args = (tv for tv in defs | uses if
-                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
-                and
-                kernel.temporary_variables[tv].initializer is None)
-            new_schedule.append(sched_item.copy(extra_args=sorted(extra_args)))
-        else:
-            new_schedule.append(sched_item)
-
-    return kernel.copy(schedule=new_schedule)
-
-
 def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
     schedule = kernel.schedule
     loop_bounds = get_block_boundaries(schedule)
 
-    # {{{ Inner mapper function
+    # {{{ inner mapper function
 
     dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[])
     dummy_return = ReturnFromKernel(kernel_name="")
@@ -760,6 +114,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
                             [dummy_call.copy()] +
                             current_chunk +
                             [dummy_return.copy()])
+                    new_schedule.append(sched_item)
                     current_chunk = []
                 else:
                     current_chunk.append(sched_item)
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..5de677e72708be844a5276b3d40ace8b1dad9da0
--- /dev/null
+++ b/loopy/schedule/tools.py
@@ -0,0 +1,191 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from loopy.kernel.data import temp_var_scope
+from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem,
+                            RunInstruction, Barrier)
+
+from pytools import memoize_method
+
+
+# {{{ block boundary finder
+
+def get_block_boundaries(schedule):
+    """
+    Return a dictionary mapping indices of
+    :class:`loopy.schedule.BeginBlockItem`s to
+    :class:`loopy.schedule.EndBlockItem`s and vice versa.
+    """
+    block_bounds = {}
+    active_blocks = []
+    for idx, sched_item in enumerate(schedule):
+        if isinstance(sched_item, BeginBlockItem):
+            active_blocks.append(idx)
+        elif isinstance(sched_item, EndBlockItem):
+            start = active_blocks.pop()
+            block_bounds[start] = idx
+            block_bounds[idx] = start
+    return block_bounds
+
+# }}}
+
+
+# {{{ instruction query utility
+
+class InstructionQuery(object):
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        block_bounds = get_block_boundaries(kernel.schedule)
+        subkernel_slices = {}
+        from six import iteritems
+        for start, end in iteritems(block_bounds):
+            sched_item = kernel.schedule[start]
+            if isinstance(sched_item, CallKernel):
+                subkernel_slices[sched_item.kernel_name] = slice(start, end + 1)
+        self.subkernel_slices = subkernel_slices
+
+    @memoize_method
+    def subkernels(self):
+        return frozenset(self.subkernel_slices.keys())
+
+    @memoize_method
+    def insns_reading_or_writing(self, var):
+        return frozenset(insn.id for insn in self.kernel.instructions
+            if var in insn.read_dependency_names()
+                or var in insn.assignee_var_names())
+
+    @memoize_method
+    def insns_in_subkernel(self, subkernel):
+        return frozenset(sched_item.insn_id for sched_item
+            in self.kernel.schedule[self.subkernel_slices[subkernel]]
+            if isinstance(sched_item, RunInstruction))
+
+    @memoize_method
+    def temporaries_read_in_subkernel(self, subkernel):
+        return frozenset(
+            var
+            for insn in self.insns_in_subkernel(subkernel)
+            for var in self.kernel.id_to_insn[insn].read_dependency_names()
+            if var in self.kernel.temporary_variables)
+
+    @memoize_method
+    def temporaries_written_in_subkernel(self, subkernel):
+        return frozenset(
+            var
+            for insn in self.insns_in_subkernel(subkernel)
+            for var in self.kernel.id_to_insn[insn].assignee_var_names()
+            if var in self.kernel.temporary_variables)
+
+    @memoize_method
+    def temporaries_read_or_written_in_subkernel(self, subkernel):
+        return (
+            self.temporaries_read_in_subkernel(subkernel) |
+            self.temporaries_written_in_subkernel(subkernel))
+
+    @memoize_method
+    def inames_in_subkernel(self, subkernel):
+        subkernel_start = self.subkernel_slices[subkernel].start
+        return frozenset(self.kernel.schedule[subkernel_start].extra_inames)
+
+    @memoize_method
+    def pre_and_post_barriers(self, subkernel):
+        subkernel_start = self.subkernel_slices[subkernel].start
+        subkernel_end = self.subkernel_slices[subkernel].stop
+
+        def is_global_barrier(item):
+            return isinstance(item, Barrier) and item.kind == "global"
+
+        try:
+            pre_barrier = next(item for item in
+                    self.kernel.schedule[subkernel_start::-1]
+                    if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            pre_barrier = None
+
+        try:
+            post_barrier = next(item for item in
+                    self.kernel.schedule[subkernel_end:]
+                    if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            post_barrier = None
+
+        return (pre_barrier, post_barrier)
+
+    @memoize_method
+    def hw_inames(self, insn_id):
+        """
+        Return the inames that insn runs in and that are tagged as hardware
+        parallel.
+        """
+        from loopy.kernel.data import HardwareParallelTag
+        return set(iname for iname in self.kernel.insn_inames(insn_id)
+                   if isinstance(self.kernel.iname_to_tag.get(iname),
+                                 HardwareParallelTag))
+
+    @memoize_method
+    def common_hw_inames(self, insn_ids):
+        """
+        Return the common set of hardware parallel tagged inames among
+        the list of instructions.
+        """
+        # Get the list of hardware inames in which the temporary is defined.
+        if len(insn_ids) == 0:
+            return set()
+        return set.intersection(*(self.hw_inames(id) for id in insn_ids))
+
+# }}}
+
+
+# {{{ add extra args to schedule
+
+def add_extra_args_to_schedule(kernel):
+    """
+    Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel`
+    instructions in the schedule with global temporaries.
+    """
+    new_schedule = []
+
+    insn_query = InstructionQuery(kernel)
+
+    for sched_item in kernel.schedule:
+        if isinstance(sched_item, CallKernel):
+            subrange_temporaries = (insn_query
+                .temporaries_read_or_written_in_subkernel(sched_item.kernel_name))
+            more_args = set(tv
+                for tv in subrange_temporaries
+                if
+                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
+                and
+                kernel.temporary_variables[tv].initializer is None
+                and
+                tv not in sched_item.extra_args)
+            new_schedule.append(sched_item.copy(
+                extra_args=sched_item.extra_args + sorted(more_args)))
+        else:
+            new_schedule.append(sched_item)
+
+    return kernel.copy(schedule=new_schedule)
+
+# }}}
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 8cc1950b8ea7358134e0861a7664ba3d74fb9417..68cc32e56be077c7e45d11b9e2aade86b04494cc 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -220,8 +220,9 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)):
             if len(access_info.subscripts) == 0:
-                if (isinstance(ary, (ConstantArg, GlobalArg)) or
-                    (isinstance(ary, TemporaryVariable) and ary.base_storage)):
+                if (
+                        (isinstance(ary, (ConstantArg, GlobalArg)) or
+                         (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                     # unsubscripted global args are pointers
                     result = var(access_info.array_name)[0]
 
diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py
index 570b5efffb29e0ebb56b99444db19766127be596..f42fd3c8d2943bb37b75e9ef0003b88985950926 100644
--- a/loopy/transform/__init__.py
+++ b/loopy/transform/__init__.py
@@ -21,6 +21,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
-
-
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index b2c86c084f0c56ebfb6ec8ebe4f6f5e65c5fd37d..92cff7a507d672a3acc51a8abed572a04cb7e86a 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -500,7 +500,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         store_instruction = Assignment(
                     id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                     depends_on=frozenset(aar.modified_insn_ids),
-                    no_sync_with=frozenset([init_insn_id]),
+                    no_sync_with=frozenset([(init_insn_id, "any")]),
                     assignee=store_target,
                     expression=store_expression,
                     within_inames=(
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
new file mode 100644
index 0000000000000000000000000000000000000000..8706bc4da70b94ad678f07158e0a0f648fdd0030
--- /dev/null
+++ b/loopy/transform/save.py
@@ -0,0 +1,587 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from loopy.diagnostic import LoopyError
+import loopy as lp
+
+from loopy.kernel.data import auto
+from pytools import memoize_method, Record
+from loopy.schedule import (
+            EnterLoop, LeaveLoop, RunInstruction,
+            CallKernel, ReturnFromKernel, Barrier)
+
+from loopy.schedule.tools import (get_block_boundaries, InstructionQuery)
+
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: save_and_reload_temporaries
+"""
+
+
+# {{{ liveness analysis
+
+class LivenessResult(dict):
+
+    class InstructionResult(Record):
+        __slots__ = ["live_in", "live_out"]
+
+    @classmethod
+    def make_empty(cls, nscheditems):
+        return cls((idx, cls.InstructionResult(live_in=set(), live_out=set()))
+                   for idx in range(nscheditems))
+
+
+class LivenessAnalysis(object):
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.schedule = self.kernel.schedule
+
+    @memoize_method
+    def get_successor_relation(self):
+        successors = {}
+        block_bounds = get_block_boundaries(self.kernel.schedule)
+
+        for idx, (item, next_item) in enumerate(zip(
+                reversed(self.schedule),
+                reversed(self.schedule + [None]))):
+            sched_idx = len(self.schedule) - idx - 1
+
+            # Look at next_item
+            if next_item is None:
+                after = set()
+            elif isinstance(next_item, EnterLoop):
+                # Account for empty loop
+                loop_end = block_bounds[sched_idx + 1]
+                after = successors[loop_end] | set([sched_idx + 1])
+            elif isinstance(next_item, (LeaveLoop, RunInstruction,
+                    CallKernel, ReturnFromKernel, Barrier)):
+                after = set([sched_idx + 1])
+            else:
+                raise LoopyError("unexpected type of schedule item: {ty}"
+                    .format(ty=type(next_item).__name__))
+
+            # Look at item
+            if isinstance(item, LeaveLoop):
+                # Account for loop
+                loop_begin = block_bounds[sched_idx]
+                after |= set([loop_begin])
+            elif not isinstance(item, (EnterLoop, RunInstruction,
+                    CallKernel, ReturnFromKernel, Barrier)):
+                raise LoopyError("unexpected type of schedule item: {ty}"
+                    .format(ty=type(item).__name__))
+
+            successors[sched_idx] = after
+
+        return successors
+
+    def get_gen_and_kill_sets(self):
+        gen = dict((idx, set()) for idx in range(len(self.schedule)))
+        kill = dict((idx, set()) for idx in range(len(self.schedule)))
+
+        for sched_idx, sched_item in enumerate(self.schedule):
+            if not isinstance(sched_item, RunInstruction):
+                continue
+            insn = self.kernel.id_to_insn[sched_item.insn_id]
+            for var in insn.assignee_var_names():
+                if var not in self.kernel.temporary_variables:
+                    continue
+                if not insn.predicates:
+                    # Fully kills the liveness only when unconditional.
+                    kill[sched_idx].add(var)
+                if len(self.kernel.temporary_variables[var].shape) > 0:
+                    # For an array variable, all definitions generate a use as
+                    # well, because the write could be a partial write,
+                    # necessitating a reload of whatever is not written.
+                    #
+                    # We don't currently check if the write is a partial write
+                    # or a full write. Instead, we analyze the access
+                    # footprint later on to determine how much to reload/save.
+                    gen[sched_idx].add(var)
+            for var in insn.read_dependency_names():
+                if var not in self.kernel.temporary_variables:
+                    continue
+                gen[sched_idx].add(var)
+
+        return gen, kill
+
+    @memoize_method
+    def liveness(self):
+        logger.info("running liveness analysis")
+        successors = self.get_successor_relation()
+        gen, kill = self.get_gen_and_kill_sets()
+
+        # Fixed point iteration for liveness analysis
+        lr = LivenessResult.make_empty(len(self.schedule))
+
+        prev_lr = None
+
+        while prev_lr != lr:
+            from copy import deepcopy
+            prev_lr = deepcopy(lr)
+            for idx in range(len(self.schedule) - 1, -1, -1):
+                for succ in successors[idx]:
+                    lr[idx].live_out.update(lr[succ].live_in)
+                lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx])
+
+        logger.info("done running liveness analysis")
+
+        return lr
+
+    def print_liveness(self):
+        print(75 * "-")
+        print("LIVE IN:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_in))))
+        print(75 * "-")
+        print("LIVE OUT:")
+        for sched_idx, sched_item in enumerate(self.schedule):
+            print("{item}: {{{vars}}}".format(
+                item=sched_idx,
+                vars=", ".join(sorted(self[sched_idx].live_out))))
+        print(75 * "-")
+
+    def __getitem__(self, sched_idx):
+        """
+        :arg sched_idx: An index into the schedule
+
+        :returns: The :class:`LivenessResult.InstructionResult` associated
+            with the schedule item at *sched_idx*
+        """
+        return self.liveness()[sched_idx]
+
+# }}}
+
+
+# {{{ save and reload implementation
+
+class TemporarySaver(object):
+
+    class PromotedTemporary(Record):
+        """
+        .. attribute:: name
+
+            The name of the new temporary.
+
+        .. attribute:: orig_temporary
+
+            The original temporary variable object.
+
+        .. attribute:: hw_inames
+
+            The common list of hw axes that define the original object.
+
+        .. attribute:: hw_dims
+
+            A list of expressions, to be added in front of the shape
+            of the promoted temporary value, corresponding to
+            hardware dimensions
+
+        .. attribute:: non_hw_dims
+
+            A list of expressions, to be added in front of the shape
+            of the promoted temporary value, corresponding to
+            non-hardware dimensions
+        """
+
+        @memoize_method
+        def as_variable(self):
+            temporary = self.orig_temporary
+            from loopy.kernel.data import TemporaryVariable, temp_var_scope
+            return TemporaryVariable(
+                name=self.name,
+                dtype=temporary.dtype,
+                scope=temp_var_scope.GLOBAL,
+                shape=self.new_shape)
+
+        @property
+        def new_shape(self):
+            return self.hw_dims + self.non_hw_dims
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.insn_query = InstructionQuery(kernel)
+        self.var_name_gen = kernel.get_var_name_generator()
+        self.insn_name_gen = kernel.get_instruction_id_generator()
+        # These fields keep track of updates to the kernel.
+        self.insns_to_insert = []
+        self.insns_to_update = {}
+        self.extra_args_to_add = {}
+        self.updated_iname_to_tag = {}
+        self.updated_temporary_variables = {}
+        self.saves_or_reloads_added = {}
+
+    @memoize_method
+    def auto_promote_temporary(self, temporary_name):
+        temporary = self.kernel.temporary_variables[temporary_name]
+
+        from loopy.kernel.data import temp_var_scope
+        if temporary.scope == temp_var_scope.GLOBAL:
+            # Nothing to be done for global temporaries (I hope)
+            return None
+
+        if temporary.base_storage is not None:
+            raise ValueError(
+                "Cannot promote temporaries with base_storage to global")
+
+        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
+        # is associated with. This is used for determining the shape of the
+        # global storage needed for saving and restoring the temporary across
+        # kernel calls.
+        #
+        # TODO: Make a policy decision about which dimensions to use. Currently,
+        # the code looks at each instruction that defines or uses the temporary,
+        # and takes the common set of hw-parallel tagged inames associated with
+        # these instructions.
+        #
+        # Furthermore, in the case of local temporaries, inames that are tagged
+        # hw-local do not contribute to the global storage shape.
+        hw_inames = self.insn_query.common_hw_inames(
+            self.insn_query.insns_reading_or_writing(temporary.name))
+
+        # We want hw_inames to be arranged according to the order:
+        #    g.0 < g.1 < ... < l.0 < l.1 < ...
+        # Sorting lexicographically accomplishes this.
+        hw_inames = sorted(hw_inames,
+            key=lambda iname: str(self.kernel.iname_to_tag[iname]))
+
+        # Calculate the sizes of the dimensions that get added in front for
+        # the global storage of the temporary.
+        hw_dims = []
+
+        backing_hw_inames = []
+
+        for iname in hw_inames:
+            tag = self.kernel.iname_to_tag[iname]
+            from loopy.kernel.data import LocalIndexTag
+            is_local_iname = isinstance(tag, LocalIndexTag)
+            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
+                # Restrict shape to that of group inames for locals.
+                continue
+            backing_hw_inames.append(iname)
+            from loopy.isl_helpers import static_max_of_pw_aff
+            from loopy.symbolic import aff_to_expr
+            hw_dims.append(
+                aff_to_expr(
+                    static_max_of_pw_aff(
+                        self.kernel.get_iname_bounds(iname).size, False)))
+
+        non_hw_dims = temporary.shape
+
+        if len(non_hw_dims) == 0 and len(hw_dims) == 0:
+            # Scalar not in hardware: ensure at least one dimension.
+            non_hw_dims = (1,)
+
+        backing_temporary = self.PromotedTemporary(
+            name=self.var_name_gen(temporary.name + "_save_slot"),
+            orig_temporary=temporary,
+            hw_dims=tuple(hw_dims),
+            non_hw_dims=non_hw_dims,
+            hw_inames=backing_hw_inames)
+
+        return backing_temporary
+
+    def save_or_reload_impl(self, temporary, subkernel, mode,
+                             promoted_temporary=lp.auto):
+        assert mode in ("save", "reload")
+
+        if promoted_temporary is auto:
+            promoted_temporary = self.auto_promote_temporary(temporary)
+
+        if promoted_temporary is None:
+            return
+
+        from loopy.kernel.tools import DomainChanger
+        dchg = DomainChanger(
+            self.kernel,
+            frozenset(
+                self.insn_query.inames_in_subkernel(subkernel) |
+                set(promoted_temporary.hw_inames)))
+
+        domain, hw_inames, dim_inames, iname_to_tag = \
+            self.augment_domain_for_save_or_reload(
+                dchg.domain, promoted_temporary, mode, subkernel)
+
+        self.kernel = dchg.get_kernel_with(domain)
+
+        save_or_load_insn_id = self.insn_name_gen(
+            "{name}.{mode}".format(name=temporary, mode=mode))
+
+        def subscript_or_var(agg, subscript=()):
+            from pymbolic.primitives import Subscript, Variable
+            if len(subscript) == 0:
+                return Variable(agg)
+            else:
+                return Subscript(
+                    Variable(agg),
+                    tuple(map(Variable, subscript)))
+
+        dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)]
+
+        args = (
+            subscript_or_var(
+                temporary, dim_inames_trunc),
+            subscript_or_var(
+                promoted_temporary.name, hw_inames + dim_inames))
+
+        if mode == "save":
+            args = reversed(args)
+
+        accessing_insns_in_subkernel = (
+            self.insn_query.insns_reading_or_writing(temporary) &
+            self.insn_query.insns_in_subkernel(subkernel))
+
+        if mode == "save":
+            depends_on = accessing_insns_in_subkernel
+            update_deps = frozenset()
+        elif mode == "reload":
+            depends_on = frozenset()
+            update_deps = accessing_insns_in_subkernel
+
+        pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel)
+
+        if pre_barrier is not None:
+            depends_on |= set([pre_barrier])
+
+        if post_barrier is not None:
+            update_deps |= set([post_barrier])
+
+        # Create the load / store instruction.
+        from loopy.kernel.data import Assignment
+        save_or_load_insn = Assignment(
+            *args,
+            id=save_or_load_insn_id,
+            within_inames=(
+                self.insn_query.inames_in_subkernel(subkernel) |
+                frozenset(hw_inames + dim_inames)),
+            within_inames_is_final=True,
+            depends_on=depends_on,
+            boostable=False,
+            boostable_into=frozenset())
+
+        if temporary not in self.saves_or_reloads_added:
+            self.saves_or_reloads_added[temporary] = set()
+        self.saves_or_reloads_added[temporary].add(save_or_load_insn_id)
+
+        self.insns_to_insert.append(save_or_load_insn)
+
+        for insn_id in update_deps:
+            insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id])
+            self.insns_to_update[insn_id] = insn.copy(
+                depends_on=insn.depends_on | frozenset([save_or_load_insn_id]))
+
+        self.updated_temporary_variables[promoted_temporary.name] = \
+            promoted_temporary.as_variable()
+
+        self.updated_iname_to_tag.update(iname_to_tag)
+
+    @memoize_method
+    def finish(self):
+        new_instructions = []
+
+        insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert)
+
+        # Add global no_sync_with between any added reloads and saves
+        from six import iteritems
+        for temporary, added_insns in iteritems(self.saves_or_reloads_added):
+            for insn_id in added_insns:
+                insn = insns_to_insert[insn_id]
+                insns_to_insert[insn_id] = insn.copy(
+                    no_sync_with=frozenset(
+                        (added_insn, "global") for added_insn in added_insns))
+
+        for orig_insn in self.kernel.instructions:
+            if orig_insn.id in self.insns_to_update:
+                new_instructions.append(self.insns_to_update[orig_insn.id])
+            else:
+                new_instructions.append(orig_insn)
+        new_instructions.extend(
+            sorted(insns_to_insert.values(), key=lambda insn: insn.id))
+
+        self.updated_iname_to_tag.update(self.kernel.iname_to_tag)
+        self.updated_temporary_variables.update(self.kernel.temporary_variables)
+
+        kernel = self.kernel.copy(
+            instructions=new_instructions,
+            iname_to_tag=self.updated_iname_to_tag,
+            temporary_variables=self.updated_temporary_variables)
+
+        from loopy.kernel.tools import assign_automatic_axes
+        return assign_automatic_axes(kernel)
+
+    def save(self, temporary, subkernel):
+        self.save_or_reload_impl(temporary, subkernel, "save")
+
+    def reload(self, temporary, subkernel):
+        self.save_or_reload_impl(temporary, subkernel, "reload")
+
+    def augment_domain_for_save_or_reload(self,
+            domain, promoted_temporary, mode, subkernel):
+        """
+        Add new axes to the domain corresponding to the dimensions of
+        `promoted_temporary`. These axes will be used in the save/
+        reload stage.
+        """
+        assert mode in ("save", "reload")
+        import islpy as isl
+
+        orig_temporary = promoted_temporary.orig_temporary
+        orig_dim = domain.dim(isl.dim_type.set)
+
+        # Tags for newly added inames
+        iname_to_tag = {}
+
+        # FIXME: Restrict size of new inames to access footprint.
+
+        # Add dimension-dependent inames.
+        dim_inames = []
+        domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims))
+
+        for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims):
+            new_iname = self.var_name_gen("{name}_{mode}_axis_{dim}_{sk}".
+                format(name=orig_temporary.name,
+                       mode=mode,
+                       dim=dim_idx,
+                       sk=subkernel))
+            domain = domain.set_dim_name(
+                isl.dim_type.set, orig_dim + dim_idx, new_iname)
+
+            if orig_temporary.is_local:
+                # If the temporary has local scope, then loads / stores can
+                # be done in parallel.
+                from loopy.kernel.data import AutoFitLocalIndexTag
+                iname_to_tag[new_iname] = AutoFitLocalIndexTag()
+
+            dim_inames.append(new_iname)
+
+            # Add size information.
+            aff = isl.affs_from_space(domain.space)
+            domain &= aff[0].le_set(aff[new_iname])
+            from loopy.symbolic import aff_from_expr
+            domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size))
+
+        # FIXME: Use promoted_temporary.hw_inames
+        hw_inames = []
+
+        # Add hardware inames duplicates.
+        for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
+            new_iname = self.var_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}".
+                format(name=orig_temporary.name,
+                       mode=mode,
+                       dim=t_idx,
+                       sk=subkernel))
+            hw_inames.append(new_iname)
+            iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname]
+
+        from loopy.isl_helpers import duplicate_axes
+        domain = duplicate_axes(
+            domain, promoted_temporary.hw_inames, hw_inames)
+
+        # The operations on the domain above return a Set object, but the
+        # underlying domain should be expressible as a single BasicSet.
+        domain_list = domain.get_basic_set_list()
+        assert domain_list.n_basic_set() == 1
+        domain = domain_list.get_basic_set(0)
+        return domain, hw_inames, dim_inames, iname_to_tag
+
+# }}}
+
+
+# {{{ auto save and reload across kernel calls
+
+def save_and_reload_temporaries(knl):
+    """
+    Add instructions to save and reload temporary variables that are live
+    across kernel calls.
+
+    The basic code transformation turns schedule segments::
+
+        t = <...>
+        <return followed by call>
+        <...> = t
+
+    into this code::
+
+        t = <...>
+        t_save_slot = t
+        <return followed by call>
+        t = t_save_slot
+        <...> = t
+
+    where `t_save_slot` is a newly-created global temporary variable.
+
+    :returns: The resulting kernel
+    """
+    liveness = LivenessAnalysis(knl)
+    saver = TemporarySaver(knl)
+
+    insn_query = InstructionQuery(knl)
+
+    for sched_idx, sched_item in enumerate(knl.schedule):
+
+        if isinstance(sched_item, CallKernel):
+            # Any written temporary that is live-out needs to be read into
+            # memory because of the potential for partial writes.
+            if sched_idx == 0:
+                # Kernel entry: nothing live
+                interesting_temporaries = set()
+            else:
+                interesting_temporaries = (
+                    insn_query.temporaries_read_or_written_in_subkernel(
+                        sched_item.kernel_name))
+
+            for temporary in liveness[sched_idx].live_out & interesting_temporaries:
+                logger.info("reloading {0} at entry of {1}"
+                        .format(temporary, sched_item.kernel_name))
+                saver.reload(temporary, sched_item.kernel_name)
+
+        elif isinstance(sched_item, ReturnFromKernel):
+            if sched_idx == len(knl.schedule) - 1:
+                # Kernel exit: nothing live
+                interesting_temporaries = set()
+            else:
+                interesting_temporaries = (
+                    insn_query.temporaries_written_in_subkernel(
+                        sched_item.kernel_name))
+
+            for temporary in liveness[sched_idx].live_in & interesting_temporaries:
+                logger.info("saving {0} before return of {1}"
+                        .format(temporary, sched_item.kernel_name))
+                saver.save(temporary, sched_item.kernel_name)
+
+    return saver.finish()
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/version.py b/loopy/version.py
index 0b56284bbbf68b92bbab368de3c30a997724b29e..f7d157f650304a83164e11763279d3c5eabbc4c0 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v48-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v49-islpy%s" % _islpy_version
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 69e0ea325a328d03bbd98cec2163f5c7981a4a78..af4269047539b800a5fd389f9293f11551c9a291 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1105,95 +1105,215 @@ def test_kernel_splitting_with_loop(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
-def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
+def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
+    from loopy.preprocess import preprocess_kernel
+    from loopy.schedule import get_one_scheduled_kernel
+
+    knl = preprocess_kernel(knl)
+    knl = get_one_scheduled_kernel(knl)
+
+    from loopy.transform.save import save_and_reload_temporaries
+    knl = save_and_reload_temporaries(knl)
+    knl = get_one_scheduled_kernel(knl)
+
+    if debug:
+        print(knl)
+        cgr = lp.generate_code_v2(knl)
+        print(cgr.device_code())
+        print(cgr.host_code())
+        1/0
+
+    _, (out,) = knl(queue, out_host=True)
+    assert (out == out_expect).all()
+
+
+@pytest.mark.parametrize("hw_loop", [True, False])
+def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
     ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    pytest.xfail("spilling doesn't yet use local axes")
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<8 }",
+        """
+        for i
+            <>t = i
+            ... gbarrier
+            out[i] = t
+        end
+        """, seq_dependencies=True)
+
+    if hw_loop:
+        knl = lp.tag_inames(knl, dict(i="g.0"))
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+
+
+def test_save_of_private_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-            "{ [i,k]: 0<=i<n and 0<=k<3 }",
-            """
-            for i, k
-                ... gbarrier
-                <> t_private_scalar = a[k,i+1]
-                <> t_private_array[i % 2] = a[k,i+1]
-                c[k,i] = a[k,i+1]
-                ... gbarrier
-                out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
+        "{ [i]: 0<=i<8 }",
+        """
+        for i
+            <>t[i] = i
+            ... gbarrier
+            out[i] = t[i]
+        end
+        """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t", "private")
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+
+
+def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{ [i,j,k]: 0<=i,j,k<8 }",
+        """
+        for i
+            for j
+               <>t[j] = j
             end
-            """, seq_dependencies=True)
+            ... gbarrier
+            for k
+                out[i,k] = t[k]
+            end
+        end
+        """, seq_dependencies=True)
 
-    knl = lp.add_and_infer_dtypes(knl,
-            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
-    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
-    knl = lp.set_temporary_scope(knl, "t_private_array", "private")
+    knl = lp.tag_inames(knl, dict(i="g.0"))
+    knl = lp.set_temporary_scope(knl, "t", "private")
 
-    ref_knl = knl
+    save_and_reload_temporaries_test(
+        queue, knl, np.vstack((8 * (np.arange(8),))), debug)
 
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
+def test_save_of_private_multidim_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
+    knl = lp.make_kernel(
+        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
+        """
+        for i
+            for j, k
+               <>t[j,k] = k
+            end
+            ... gbarrier
+            for l, m
+                out[i,l,m] = t[l,m]
+            end
+        end
+        """, seq_dependencies=True)
 
-    # map schedule onto host or device
-    print(knl)
+    knl = lp.set_temporary_scope(knl, "t", "private")
 
-    cgr = lp.generate_code_v2(knl)
+    result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
-    assert len(cgr.device_programs) == 2
 
-    print(cgr.device_code())
-    print(cgr.host_code())
+def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    knl = lp.make_kernel(
+        "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }",
+        """
+        for i
+            for j, k
+               <>t[j,k] = k
+            end
+            ... gbarrier
+            for l, m
+                out[i,l,m] = t[l,m]
+            end
+        end
+        """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t", "private")
+    knl = lp.tag_inames(knl, dict(i="g.0"))
+
+    result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)])
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
 
-def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory):
+@pytest.mark.parametrize("hw_loop", [True, False])
+def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False):
     ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
-            "{ [i,k]: 0<=i<n and 0<=k<3 }",
+            "{ [i,j,k]: 0<=i,j,k<10 }",
             """
-            for i, k
-                ... gbarrier
-                <> t_local[i % 8,k] = i % 8
-                c[k,i] = a[k,i+1]
+            for i
+                for k
+                    <> t_arr[k] = k
+                end
+                <> t_scalar = 1
+                for j
+                    ... gbarrier
+                    out[j] = t_scalar
+                    ... gbarrier
+                    t_scalar = 10
+                end
                 ... gbarrier
-                out[k,i] = c[k,i] + t_local[i % 8,k]
+                <> flag = i == 9
+                out[i] = t_arr[i] {if=flag}
             end
             """, seq_dependencies=True)
 
-    knl = lp.add_and_infer_dtypes(knl,
-            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+    knl = lp.set_temporary_scope(knl, "t_arr", "private")
+    if hw_loop:
+        knl = lp.tag_inames(knl, dict(i="g.0"))
 
-    knl = lp.set_temporary_scope(knl, "t_local", "local")
+    result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9])
 
-    ref_knl = knl
+    save_and_reload_temporaries_test(queue, knl, result, debug)
 
-    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
+def test_save_of_local_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
+    knl = lp.make_kernel(
+        "{ [i,j]: 0<=i,j<8 }",
+        """
+        for i, j
+            <>t[2*j] = j
+            t[2*j+1] = j
+            ... gbarrier
+            out[i] = t[2*i]
+        end
+        """, seq_dependencies=True)
 
-    # map schedule onto host or device
-    print(knl)
+    knl = lp.set_temporary_scope(knl, "t", "local")
+    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
 
-    cgr = lp.generate_code_v2(knl)
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
 
-    assert len(cgr.device_programs) == 2
 
-    print(cgr.device_code())
-    print(cgr.host_code())
+def test_save_local_multidim_array(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}",
+            """
+            for i, j, k
+                ... gbarrier
+                <> t_local[k,j] = 1
+                ... gbarrier
+                out[k,i*2+j] = t_local[k,j]
+            end
+            """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t_local", "local")
+    knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8))
+    save_and_reload_temporaries_test(queue, knl, 1, debug)
 
 
 def test_global_temporary(ctx_factory):