diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 942c7d56e01f9d037b0e2b601f88bc8b96dda151..b983312645062cd6a984201b9efda294fe0e2a74 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1228,18 +1228,18 @@ put those instructions into the schedule. --------------------------------------------------------------------------- TEMPORARIES: tmp: type: np:dtype('int32'), shape: () scope:private - tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global + tmp__save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- SCHEDULE: - 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) + 0: CALL KERNEL rotate_v2(extra_args=['tmp__save_slot'], extra_inames=[]) 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 2: [tmp.save] tmp__save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp 3: RETURN FROM KERNEL rotate_v2 4: ---BARRIER:global--- - 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] + 5: CALL KERNEL rotate_v2_0(extra_args=['tmp__save_slot'], extra_inames=[]) + 6: [tmp.reload] tmp <- tmp__save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp 8: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1264,19 +1264,19 @@ The kernel translates into two OpenCL kernels. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; tmp = arr[16 * gid(0) + lid(0)]; - tmp_save_slot[16 * gid(0) + lid(0)] = tmp; + tmp__save_slot[16 * gid(0) + lid(0)] = tmp; } - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; - tmp = tmp_save_slot[16 * gid(0) + lid(0)]; + tmp = tmp__save_slot[16 * gid(0) + lid(0)]; arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a..206cf49ff71573fb0d33cfffbc029f7b54a0e9f8 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -75,7 +75,8 @@ from loopy.transform.instruction import ( set_instruction_priority, add_dependency, remove_instructions, replace_instruction_ids, - tag_instructions) + tag_instructions, + add_nosync) from loopy.transform.data import ( add_prefetch, change_arg_to_image, @@ -108,6 +109,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries +from loopy.transform.reduction import make_two_level_reduction + # }}} from loopy.type_inference import infer_unknown_types @@ -189,6 +192,7 @@ __all__ = [ "remove_instructions", "replace_instruction_ids", "tag_instructions", + "add_nosync", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", 
"find_one_rule_matching", @@ -207,6 +211,8 @@ __all__ = [ "assume", "fix_parameters", + "make_two_level_reduction", + "save_and_reload_temporaries", # }}} diff --git a/loopy/check.py b/loopy/check.py index 6a1e3dc33a33b826ad54c42a549b35ad275d9fe5..68ca4a2b3082a1427393f6c6243a8a0e9cf4fd88 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -505,22 +505,22 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.schedule.tools import InstructionQuery from loopy.kernel.data import temp_var_scope - insn_query = InstructionQuery(kernel) - - for subkernel in insn_query.subkernels(): + for subkernel in kernel.subkernels: defined_base_storage = set() - for temporary in insn_query.temporaries_written_in_subkernel(subkernel): + from loopy.schedule.tools import ( + temporaries_written_in_subkernel, temporaries_read_in_subkernel) + + for temporary in temporaries_written_in_subkernel(kernel, subkernel): tval = kernel.temporary_variables[temporary] if tval.base_storage is not None: defined_base_storage.add(tval.base_storage) for temporary in ( - insn_query.temporaries_read_in_subkernel(subkernel) - - insn_query.temporaries_written_in_subkernel(subkernel)): + temporaries_read_in_subkernel(kernel, subkernel) - + temporaries_written_in_subkernel(kernel, subkernel)): tval = kernel.temporary_variables[temporary] if tval.initializer is not None: @@ -530,16 +530,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): if tval.base_storage is not None: if tval.base_storage not in defined_base_storage: from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used " - "in subkernel '%s' and neither it nor its aliases have a " - "definition" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets " + "used in subkernel 
'%s' and neither it nor its " + "aliases have a definition" % (temporary, subkernel)) continue if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used in " - "subkernel '%s' without a definition (maybe you forgot to call " - "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets used " + "in subkernel '%s' without a definition (maybe you forgot " + "to call loopy.save_and_reload_temporaries?)" + % (temporary, subkernel)) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError): class UnscheduledInstructionError(LoopyError): pass + +class ReductionIsNotTriangularError(LoopyError): + pass + # }}} diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 602830de38e457c5ff4a55d7685dc346a7b4de35..0b3068a468ca1ff0e2ab97ae4a8b28dc63e7b8d4 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -29,6 +29,7 @@ from six.moves import range, zip from loopy.diagnostic import StaticValueFindingError +import contextlib import islpy as isl from islpy import dim_type @@ -60,6 +61,14 @@ def dump_space(ls): for dt in dim_type.names) +@contextlib.contextmanager +def no_stderr_output_from_isl(ctx): + prev_on_error = ctx.get_on_error() + ctx.set_on_error(isl.on_error.CONTINUE) + yield + ctx.set_on_error(prev_on_error) + + # {{{ make_slab def make_slab(space, iname, start, stop): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 793d31791a3295ef1d7c03132f43489ab828f089..cc3414489635243a3f77fcb769572fa8773921cc 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -786,6 +786,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): for 
var_name in insn.read_dependency_names() & admissible_vars: result.setdefault(var_name, set()).add(insn.id) + return result + @memoize_method def writer_map(self): """ @@ -823,6 +825,138 @@ class LoopKernel(ImmutableRecordWithoutPickling): return result + @property + @memoize_method + def global_barrier_order(self): + """Return a :class:`tuple` of the listing the ids of global barrier instructions + as they appear in order in the kernel. + + See also :class:`loopy.instruction.BarrierInstruction`. + """ + barriers = [] + visiting = set() + visited = set() + + unvisited = set(insn.id for insn in self.instructions) + + while unvisited: + stack = [unvisited.pop()] + + while stack: + top = stack[-1] + + if top in visiting: + visiting.remove(top) + + from loopy.kernel.instruction import BarrierInstruction + insn = self.id_to_insn[top] + if isinstance(insn, BarrierInstruction): + if insn.kind == "global": + barriers.append(top) + + if top in visited: + stack.pop() + continue + + visited.add(top) + visiting.add(top) + + for child in self.id_to_insn[top].depends_on: + # Check for no cycles. + assert child not in visiting + stack.append(child) + + # Ensure this is the only possible order. + for prev_barrier, barrier in zip(barriers, barriers[1:]): + if prev_barrier not in self.recursive_insn_dep_map()[barrier]: + raise LoopyError( + "Unordered global barriers detected: '%s', '%s'" + % (barrier, prev_barrier)) + + return tuple(barriers) + + @memoize_method + def find_most_recent_global_barrier(self, insn_id): + """Return the id of the latest occuring global barrier which the + given instruction (indirectly or directly) depends on, or *None* if this + instruction does not depend on a global barrier. + + The return value is guaranteed to be unique because global barriers are + totally ordered within the kernel. 
+ """ + + if len(self.global_barrier_order) == 0: + return None + + insn = self.id_to_insn[insn_id] + + if len(insn.depends_on) == 0: + return None + + def is_barrier(my_insn_id): + insn = self.id_to_insn[my_insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and insn.kind == "global" + + global_barrier_to_ordinal = dict( + (b, i) for i, b in enumerate(self.global_barrier_order)) + + def get_barrier_ordinal(barrier_id): + return (global_barrier_to_ordinal[barrier_id] + if barrier_id is not None + else -1) + + direct_barrier_dependencies = set( + dep for dep in insn.depends_on if is_barrier(dep)) + + if len(direct_barrier_dependencies) > 0: + return max(direct_barrier_dependencies, key=get_barrier_ordinal) + else: + return max((self.find_most_recent_global_barrier(dep) + for dep in insn.depends_on), + key=get_barrier_ordinal) + + @property + @memoize_method + def subkernels(self): + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import CallKernel + + return tuple(sched_item.kernel_name + for sched_item in self.schedule + if isinstance(sched_item, CallKernel)) + + @property + @memoize_method + def subkernel_to_insn_ids(self): + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import ( + sched_item_to_insn_id, CallKernel, ReturnFromKernel) + + subkernel = None + result = {} + + for sched_item in self.schedule: + if isinstance(sched_item, CallKernel): + subkernel = sched_item.kernel_name + result[subkernel] = set() + + if isinstance(sched_item, ReturnFromKernel): + subkernel = None + + if subkernel is not None: + for insn_id in sched_item_to_insn_id(sched_item): + result[subkernel].add(insn_id) + + for subkernel in result: + result[subkernel] = frozenset(result[subkernel]) + + return result + # }}} # {{{ argument wrangling @@ -968,7 +1102,8 @@ class 
LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) + constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) except ValueError: pass diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fdd8f1d3764ec03ca40a8338dc512b8cd2ae38cf..fc10258619a82f1637ce738d5b9d6c8bec7a7a2b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -658,7 +658,8 @@ class MultiAssignmentBase(InstructionBase): @memoize_method def reduction_inames(self): def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) for iname in expr.inames: result.add(iname) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 539bfbed06572b07491c215770a0330963764d1d..d94136e4359009637b0c5ab89ba8009f3f810956 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1354,4 +1354,5 @@ def draw_dependencies_as_unicode_arrows( # }}} + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index d24b61c12e43cd16431c9727d8fb057319475633..b6dbc4b43135926778eadce0af049cbb0542458a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,15 +36,19 @@ class ReductionOperation(object): equality-comparable. 
""" - def result_dtypes(self, target, arg_dtype, inames): + def result_dtypes(self, target, *arg_dtypes): """ - :arg arg_dtype: may be None if not known + :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type """ raise NotImplementedError - def neutral_element(self, dtype, inames): + @property + def arg_count(self): + raise NotImplementedError + + def neutral_element(self, *dtypes): raise NotImplementedError def __hash__(self): @@ -55,7 +59,7 @@ class ReductionOperation(object): # Force subclasses to override raise NotImplementedError - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): raise NotImplementedError def __ne__(self, other): @@ -87,7 +91,11 @@ class ScalarReductionOperation(ReductionOperation): """ self.forced_result_type = forced_result_type - def result_dtypes(self, kernel, arg_dtype, inames): + @property + def arg_count(self): + return 1 + + def result_dtypes(self, kernel, arg_dtype): if self.forced_result_type is not None: return (self.parse_result_type( kernel.target, self.forced_result_type),) @@ -114,18 +122,18 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 0 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 + operand2 class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 1 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 * operand2 @@ -166,32 +174,144 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_ge_neutral(dtype) - def __call__(self, dtype, 
operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_le_neutral(dtype) - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("min")(operand1, operand2) +# {{{ segmented reduction + +class _SegmentedScalarReductionOperation(ReductionOperation): + def __init__(self, **kwargs): + self.inner_reduction = self.base_reduction_class(**kwargs) + + @property + def arg_count(self): + return 2 + + def prefix(self, scalar_dtype, segment_flag_dtype): + return "loopy_segmented_%s_%s_%s" % (self.which, + scalar_dtype.numpy_dtype.type.__name__, + segment_flag_dtype.numpy_dtype.type.__name__) + + def neutral_element(self, scalar_dtype, segment_flag_dtype): + return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")() + + def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + + (segment_flag_dtype,)) + + def __str__(self): + return "segmented_" + self.which + + def __hash__(self): + return hash(type(self)) + + def __eq__(self, other): + return type(self) == type(other) + + def __call__(self, dtypes, operand1, operand2): + return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2)) + + +class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = SumReductionOperation + which = "sum" + op = "((%s) + (%s))" + + +class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = ProductReductionOperation + op = "((%s) * (%s))" + which = "product" + + +class SegmentedFunction(FunctionIdentifier): + init_arg_names = ("reduction_op", "dtypes", "name") + + def __init__(self, reduction_op, dtypes, name): + """ + :arg dtypes: A :class:`tuple` 
of `(scalar_dtype, segment_flag_dtype)` + """ + self.reduction_op = reduction_op + self.dtypes = dtypes + self.name = name + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def segment_flag_dtype(self): + return self.dtypes[1] + + def __getinitargs__(self): + return (self.reduction_op, self.dtypes, self.name) + + +def get_segmented_function_preamble(kernel, func_id): + op = func_id.reduction_op + prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype) + + from pymbolic.mapper.c_code import CCodeMapper + + c_code_mapper = CCodeMapper() + + return (prefix, """ + inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = 0; + return %(neutral)s; + } + + inline %(scalar_t)s %(prefix)s_update( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + prefix=prefix, + segment_flag_t=kernel.target.dtype_to_typename( + func_id.segment_flag_dtype), + neutral=c_code_mapper( + op.inner_reduction.neutral_element(func_id.scalar_dtype)), + combined=op.op % ("op1", "op2"), + )) + + +# }}} + + # {{{ argmin/argmax class _ArgExtremumReductionOperation(ReductionOperation): - def prefix(self, dtype): - return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__) + def prefix(self, scalar_dtype, index_dtype): + return "loopy_arg%s_%s_%s" % (self.which, + index_dtype.numpy_dtype.type.__name__, + scalar_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, dtype, inames): - return (dtype, kernel.index_dtype) + def result_dtypes(self, kernel, scalar_dtype, index_dtype): + return (scalar_dtype, index_dtype) - def neutral_element(self, dtype, inames): - return ArgExtFunction(self, dtype, "init", inames)() + def 
neutral_element(self, scalar_dtype, index_dtype): + return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")() + + def __str__(self): + return self.which def __hash__(self): return hash(type(self)) @@ -199,11 +319,12 @@ class _ArgExtremumReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtype, operand1, operand2, inames): - iname, = inames + @property + def arg_count(self): + return 2 - return ArgExtFunction(self, dtype, "update", inames)( - *(operand1 + (operand2, var(iname)))) + def __call__(self, dtypes, operand1, operand2): + return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -219,21 +340,28 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): class ArgExtFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "scalar_dtype", "name", "inames") + init_arg_names = ("reduction_op", "dtypes", "name") - def __init__(self, reduction_op, scalar_dtype, name, inames): + def __init__(self, reduction_op, dtypes, name): self.reduction_op = reduction_op - self.scalar_dtype = scalar_dtype + self.dtypes = dtypes self.name = name - self.inames = inames + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def index_dtype(self): + return self.dtypes[1] def __getinitargs__(self): - return (self.reduction_op, self.scalar_dtype, self.name, self.inames) + return (self.reduction_op, self.dtypes, self.name) def get_argext_preamble(kernel, func_id): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype) + prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype) from pymbolic.mapper.c_code import CCodeMapper @@ -267,7 +395,7 @@ def get_argext_preamble(kernel, func_id): """ % dict( scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), prefix=prefix, - index_t=kernel.target.dtype_to_typename(kernel.index_dtype), + 
index_t=kernel.target.dtype_to_typename(func_id.index_dtype), neutral=c_code_mapper(neutral(func_id.scalar_dtype)), comp=op.update_comparison, )) @@ -284,6 +412,8 @@ _REDUCTION_OPS = { "min": MinReductionOperation, "argmax": ArgMaxReductionOperation, "argmin": ArgMinReductionOperation, + "segmented_sum": SegmentedSumReductionOperation, + "segmented_product": SegmentedProductReductionOperation, } _REDUCTION_OP_PARSERS = [ @@ -333,9 +463,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_init" % op.prefix(func_id.scalar_dtype), + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=(), ) @@ -348,9 +479,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_update" % op.prefix(func_id.scalar_dtype), + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=( func_id.scalar_dtype, kernel.index_dtype, @@ -358,6 +490,42 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): kernel.index_dtype), ) + elif isinstance(func_id, SegmentedFunction) and func_id.name == "init": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=(), + ) + + 
elif isinstance(func_id, SegmentedFunction) and func_id.name == "update": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=( + func_id.scalar_dtype, + func_id.segment_flag_dtype, + func_id.scalar_dtype, + func_id.segment_flag_dtype), + ) + return None @@ -371,4 +539,10 @@ def reduction_preamble_generator(preamble_info): yield get_argext_preamble(preamble_info.kernel, func.name) + elif isinstance(func.name, SegmentedFunction): + if not isinstance(preamble_info.kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + yield get_segmented_function_preamble(preamble_info.kernel, func.name) + # vim: fdm=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2b6d97c38a12b47e5b4653297c18b24c40ed938b..c2aaf9cd1e7074a5e92a064096dd6042a78e8e52 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,6 +30,7 @@ from loopy.diagnostic import ( import islpy as isl +from pytools import memoize from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder @@ -97,7 +98,9 @@ def check_reduction_iname_uniqueness(kernel): iname_to_nonsimultaneous_reduction_count = {} def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) + for iname in expr.inames: iname_to_reduction_count[iname] = ( iname_to_reduction_count.get(iname, 0) + 1) @@ -272,7 +275,663 @@ def find_temporary_scope(kernel): # {{{ rewrite reduction to imperative form -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): + +# {{{ utils (not stateful) + +from collections import namedtuple + + 
+_InameClassification = namedtuple("_InameClassifiction", + "sequential, local_parallel, nonlocal_parallel") + + +def _classify_reduction_inames(kernel, inames): + sequential = [] + local_par = [] + nonlocal_par = [] + + from loopy.kernel.data import ( + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) + + for iname in inames: + iname_tag = kernel.iname_to_tag.get(iname) + + if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. + sequential.append(iname) + + elif isinstance(iname_tag, LocalIndexTagBase): + local_par.append(iname) + + elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + nonlocal_par.append(iname) + + else: + sequential.append(iname) + + return _InameClassification( + tuple(sequential), tuple(local_par), tuple(nonlocal_par)) + + +def _add_params_to_domain(domain, param_names): + dim_type = isl.dim_type + nparams_orig = domain.dim(dim_type.param) + domain = domain.add_dims(dim_type.param, len(param_names)) + + for param_idx, param_name in enumerate(param_names): + domain = domain.set_dim_name( + dim_type.param, param_idx + nparams_orig, param_name) + + return domain + + +def _move_set_to_param_dims_except(domain, except_dims): + dim_type = isl.dim_type + + iname_idx = 0 + for iname in domain.get_var_names(dim_type.set): + if iname not in except_dims: + domain = domain.move_dims( + dim_type.param, 0, + dim_type.set, iname_idx, 1) + iname_idx -= 1 + iname_idx += 1 + + return domain + + +def _domain_depends_on_given_set_dims(domain, set_dim_names): + set_dim_names = frozenset(set_dim_names) + + return any( + set_dim_names & set(constr.get_coefficients_by_name()) + for constr in domain.get_constraints()) + + +def _check_reduction_is_triangular(kernel, expr, scan_param): + """Check whether the reduction within `expr` with scan parameters described by + the structure `scan_param` is triangular. 
This attempts to verify that the + domain for the scan and sweep inames is as follows: + + [params] -> { + [other inames..., scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (scan_min_value + <= scan_iname + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + and + (irrelevant constraints) + } + """ + + orig_domain = kernel.get_inames_domain( + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) + + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname + affs = isl.affs_from_space(orig_domain.space) + + sweep_lower_bound = isl.align_spaces( + scan_param.sweep_lower_bound, + affs[0], + across_dim_types=True) + + sweep_upper_bound = isl.align_spaces( + scan_param.sweep_upper_bound, + affs[0], + across_dim_types=True) + + scan_lower_bound = isl.align_spaces( + scan_param.scan_lower_bound, + affs[0], + across_dim_types=True) + + from itertools import product + + for (sweep_lb_domain, sweep_lb_aff), \ + (sweep_ub_domain, sweep_ub_aff), \ + (scan_lb_domain, scan_lb_aff) in \ + product(sweep_lower_bound.get_pieces(), + sweep_upper_bound.get_pieces(), + scan_lower_bound.get_pieces()): + + # Assumptions inherited from the domains of the pwaffs + assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain + + # Sweep iname constraints + hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) + hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) + + # Scan iname constraints + hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) + hyp_domain &= affs[scan_iname].le_set( + scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) + + scan_lb_aff) + + hyp_domain, = (hyp_domain & assumptions).get_basic_sets() + test_domain, = (orig_domain & assumptions).get_basic_sets() + + hyp_gist_against_test = hyp_domain.gist(test_domain) + if _domain_depends_on_given_set_dims(hyp_gist_against_test, + (sweep_iname, scan_iname)): + return False, ( + "gist of hypothesis against test domain " + "has sweep or scan 
dependent constraints: '%s'" + % hyp_gist_against_test) + + test_gist_against_hyp = test_domain.gist(hyp_domain) + if _domain_depends_on_given_set_dims(test_gist_against_hyp, + (sweep_iname, scan_iname)): + return False, ( + "gist of test against hypothesis domain " + "has sweep or scan dependent constraint: '%s'" + % test_gist_against_hyp) + + return True, "ok" + + +_ScanCandidateParameters = namedtuple( + "_ScanCandidateParameters", + "sweep_iname, scan_iname, sweep_lower_bound, " + "sweep_upper_bound, scan_lower_bound, stride") + + +def _try_infer_scan_candidate_from_expr( + kernel, expr, within_inames, sweep_iname=None): + """Analyze `expr` and determine if it can be implemented as a scan. + """ + from loopy.symbolic import Reduction + assert isinstance(expr, Reduction) + + if len(expr.inames) != 1: + raise ValueError( + "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + + scan_iname, = expr.inames + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger(kernel, (scan_iname,)) + domain = dchg.get_original_domain() + + if sweep_iname is None: + try: + sweep_iname = _try_infer_sweep_iname( + domain, scan_iname, kernel.all_inames()) + except ValueError as v: + raise ValueError( + "Couldn't determine a sweep iname for the scan " + "expression '%s': %s" % (expr, v)) + + try: + sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( + _try_infer_scan_and_sweep_bounds( + kernel, scan_iname, sweep_iname, within_inames)) + except ValueError as v: + raise ValueError( + "Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + try: + stride = _try_infer_scan_stride( + kernel, scan_iname, sweep_iname, sweep_lower_bound) + except ValueError as v: + raise ValueError( + "Couldn't determine a scan stride for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + return 
_ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, + sweep_upper_bound, scan_lower_bound, stride) + + +def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): + """The sweep iname is the outer iname which guides the scan. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' found " + "(via constraint '%s')" % (scan_iname, constr)) + + next_candidate = candidate_vars.pop() + + if sweep_iname_candidate is None: + sweep_iname_candidate = next_candidate + defining_constraint = constr + else: + # Check next_candidate consistency + if sweep_iname_candidate != next_candidate: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' " + "found (via constraints '%s', '%s')" % + (scan_iname, defining_constraint, constr)) + + if sweep_iname_candidate is None: + raise ValueError( + "Couldn't find any sweep iname candidates for " + "scan iname '%s'" % scan_iname) + + return sweep_iname_candidate + + +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) + domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) + + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] + + domain = domain.project_out_except( + within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) + + try: + from loopy.isl_helpers import no_stderr_output_from_isl + with no_stderr_output_from_isl(domain.get_ctx()): + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) + + return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) + + +def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): + """The stride is the number of 
steps the scan iname takes per iteration + of the sweep iname. This is allowed to be an integer constant. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError("range in multiple pieces: %s" % scan_iname_range) + elif len(scan_iname_pieces) == 0: + raise ValueError("empty range found for iname '%s'" % scan_iname) + + scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] + + if not scan_iname_constr.plain_is_universe(): + raise ValueError("found constraints: %s" % scan_iname_constr) + + if scan_iname_aff.dim(dim_type.div): + raise ValueError("aff has div: %s" % scan_iname_aff) + + coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) + + if len(coeffs) == 0: + try: + scan_iname_aff.get_constant_val() + except: + raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) + + # If this point is reached we're assuming the domain is of the form + # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value + # this function returns will be verified later by + # _check_reduction_is_triangular(). 
+ return 1 + + if sweep_iname not in coeffs: + raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) + + stride = coeffs[sweep_iname] + + if not stride.is_int(): + raise ValueError("stride not an integer: %s" % stride) + + if not stride.is_pos(): + raise ValueError("stride not positive: %s" % stride) + + return stride.to_python() + + +def _get_domain_with_iname_as_param(domain, iname): + dim_type = isl.dim_type + + if domain.find_dim_by_name(dim_type.param, iname) >= 0: + return domain + + iname_idx = domain.find_dim_by_name(dim_type.set, iname) + + assert iname_idx >= 0, (iname, domain) + + return domain.move_dims( + dim_type.param, domain.dim(dim_type.param), + dim_type.set, iname_idx, 1) + + +def _create_domain_for_sweep_tracking(orig_domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): + dim_type = isl.dim_type + + subd = isl.BasicSet.universe(orig_domain.params().space) + + # Add tracking_iname and sweep iname. + + subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + + # Here we realize the domain: + # + # [..., i] -> { + # [j]: 0 <= j - l + # and + # j - l <= k * (i - m) + # and + # k * (i - m - 1) < j - l } + # where + # * i is the sweep iname + # * j is the tracking iname + # * k is the stride for the scan + # * l is the lower bound for the scan + # * m is the lower bound for the sweep iname + # + affs = isl.affs_from_space(subd.space) + + subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) + subd &= (affs[tracking_iname] - scan_min_value)\ + .le_set(stride * (affs[sweep_iname] - sweep_min_value)) + subd &= (affs[tracking_iname] - scan_min_value)\ + .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) + + # Move tracking_iname into a set dim (NOT sweep iname). + subd = subd.move_dims( + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + # Simplify (maybe). 
+ orig_domain_with_sweep_param = ( + _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + subd = subd.gist_params(orig_domain_with_sweep_param.params()) + + subd, = subd.get_basic_sets() + + return subd + + +def _strip_if_scalar(reference_exprs, expr): + if len(reference_exprs) == 1: + return expr[0] + else: + return expr + + +def _infer_arg_dtypes_and_reduction_dtypes(kernel, expr, unknown_types_ok): + arg_dtypes = [] + + from loopy.type_inference import TypeInferenceMapper + type_inf_mapper = TypeInferenceMapper(kernel) + import loopy as lp + + for sub_expr in expr.exprs: + try: + arg_dtype = type_inf_mapper(sub_expr) + except DependencyTypeInferenceFailure: + if unknown_types_ok: + arg_dtype = lp.auto + else: + raise LoopyError("failed to determine type of accumulator for " + "reduction sub-expression '%s'" % sub_expr) + else: + arg_dtype = arg_dtype.with_target(kernel.target) + + arg_dtypes.append(arg_dtype) + + reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = tuple( + dt.with_target(kernel.target) + if dt is not lp.auto else dt + for dt in reduction_dtypes) + + return tuple(arg_dtypes), reduction_dtypes + + +def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): + """ + Multi assignment function calls are currently lowered into OpenCL so that + the function call:: + + a, b = segmented_sum(x, y, z, w) + + becomes:: + + a = segmented_sum_mangled(x, y, z, w, &b). + + For OpenCL, the scope of "b" is significant, and the preamble generation + currently assumes the scope is always private. This function forces that to + be the case by introducing temporary assignments into the kernel. 
+ """ + + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() + + new_or_updated_instructions = {} + new_temporaries = {} + + dep_map = dict( + (insn.id, insn.depends_on) for insn in kernel.instructions) + + inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions) + + import six + for insn_id, deps in six.iteritems(dep_map): + for dep in deps: + inverse_dep_map[dep].add(insn_id) + + del dep_map + + # {{{ utils + + def _add_to_no_sync_with(insn_id, new_no_sync_with_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + no_sync_with=( + insn.no_sync_with | frozenset(new_no_sync_with_params)))) + + def _add_to_depends_on(insn_id, new_depends_on_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + depends_on=insn.depends_on | frozenset(new_depends_on_params))) + + # }}} + + from loopy.kernel.instruction import CallInstruction + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + continue + + if len(insn.assignees) <= 1: + continue + + assignees = insn.assignees + assignee_var_names = insn.assignee_var_names() + + new_assignees = [assignees[0]] + newly_added_assignments_ids = set() + needs_replacement = False + + last_added_insn_id = insn.id + + from loopy.kernel.data import temp_var_scope, TemporaryVariable + + FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa + + for assignee_nr, assignee_var_name, assignee in zip( + range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)), + assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:], + assignees[FIRST_POINTER_ASSIGNEE_IDX:]): + + if ( + assignee_var_name in kernel.temporary_variables + and + (kernel.temporary_variables[assignee_var_name].scope + == temp_var_scope.PRIVATE)): + new_assignees.append(assignee) + continue + + needs_replacement = 
True + + # {{{ generate a new assignent instruction + + new_assignee_name = var_name_gen( + "{insn_id}_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + new_assignment_id = insn_id_gen( + "{insn_id}_assign_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + newly_added_assignments_ids.add(new_assignment_id) + + import loopy as lp + new_temporaries[new_assignee_name] = ( + TemporaryVariable( + name=new_assignee_name, + dtype=lp.auto, + scope=temp_var_scope.PRIVATE)) + + from pymbolic import var + new_assignee = var(new_assignee_name) + new_assignees.append(new_assignee) + + new_or_updated_instructions[new_assignment_id] = ( + make_assignment( + assignees=(assignee,), + expression=new_assignee, + id=new_assignment_id, + depends_on=frozenset([last_added_insn_id]), + depends_on_is_final=True, + no_sync_with=( + insn.no_sync_with | frozenset([(insn.id, "any")])), + predicates=insn.predicates, + within_inames=insn.within_inames)) + + last_added_insn_id = new_assignment_id + + # }}} + + if not needs_replacement: + continue + + # {{{ update originating instruction + + orig_insn = new_or_updated_instructions.get(insn.id, insn) + + new_or_updated_instructions[insn.id] = ( + orig_insn.copy(assignees=tuple(new_assignees))) + + _add_to_no_sync_with(insn.id, + [(id, "any") for id in newly_added_assignments_ids]) + + # }}} + + # {{{ squash spurious memory dependencies amongst new assignments + + for new_insn_id in newly_added_assignments_ids: + _add_to_no_sync_with(new_insn_id, + [(id, "any") + for id in newly_added_assignments_ids + if id != new_insn_id]) + + # }}} + + # {{{ update instructions that depend on the originating instruction + + for inverse_dep in inverse_dep_map[insn.id]: + _add_to_depends_on(inverse_dep, newly_added_assignments_ids) + + for insn_id, scope in ( + new_or_updated_instructions[inverse_dep].no_sync_with): + if insn_id == insn.id: + _add_to_no_sync_with( + inverse_dep, + [(id, scope) for id in 
newly_added_assignments_ids]) + + # }}} + + new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables.update(new_temporaries) + + new_instructions = ( + list(new_or_updated_instructions.values()) + + list(insn + for insn in kernel.instructions + if insn.id not in new_or_updated_instructions)) + + return kernel.copy(temporary_variables=new_temporary_variables, + instructions=new_instructions) + + +def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): + # Intersect with inames, because we could have captured some kernel params + # in here too... + dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + idx, = kernel.get_leaf_domain_indices(dependent_inames) + domains.insert(idx + 1, subdomain) + +# }}} + + +def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, + automagic_scans_ok=True, force_scan=False, + force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -283,6 +942,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): If *insn_id_filter* is not given, all reductions in all instructions will be realized. + + If *automagic_scans_ok*, this function will attempt to rewrite triangular + reductions as scans automatically. + + If *force_scan* is *True*, this function will attempt to rewrite *all* + candidate reductions as scans and raise an error if this is not possible + (this is most useful combined with *insn_id_filter*). + + If *force_outer_iname_for_scan* is not *None*, this function will attempt + to realize candidate reductions as scans using the specified iname as the + outer (sweep) iname. 
""" logger.debug("%s: realize reduction" % kernel.name) @@ -295,29 +965,34 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() - from loopy.type_inference import TypeInferenceMapper - type_inf_mapper = TypeInferenceMapper(kernel) + # Dummy inames to remove after scans have been realized + inames_to_remove = set() + + inames_added_for_scan = set() # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtype, + def map_reduction_seq(expr, rec, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) - from pymbolic import var - acc_var_names = [ - var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_"+"_".join(expr.inames), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) - from loopy.kernel.data import TemporaryVariable, temp_var_scope + init_insn_depends_on = frozenset() - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -327,8 +1002,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), - expression=expr.operation.neutral_element(arg_dtype, expr.inames)) + 
depends_on=init_insn_depends_on, + expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) @@ -343,9 +1018,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): id=update_id, assignees=acc_vars, expression=expr.operation( - arg_dtype, - acc_vars if len(acc_vars) > 1 else acc_vars[0], - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, expr.exprs)), depends_on=frozenset([init_insn.id]) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final) @@ -382,7 +1057,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): v[iname].lt_set(v[0] + size)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtype, + def _make_slab_set_from_range(iname, lbound, ubound): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[iname].ge_set(v[0] + lbound) + & + v[iname].lt_set(v[0] + ubound)).get_basic_sets() + return bs + + def map_reduction_local(expr, rec, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -406,6 +1089,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): _get_int_iname_size(oiname) for oiname in outer_local_inames) + from loopy.kernel.data import temp_var_scope + + neutral_var_names = make_temporaries( + name_based_on="neutral_"+red_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+red_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + # {{{ add separate iname to carry out the reduction # Doing this sheds any odd conditionals that may be active @@ -417,32 +1118,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # }}} - neutral_var_names = [ - 
var_name_gen("neutral_"+red_iname) - for i in range(nresults)] - acc_var_names = [ - var_name_gen("acc_"+red_iname) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) - - from loopy.kernel.data import TemporaryVariable, temp_var_scope - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=outer_local_iname_sizes + (size,), - dtype=dtype, - scope=temp_var_scope.LOCAL) - for name, dtype in zip(neutral_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(arg_dtype, expr.inames) - + neutral = expr.operation.neutral_element(*arg_dtypes) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -455,12 +1133,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): depends_on=frozenset()) generated_insns.append(init_insn) - def _strip_if_scalar(c): - if len(acc_vars) == 1: - return c[0] - else: - return c - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) init_neutral_insn = make_assignment( id=init_neutral_id, @@ -478,9 +1150,11 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)), - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar( + expr.exprs, + tuple(var(nvn) for nvn in neutral_var_names)), + _strip_if_scalar(expr.exprs, expr.exprs)), within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -513,17 +1187,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + 
(var(stage_exec_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars)), - _strip_if_scalar(tuple( + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + ( var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars)), - expr.inames), + for acc_var in acc_vars))), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -548,30 +1221,371 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} - # {{{ seq/par dispatch + # {{{ utils (stateful) - def map_reduction(expr, rec, nresults=1): - # Only expand one level of reduction at a time, going from outermost to - # innermost. Otherwise we get the (iname + insn) dependencies wrong. + @memoize + def get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + tracking_iname): + domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) - try: - arg_dtype = type_inf_mapper(expr.expr) - except DependencyTypeInferenceFailure: - if unknown_types_ok: - arg_dtype = lp.auto + inames_added_for_scan.add(tracking_iname) + + new_domain = _create_domain_for_sweep_tracking(domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + + _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) + + return tracking_iname + + def replace_var_within_expr(expr, from_var, to_var): + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + temp_kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = 
RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({from_var: var(to_var)}), + within=lambda *args: True) + + return mapper(expr, temp_kernel, None) + + def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + var_names = [ + var_name_gen(name_based_on.format(index=i)) + for i in range(nvars)] + + from loopy.kernel.data import TemporaryVariable + + for name, dtype in zip(var_names, dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=shape, + dtype=dtype, + scope=scope) + + return var_names + + # }}} + + # {{{ sequential scan + + def map_scan_seq(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): + outer_insn_inames = temp_kernel.insn_inames(insn) + inames_to_remove.add(scan_iname) + + track_iname = var_name_gen( + "{sweep_iname}__seq_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride, track_iname) + + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_" + scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) + + init_id = insn_id_gen( + "%s_%s_init" % (insn.id, "_".join(expr.inames))) + + init_insn_depends_on = frozenset() + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset( + (sweep_iname,) + expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + expression=expr.operation.neutral_element(*arg_dtypes)) + + generated_insns.append(init_insn) + + 
updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) + + update_id = insn_id_gen( + based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | set([track_iname]) + + scan_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs)), + depends_on=frozenset([init_insn.id]) | insn.depends_on, + within_inames=update_insn_iname_deps, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final) + + generated_insns.append(scan_insn) + + new_insn_add_depends_on.add(scan_insn.id) - reduction_dtypes = (lp.auto,)*nresults + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0] + else: + return acc_vars + + # }}} + + # {{{ local-parallel scan + + def map_scan_local(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, + sweep_min_value, scan_min_value, stride): + + scan_size = _get_int_iname_size(sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + + outer_insn_inames = temp_kernel.insn_inames(insn) + from loopy.kernel.data import LocalIndexTagBase + outer_local_inames = tuple( + oiname + for oiname in outer_insn_inames + if isinstance( + kernel.iname_to_tag.get(oiname), + LocalIndexTagBase) + and oiname != sweep_iname) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(oiname) + for oiname in outer_local_inames) + + track_iname = var_name_gen( + "{sweep_iname}__pre_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + 
get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + track_iname) + + # {{{ add separate iname to carry out the scan + + # Doing this sheds any odd conditionals that may be active + # on our scan_iname. + + base_exec_iname = var_name_gen(sweep_iname + "__scan") + domains.append(_make_slab_set(base_exec_iname, scan_size)) + new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + + # }}} + + from loopy.kernel.data import temp_var_scope + + """ + neutral_var_names = make_temporaries( + name_based_on="neutral_"+scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + """ + + read_var_names = make_temporaries( + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+scan_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (scan_size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + read_vars = tuple(var(n) for n in read_var_names) + #neutral_vars = tuple(var(n) for n in neutral_var_names) + + base_iname_deps = (outer_insn_inames + - frozenset(expr.inames) - frozenset([sweep_iname])) + + neutral = expr.operation.neutral_element(*arg_dtypes) + + init_insn_depends_on = insn.depends_on + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on) + 
generated_insns.append(init_insn) + + updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) + + from loopy.symbolic import Reduction + + from loopy.symbolic import pw_aff_to_expr + sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) + + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + + (var(sweep_iname) - sweep_min_value_expr,)] + for acc_var in acc_vars), + expression=Reduction( + operation=expr.operation, + inames=(track_iname,), + exprs=updated_inner_exprs, + allow_simultaneous=False, + ), + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([init_id]) | insn.depends_on, + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + + generated_insns.append(transfer_insn) + + def _strip_if_scalar(c): + if len(acc_vars) == 1: + return c[0] else: - raise LoopyError("failed to determine type of accumulator for " - "reduction '%s'" % expr) + return c + + prev_id = transfer_id + + istage = 0 + cur_size = 1 + + while cur_size < scan_size: + stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) + domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (scan_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + if cur_size == 1: + # Performance hack: don't add a 
barrier here with transfer_insn. + # XXX: If the lowering logic changes, this could be brittle. + read_stage_insn = read_stage_insn.copy( + no_sync_with=( + read_stage_insn.no_sync_with + | frozenset([(transfer_id, "any")]))) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d" % (scan_iname, istage)) + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(read_vars) + ), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + ) + + generated_insns.append(write_stage_insn) + prev_id = write_stage_id + + #cur_size = new_size + #bound = cur_size + cur_size *= 2 + istage += 1 + + new_insn_add_depends_on.add(prev_id) + new_insn_add_within_inames.add(sweep_iname) + + output_idx = var(sweep_iname) - sweep_min_value_expr + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0][outer_local_iname_vars + (output_idx,)] else: - arg_dtype = arg_dtype.with_target(kernel.target) + return [acc_var[outer_local_iname_vars + (output_idx,)] + for acc_var in acc_vars] + + # }}} - reduction_dtypes = expr.operation.result_dtypes( - kernel, arg_dtype, expr.inames) - reduction_dtypes = tuple( - dt.with_target(kernel.target) for dt in reduction_dtypes) + # {{{ seq/par dispatch + + def map_reduction(expr, rec, nresults=1): + # Only expand one level of reduction at a time, going from outermost to + # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
+ + arg_dtypes, reduction_dtypes = ( + _infer_arg_dtypes_and_reduction_dtypes( + temp_kernel, expr, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -579,31 +1593,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - n_sequential = 0 - n_local_par = 0 + iname_classes = _classify_reduction_inames(temp_kernel, expr.inames) - from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) - for iname in expr.inames: - iname_tag = kernel.iname_to_tag.get(iname) + n_sequential = len(iname_classes.sequential) + n_local_par = len(iname_classes.local_parallel) + n_nonlocal_par = len(iname_classes.nonlocal_parallel) + + really_force_scan = force_scan and ( + len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan) + + def _error_if_force_scan_on(cls, msg): + if really_force_scan: + raise cls(msg) - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): - # These are nominally parallel, but we can live with - # them as sequential. - n_sequential += 1 + may_be_implemented_as_scan = False + if force_scan or automagic_scans_ok: + from loopy.diagnostic import ReductionIsNotTriangularError - elif isinstance(iname_tag, LocalIndexTagBase): - n_local_par += 1 + try: + # Try to determine scan candidate information (sweep iname, scan + # iname, etc). 
+ scan_param = _try_infer_scan_candidate_from_expr( + temp_kernel, expr, outer_insn_inames, + sweep_iname=force_outer_iname_for_scan) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname '%s' " - "tagged '%s'" - % (iname, type(iname_tag).__name__)) + except ValueError as v: + error = str(v) else: - n_sequential += 1 + # Ensures the reduction is triangular (somewhat expensive). + may_be_implemented_as_scan, error = ( + _check_reduction_is_triangular( + temp_kernel, expr, scan_param)) + + if not may_be_implemented_as_scan: + _error_if_force_scan_on(ReductionIsNotTriangularError, error) + + # {{{ sanity checks if n_local_par and n_sequential: raise LoopyError("Reduction over '%s' contains both parallel and " @@ -619,21 +1645,84 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "before code generation." % ", ".join(expr.inames)) - if n_sequential: - assert n_local_par == 0 - return map_reduction_seq(expr, rec, nresults, arg_dtype, - reduction_dtypes) - elif n_local_par: - return map_reduction_local(expr, rec, nresults, arg_dtype, - reduction_dtypes) - else: + if n_nonlocal_par: + bad_inames = iname_classes.nonlocal_parallel + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname(s) '%s' " + "respectively tagged '%s'" + % (", ".join(bad_inames), + ", ".join(kernel.iname_to_tag[iname] + for iname in bad_inames))) + + if n_local_par == 0 and n_sequential == 0: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "empty_reduction", "Empty reduction found (no inames to reduce over). " "Eliminating.") + # FIXME: return a neutral element. + return expr.expr + # }}} + + if may_be_implemented_as_scan: + assert force_scan or automagic_scans_ok + + # We require the "scan" iname to be tagged sequential. 
+ if n_sequential: + sweep_iname = scan_param.sweep_iname + sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) + + sequential = sweep_iname in sweep_class.sequential + parallel = sweep_iname in sweep_class.local_parallel + bad_parallel = sweep_iname in sweep_class.nonlocal_parallel + + if sweep_iname not in outer_insn_inames: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' was detected, but is not an iname " + "for the instruction." % sweep_iname) + elif bad_parallel: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' has an unsupported parallel tag '%s' " + "- the only parallelism allowed is 'local'." % + (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) + elif parallel: + return map_scan_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + elif sequential: + return map_scan_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + + # fallthrough to reduction implementation + + else: + assert n_local_par > 0 + scan_iname, = expr.inames + _error_if_force_scan_on(LoopyError, + "Scan iname '%s' is parallel tagged: this is not allowed " + "(only the sweep iname should be tagged if parallelism " + "is desired)." % scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + else: + assert n_local_par > 0 + return map_reduction_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + # }}} from loopy.symbolic import ReductionCallbackMapper @@ -739,6 +1828,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kernel = lp.tag_inames(kernel, new_iname_tags) + # TODO: remove unused inames... 
+ + kernel = ( + _hackily_ensure_multi_assignment_return_values_are_scoped_private( + kernel)) + return kernel # }}} diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5de677e72708be844a5276b3d40ace8b1dad9da0..e058fe30f1f6a9fc2746b4c542e88041f76818d3 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -23,10 +23,6 @@ THE SOFTWARE. """ from loopy.kernel.data import temp_var_scope -from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, - RunInstruction, Barrier) - -from pytools import memoize_method # {{{ block boundary finder @@ -37,6 +33,7 @@ def get_block_boundaries(schedule): :class:`loopy.schedule.BlockBeginItem`s to :class:`loopy.schedule.BlockEndItem`s and vice versa. """ + from loopy.schedule import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] for idx, sched_item in enumerate(schedule): @@ -51,109 +48,20 @@ def get_block_boundaries(schedule): # }}} -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernels(self): - return frozenset(self.subkernel_slices.keys()) - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def temporaries_read_in_subkernel(self, subkernel): 
- return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].read_dependency_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_written_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].assignee_var_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_read_or_written_in_subkernel(self, subkernel): - return ( - self.temporaries_read_in_subkernel(subkernel) | - self.temporaries_written_in_subkernel(subkernel)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - return frozenset(self.kernel.schedule[subkernel_start].extra_inames) - - @memoize_method - def pre_and_post_barriers(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - subkernel_end = self.subkernel_slices[subkernel].stop - - def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" - - try: - pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - pre_barrier = None - - try: - post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - post_barrier = None - - return (pre_barrier, post_barrier) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. 
- """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), - HardwareParallelTag)) - - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) +# {{{ subkernel tools + +def temporaries_read_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + + +def temporaries_written_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].write_dependency_names() + if tv in kernel.temporary_variables) # }}} @@ -166,23 +74,27 @@ def add_extra_args_to_schedule(kernel): instructions in the schedule with global temporaries. 
""" new_schedule = [] - - insn_query = InstructionQuery(kernel) + from loopy.schedule import CallKernel for sched_item in kernel.schedule: if isinstance(sched_item, CallKernel): - subrange_temporaries = (insn_query - .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + subkernel = sched_item.kernel_name + + used_temporaries = ( + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) + more_args = set(tv - for tv in subrange_temporaries - if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args) + for tv in used_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + extra_args=sched_item.extra_args + sorted(more_args))) else: new_schedule.append(sched_item) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 50c891be476810887720c4e13c9659966b431f5d..bbf798e5e1d146050d210a04c99af6cb33057087 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -95,7 +95,8 @@ class IdentityMapperMixin(object): new_inames.append(new_sym_iname.name) return Reduction( - expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + expr.operation, tuple(new_inames), + tuple(self.rec(e, *args) for e in expr.exprs), allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): @@ -144,7 +145,8 @@ class WalkMapper(WalkMapperBase): if not self.visit(expr): return - self.rec(expr.expr, *args) + for sub_expr in expr.exprs: + self.rec(sub_expr, *args) map_tagged_variable = WalkMapperBase.map_variable @@ -162,7 +164,7 @@ class CallbackMapper(CallbackMapperBase, IdentityMapper): class CombineMapper(CombineMapperBase): def map_reduction(self, expr): - return 
self.rec(expr.expr) + return self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) map_linear_subscript = CombineMapperBase.map_subscript @@ -192,9 +194,11 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): + from pymbolic.mapper.stringifier import PREC_NONE return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", - expr.operation, ", ".join(expr.inames), expr.expr) + expr.operation, ", ".join(expr.inames), + ", ".join(self.rec(e, PREC_NONE) for e in expr.exprs)) def map_tagged_variable(self, expr, prec): return "%s$%s" % (expr.name, expr.tag) @@ -225,7 +229,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): ): return [] - return self.rec(expr.expr, other.expr, unis) + return self.rec(expr.exprs, other.exprs, unis) def map_tagged_variable(self, expr, other, urecs): new_uni_record = self.unification_record_from_equation( @@ -258,7 +262,7 @@ class DependencyMapper(DependencyMapperBase): self.rec(child, *args) for child in expr.parameters) def map_reduction(self, expr): - return (self.rec(expr.expr) + return (self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) - set(p.Variable(iname) for iname in expr.inames)) def map_tagged_variable(self, expr): @@ -440,10 +444,10 @@ class Reduction(p.Expression): a list of inames across which reduction on :attr:`expr` is being carried out. - .. attribute:: expr + .. attribute:: exprs - The expression (as a :class:`pymbolic.primitives.Expression`) - on which reduction is performed. + A :class:`tuple` of :class:`pymbolic.primitives.Expression`, + representing the expression(s) over which reduction is performed. .. attribute:: allow_simultaneous @@ -451,9 +455,9 @@ class Reduction(p.Expression): in precisely one reduction, to avoid mis-nesting errors. 
""" - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + init_arg_names = ("operation", "inames", "exprs", "allow_simultaneous") - def __init__(self, operation, inames, expr, allow_simultaneous=False): + def __init__(self, operation, inames, exprs, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -475,26 +479,28 @@ class Reduction(p.Expression): from loopy.library.reduction import parse_reduction_op operation = parse_reduction_op(operation) + if not isinstance(exprs, tuple): + exprs = (exprs,) + from loopy.library.reduction import ReductionOperation assert isinstance(operation, ReductionOperation) self.operation = operation self.inames = inames - self.expr = expr + self.exprs = exprs self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr, self.allow_simultaneous) + return (self.operation, self.inames, self.exprs, self.allow_simultaneous) def get_hash(self): - return hash((self.__class__, self.operation, self.inames, - self.expr)) + return hash((self.__class__, self.operation, self.inames, self.exprs)) def is_equal(self, other): return (other.__class__ == self.__class__ and other.operation == self.operation and other.inames == self.inames - and other.expr == self.expr) + and other.exprs == self.exprs) def stringifier(self): return StringifyMapper @@ -924,7 +930,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. 
""" - def _parse_reduction(self, operation, inames, red_expr, + def _parse_reduction(self, operation, inames, red_exprs, allow_simultaneous=False): if isinstance(inames, p.Variable): inames = (inames,) @@ -941,7 +947,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr, + return Reduction(operation, tuple(processed_inames), red_exprs, allow_simultaneous=allow_simultaneous) def map_call(self, expr): @@ -991,12 +997,17 @@ class FunctionToPrimitiveMapper(IdentityMapper): operation = parse_reduction_op(name) if operation: - if len(expr.parameters) != 2: + # arg_count counts arguments but not inames + if len(expr.parameters) != 1 + operation.arg_count: raise RuntimeError("invalid invocation of " - "reduction operation '%s'" % expr.function.name) + "reduction operation '%s': expected %d arguments, " + "got %d instead" % (expr.function.name, + 1 + operation.arg_count, + len(expr.parameters))) - inames, red_expr = expr.parameters - return self._parse_reduction(operation, inames, self.rec(red_expr)) + inames = expr.parameters[0] + red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) + return self._parse_reduction(operation, inames, red_exprs) else: return IdentityMapper.map_call(self, expr) @@ -1385,7 +1396,7 @@ class IndexVariableFinder(CombineMapper): return result def map_reduction(self, expr): - result = self.rec(expr.expr) + result = self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) if not (expr.inames_set & result): raise RuntimeError("reduction '%s' does not depend on " diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 575311b11716f5a52e4713aa51922eb348c839d9..c6ff596b005bc782dbf33a1394e840c7f63ff2a5 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -683,7 +683,8 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule -def reduction_arg_to_subst_rule(knl, inames, 
insn_match=None, subst_rule_name=None): +def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None, + arg_number=0): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] @@ -693,12 +694,14 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No var_name_gen = knl.get_var_name_generator() + # XXX def map_reduction(expr, rec, nresults=1): if frozenset(expr.inames) != inames_set: + assert len(expr.exprs) == 1 return type(expr)( operation=expr.operation, inames=expr.inames, - expr=rec(expr.expr), + exprs=(rec(expr.exprs[0]),), allow_simultaneous=expr.allow_simultaneous) if subst_rule_name is None: @@ -715,15 +718,24 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No substs[my_subst_rule_name] = SubstitutionRule( name=my_subst_rule_name, arguments=tuple(inames), - expression=expr.expr) + expression=expr.exprs[arg_number]) from pymbolic import var iname_vars = [var(iname) for iname in inames] + new_exprs = [] + for sub_expr_number, sub_expr in enumerate(expr.exprs): + if sub_expr_number == arg_number: + new_exprs.append(var(my_subst_rule_name)(*iname_vars)) + else: + new_exprs.append(sub_expr) + + new_exprs = tuple(new_exprs) + return type(expr)( operation=expr.operation, inames=expr.inames, - expr=var(my_subst_rule_name)(*iname_vars), + exprs=new_exprs, allow_simultaneous=expr.allow_simultaneous) from loopy.symbolic import ReductionCallbackMapper diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c35b5064365293ac78cdd01af537c9d28bd67193..300faa638dd0434ac92507b3a26c39b8817ede83 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -145,7 +145,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return 
super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -1191,13 +1192,15 @@ class _ReductionSplitter(RuleAwareIdentityMapper): if self.direction == "in": return Reduction(expr.operation, tuple(leftover_inames), Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous), expr.allow_simultaneous) elif self.direction == "out": return Reduction(expr.operation, tuple(self.inames), Reduction(expr.operation, tuple(leftover_inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous)) else: assert False @@ -1589,10 +1592,11 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec( - SubstitutionMapper(make_subst_func(subst_dict))( - expr.expr), - expn_state), + tuple(self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + sub_expr), + expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return super(_ReductionInameUniquifier, self).map_reduction( diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 7c9c9688604179dce2aa7dcd6954d76a0df32cc7..410274f907b250bb81583281ebe503e8c8c80373 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match): match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] - # }}} @@ -171,6 +170,7 @@ def replace_instruction_ids(kernel, replacements): for insn in kernel.instructions: changed = False new_depends_on = [] + new_no_sync_with = [] for dep in insn.depends_on: if dep in replacements: @@ -179,8 +179,18 @@ def replace_instruction_ids(kernel, replacements): else: new_depends_on.append(dep) + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + 
new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) + new_insns.append( - insn.copy(depends_on=frozenset(new_depends_on)) + insn.copy( + depends_on=frozenset(new_depends_on), + no_sync_with=frozenset(new_no_sync_with)) if changed else insn) return kernel.copy(instructions=new_insns) @@ -207,4 +217,77 @@ def tag_instructions(kernel, new_tag, within=None): # }}} +# {{{ add nosync + +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): + """Add a *no_sync_with* directive between *source* and *sink*. + *no_sync_with* is only added if a (syntactic) dependency edge + is present or if the instruction pair is in a conflicting group + (this does not check for memory dependencies). + + :arg kernel: + :arg source: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg sink: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg scope: A string which is a valid *no_sync_with* scope. + :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* + to both the source and sink instructions, otherwise the directive + is only added to the sink instructions. + :arg force: A :class:`bool`. If *True*, will add a *no_sync_with* + even without the presence of a syntactic dependency edge/ + conflicting instruction group. 
+ + :return: The updated kernel + """ + + if isinstance(source, str) and source in kernel.id_to_insn: + sources = frozenset([source]) + else: + sources = frozenset( + source.id for source in find_instructions(kernel, source)) + + if isinstance(sink, str) and sink in kernel.id_to_insn: + sinks = frozenset([sink]) + else: + sinks = frozenset( + sink.id for sink in find_instructions(kernel, sink)) + + def insns_in_conflicting_groups(insn1_id, insn2_id): + insn1 = kernel.id_to_insn[insn1_id] + insn2 = kernel.id_to_insn[insn2_id] + return ( + bool(insn1.groups & insn2.conflicts_with_groups) + or + bool(insn2.groups & insn1.conflicts_with_groups)) + + from collections import defaultdict + nosync_to_add = defaultdict(set) + + for sink in sinks: + for source in sources: + + needs_nosync = force or ( + source in kernel.recursive_insn_dep_map()[sink] + or insns_in_conflicting_groups(source, sink)) + + if not needs_nosync: + continue + + nosync_to_add[sink].add((source, scope)) + if bidirectional: + nosync_to_add[source].add((sink, scope)) + + new_instructions = list(kernel.instructions) + + for i, insn in enumerate(new_instructions): + if insn.id in nosync_to_add: + new_instructions[i] = insn.copy(no_sync_with=insn.no_sync_with + | frozenset(nosync_to_add[insn.id])) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..c63ac9d72a93994f7d6efd7545395611b1ec3279 --- /dev/null +++ b/loopy/transform/reduction.py @@ -0,0 +1,704 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +import islpy as isl + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: make_two_level_reduction +.. autofunction:: make_two_level_scan +""" + + +def make_two_level_reduction( + kernel, insn_id, inner_length, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_tag=None, + inner_tag=None): + """ + Two level reduction, mediated through a "nonlocal" array. + + This turns a reduction of the form:: + + [...] result = reduce(i, f(i)) + + into:: + + i -> inner + inner_length * outer + + [..., nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + [...] 
result = reduce(outer, nonlocal[outer]) + """ + + # {{{ sanity checks + + reduction = kernel.id_to_insn[insn_id].expression + reduction_iname, = reduction.inames + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + + format_kwargs = {"insn": insn_id, "iname": reduction_iname} + + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + inner_iname = var_name_gen( + "{iname}_inner".format(**format_kwargs)) + outer_iname = var_name_gen( + "{iname}_outer".format(**format_kwargs)) + nonlocal_iname = var_name_gen( + "{iname}_nonlocal".format(**format_kwargs)) + + inner_subst = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + # }}} + + # First we split this iname. This results in (roughly) + # + # [...] result = reduce([outer, inner], f(outer, inner)) + # + # FIXME: within + + kernel = lp.split_iname(kernel, reduction_iname, inner_length, + outer_iname=outer_iname, inner_iname=inner_iname) + + # Next, we split the reduction inward and then extract a substitution + # rule for the reduction. This results in + # + # subst(outer) := reduce(inner, f(outer, inner)) + # [...] result = reduce([outer], subst(outer)) + # + # FIXME: within, insn_match... + + kernel = lp.split_reduction_inward(kernel, inner_iname) + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule(kernel, outer_iname, + subst_rule_name=inner_subst) + + # Next, we precompute the inner iname into its own storage. + + # [...,nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + # [...] 
result = reduce([outer], nonlocal[outer]) + + kernel = lp.precompute(kernel, inner_subst, + sweep_inames=[outer_iname], + precompute_inames=[nonlocal_iname], + temporary_name=nonlocal_storage_name, + temporary_scope=nonlocal_storage_scope) + + return kernel + + +def _update_instructions(kernel, id_to_new_insn, copy=True): + if not isinstance(id_to_new_insn, dict): + id_to_new_insn = dict((insn.id, insn) for insn in id_to_new_insn) + + new_instructions = ( + list(insn for insn in kernel.instructions + if insn.id not in id_to_new_insn) + + list(id_to_new_insn.values())) + + if copy: + kernel = kernel.copy() + + kernel.instructions = new_instructions + return kernel + + +def _make_slab_set(iname, size): + # FIXME: There is a very similar identically named function in + # preprocess. Refactor. + + if not isinstance(size, (isl.PwAff, isl.Aff)): + from loopy.symbolic import pwaff_from_expr + size = pwaff_from_expr( + isl.Space.params_alloc(isl.DEFAULT_CONTEXT, 0), size) + + base_space = size.get_domain_space() + + space = (base_space + .add_dims(isl.dim_type.set, 1) + .set_dim_name(isl.dim_type.set, base_space.dim(isl.dim_type.set), iname)) + + v = isl.affs_from_space(space) + + size = isl.align_spaces(size, v[0]) + + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + + return bs + + +def _add_subdomain_to_kernel(kernel, subdomain): + domains = list(kernel.domains) + # Filter out value parameters. 
+ dep_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + + indices = kernel.get_leaf_domain_indices(dep_inames) + + if len(indices) == 0: + domains.append(subdomain) + elif len(indices) == 1: + idx, = indices + domains.insert(idx + 1, subdomain) + else: + print(indices) + raise ValueError("more than 1 leaf index") + + return kernel.copy(domains=domains) + + +def _add_scan_subdomain( + kernel, scan_iname, sweep_iname): + """ + Add the following domain to the kernel:: + + [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } + """ + sp = ( + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) + .set_dim_name(isl.dim_type.param, 0, sweep_iname) + .set_dim_name(isl.dim_type.set, 0, scan_iname)) + + affs = isl.affs_from_space(sp) + + subd, = ( + affs[scan_iname].le_set(affs[sweep_iname]) + & + affs[scan_iname].ge_set(affs[0])).get_basic_sets() + + return _add_subdomain_to_kernel(kernel, subd) + + +def _expand_subst_within_expression(kernel, expr): + from loopy.symbolic import ( + RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + submap = RuleAwareSubstitutionRuleExpander( + rule_mapping_context, + kernel.substitutions, + within=lambda *args: True + ) + return submap(expr, kernel, insn=None) + + +def _add_global_barrier(kernel, source, sink, barrier_id): + from loopy.kernel.instruction import BarrierInstruction + + sources = (source,) if isinstance(source, str) else source + sinks = (sink,) if isinstance(sink, str) else sink + + within_inames = kernel.id_to_insn[sources[0]].within_inames + from itertools import chain + for iname in chain(sources[1:], sinks): + within_inames &= kernel.id_to_insn[iname].within_inames + + barrier_insn = BarrierInstruction( + id=barrier_id, + depends_on=frozenset(sources), + within_inames=within_inames, + kind="global") + + sink_insns = 
(kernel.id_to_insn[sink] for sink in sinks) + updated_sinks = ( + sink.copy(depends_on=sink.depends_on | frozenset([barrier_id])) + for sink in sink_insns) + + kernel = _update_instructions( + kernel, chain([barrier_insn], updated_sinks), copy=True) + + return kernel + + +def _get_scan_level(sweep_iname): + SWEEP_RE = r".*__l(\d+)(?:_outer)?" # noqa + + import re + match_result = re.match(SWEEP_RE, sweep_iname) + + if match_result is None: + return 0 + + return int(match_result.group(1)) + + +def _get_base_iname(iname): + BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" # noqa + + import re + match_result = re.match(BASE_INAME_RE, iname) + + if match_result is None: + return iname + + return match_result.group(1) + + +def make_two_level_scan( + kernel, insn_id, + scan_iname, + sweep_iname, + inner_length, + local_storage_name=None, + local_storage_scope=None, + local_storage_axes=None, + nonlocal_storage_name=None, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_local_tag=None, + inner_local_tag=None, + inner_tag=None, + outer_tag=None, + inner_iname=None, + outer_iname=None): + """ + Two level scan, mediated through a "local" and "nonlocal" array. + + This turns a scan of the form:: + + [...,i] result = reduce(j, f(j)) + + into:: + + [...,l',l''] + [...,nlinit] nonlocal[0] = 0 + [...,nlinit] nonlocal[nlinit+1] = local[nlinit,-1] + [...,nl] + [...,i',i''] result = nonlocal[i'] + local[i',i''] + """ + + # {{{ sanity checks + + # FIXME: More sanity checks... 
+
+    insn = kernel.id_to_insn[insn_id]
+    scan = insn.expression
+    assert scan.inames[0] == scan_iname
+    assert len(scan.inames) == 1
+
+    # }}}
+
+    # {{{ get stable names for everything
+
+    var_name_gen = kernel.get_var_name_generator()
+    insn_id_gen = kernel.get_instruction_id_generator()
+
+    level = _get_scan_level(sweep_iname)
+    base_scan_iname = _get_base_iname(scan_iname)
+    base_sweep_iname = _get_base_iname(sweep_iname)
+    base_insn_id = _get_base_iname(insn_id)
+
+    format_kwargs = {
+        "insn": base_insn_id,
+        "iname": base_scan_iname,
+        "sweep": base_sweep_iname,
+        "level": level,
+        "next_level": level + 1}
+
+    if inner_iname is None:
+        inner_iname = var_name_gen(
+            "{sweep}__l{level}".format(**format_kwargs))
+    else:
+        var_name_gen.add_name(inner_iname)
+
+    if outer_iname is None:
+        outer_iname = var_name_gen(
+            "{sweep}__l{level}_outer".format(**format_kwargs))
+    else:
+        var_name_gen.add_name(outer_iname)
+
+    """
+    nonlocal_init_head_outer_iname = var_name_gen(
+        "{sweep}__l{level}_nlhead_outer".format(**format_kwargs))
+
+    nonlocal_init_head_inner_iname = var_name_gen(
+        "{sweep}__l{level}_nlhead_inner".format(**format_kwargs))
+    """
+
+    nonlocal_init_tail_outer_iname = var_name_gen(
+        "{sweep}__l{level}_nltail_outer".format(**format_kwargs))
+
+    nonlocal_init_tail_inner_iname = var_name_gen(
+        "{sweep}__l{level}_nltail_inner".format(**format_kwargs))
+
+    nonlocal_iname = var_name_gen(
+        "{sweep}__l{level}_nonloc".format(**format_kwargs))
+
+    inner_local_iname = var_name_gen(
+        "{sweep}__l{next_level}".format(**format_kwargs))
+
+    inner_scan_iname = var_name_gen(
+        "{iname}__l{next_level}".format(**format_kwargs))
+
+    outer_local_iname = var_name_gen(
+        "{sweep}__l{next_level}_outer".format(**format_kwargs))
+
+    outer_scan_iname = var_name_gen(
+        "{iname}__l{level}".format(**format_kwargs))
+
+    subst_name = var_name_gen(
+        "{insn}_inner_subst".format(**format_kwargs))
+
+    local_subst_name = var_name_gen(
+        "{insn}_local_subst".format(**format_kwargs))
+
+    if
local_storage_name is None: + local_storage_name = var_name_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + else: + var_name_gen.add_name(local_storage_name) + + if nonlocal_storage_name is None: + nonlocal_storage_name = var_name_gen( + "{insn}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_name(nonlocal_storage_name) + + local_scan_insn_id = insn_id_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + + nonlocal_scan_insn_id = insn_id_gen( + "{insn}__l{level}".format(**format_kwargs)) + + format_kwargs.update({"nonlocal": nonlocal_storage_name}) + + nonlocal_init_head_insn_id = insn_id_gen( + "{nonlocal}_init_head".format(**format_kwargs)) + + nonlocal_init_tail_insn_id = insn_id_gen( + "{nonlocal}_init_tail".format(**format_kwargs)) + + # }}} + + # {{{ utils + + if local_storage_axes is None: + local_storage_axes = (outer_iname, inner_iname) + + def pick_out_relevant_axes(full_indices, strip_scalar=False): + assert len(full_indices) == 2 + iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + + result = [] + for iname in local_storage_axes: + result.append(iname_to_index[iname]) + + assert len(result) > 0 + + return (tuple(result) + if not (strip_scalar and len(result) == 1) + else result[0]) + + # }}} + + # {{{ prepare for two level scan + + # Turn the scan into a substitution rule, replace the original scan with a + # nop and delete the scan iname. + # + # (The presence of the scan iname seems to be making precompute very confused.) 
+ + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule( + kernel, scan_iname, subst_rule_name=subst_name) + + kernel = _update_instructions(kernel, {insn_id: insn.copy(expression=0)}) + + """ + from loopy.kernel.instruction import NoOpInstruction + {insn_id: NoOpInstruction( + id=insn_id, + depends_on=insn.depends_on, + groups=insn.groups, + conflicts_with_groups=insn.groups, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final, + within_inames=insn.within_inames, + priority=insn.priority, + boostable=insn.boostable, + boostable_into=insn.boostable_into, + predicates=insn.predicates, + tags=insn.tags)}, + copy=False) + """ + + kernel = lp.remove_unused_inames(kernel, inames=(scan_iname,)) + + # Make sure we got rid of everything + assert scan_iname not in kernel.all_inames() + + # }}} + + # {{{ implement local scan + + from pymbolic import var + + # FIXME: This can probably be done using split_reduction_inward() + # and will end up looking as less of a mess that way. 
+ + local_scan_expr = _expand_subst_within_expression(kernel, + var(subst_name)(var(outer_iname) * inner_length + + var(inner_scan_iname))) + + kernel = lp.split_iname(kernel, sweep_iname, inner_length, + inner_iname=inner_iname, outer_iname=outer_iname, + inner_tag=inner_tag, outer_tag=outer_tag) + + from loopy.kernel.data import SubstitutionRule + from loopy.symbolic import Reduction + + local_subst = SubstitutionRule( + name=local_subst_name, + arguments=(outer_iname, inner_iname), + expression=Reduction( + scan.operation, (inner_scan_iname,), local_scan_expr)) + + substitutions = kernel.substitutions.copy() + substitutions[local_subst_name] = local_subst + + kernel = kernel.copy(substitutions=substitutions) + + outer_local_iname = outer_iname + + all_precompute_inames = (outer_local_iname, inner_local_iname) + + precompute_inames = pick_out_relevant_axes(all_precompute_inames) + sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + + storage_axis_to_tag = { + outer_iname: outer_local_tag, + inner_iname: inner_local_tag, + outer_local_iname: outer_local_tag, + inner_local_iname: inner_local_tag} + + precompute_outer_inames = ( + frozenset(all_precompute_inames) - frozenset(precompute_inames)) + + insn = kernel.id_to_insn[insn_id] + within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) + + from pymbolic import var + + kernel = lp.precompute(kernel, + [var(local_subst_name)( + var(outer_iname), var(inner_iname))], + sweep_inames=sweep_inames, + precompute_inames=precompute_inames, + storage_axes=local_storage_axes, + storage_axis_to_tag=storage_axis_to_tag, + precompute_outer_inames=precompute_outer_inames | within_inames, + temporary_name=local_storage_name, + compute_insn_id=local_scan_insn_id) + + compute_insn_with_deps = kernel.id_to_insn[local_scan_insn_id] + compute_insn_with_deps = compute_insn_with_deps.copy( + depends_on=compute_insn_with_deps.depends_on | insn.depends_on) + + kernel = _update_instructions(kernel, 
(compute_insn_with_deps,)) + + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + + # }}} + + # {{{ implement local to nonlocal information transfer + + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + + local_storage_local_axis_len = ( + kernel.temporary_variables[local_storage_name].shape[-1]) + + nonlocal_storage_len_pw_aff = static_max_of_pw_aff( + kernel.get_iname_bounds(outer_iname).size, + constants_only=False) + + # FIXME: this shouldn't have to have an extra element. + nonlocal_storage_len = pw_aff_to_expr(1 + nonlocal_storage_len_pw_aff) + + nonlocal_tail_inner_subd = _make_slab_set(nonlocal_init_tail_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_inner_subd) + nonlocal_tail_outer_subd = _make_slab_set( + nonlocal_init_tail_outer_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_outer_subd) + + """ + nonlocal_head_inner_subd = _make_slab_set(nonlocal_init_head_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_inner_subd) + nonlocal_head_outer_subd = _make_slab_set(nonlocal_init_head_outer_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) + """ + + kernel = lp.tag_inames(kernel, { + #nonlocal_init_head_outer_iname: outer_local_tag, + #nonlocal_init_head_inner_iname: inner_local_tag, + nonlocal_init_tail_outer_iname: outer_local_tag, + nonlocal_init_tail_inner_iname: inner_local_tag}) + + if nonlocal_storage_name not in kernel.temporary_variables: + + from loopy.kernel.data import TemporaryVariable + new_temporary_variables = kernel.temporary_variables.copy() + + new_temporary_variables[nonlocal_storage_name] = ( + TemporaryVariable( + nonlocal_storage_name, + shape=(nonlocal_storage_len,), + scope=nonlocal_storage_scope, + base_indices=lp.auto, + dtype=lp.auto)) + + kernel = kernel.copy(temporary_variables=new_temporary_variables) + + from 
loopy.kernel.instruction import make_assignment + + # FIXME: neutral element... + nonlocal_init_head = make_assignment( + id=nonlocal_init_head_insn_id, + assignees=(var(nonlocal_storage_name)[0],), + expression=0, + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + no_sync_with=frozenset([(nonlocal_init_tail_insn_id, "any")]), + predicates=(var(nonlocal_init_tail_inner_iname).eq(0), + var(nonlocal_init_tail_outer_iname).eq(0)), + depends_on=frozenset([local_scan_insn_id])) + + nonlocal_init_tail = make_assignment( + id=nonlocal_init_tail_insn_id, + assignees=( + var(nonlocal_storage_name)[ + var(nonlocal_init_tail_outer_iname) + 1],), + expression=var(local_storage_name)[ + pick_out_relevant_axes( + (var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1), + strip_scalar=True)], + no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + depends_on=frozenset([local_scan_insn_id])) + + kernel = _update_instructions( + kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + + # The write race warnings are spurious - the inner iname is length + # 1, so there's really no write race at all here. 
+ kernel = kernel.copy( + silenced_warnings=kernel.silenced_warnings + + ["write_race(%s)" % nonlocal_init_tail_insn_id] + + ["write_race(%s)" % nonlocal_init_head_insn_id]) + + # }}} + + # {{{ implement nonlocal scan + + subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, subd) + + if nonlocal_tag is not None: + kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) + + kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + + nonlocal_scan = make_assignment( + id=nonlocal_scan_insn_id, + assignees=(var(nonlocal_storage_name)[var(nonlocal_iname)],), + expression=Reduction( + scan.operation, + (outer_scan_iname,), + var(nonlocal_storage_name)[var(outer_scan_iname)]), + within_inames=within_inames | frozenset([nonlocal_iname]), + depends_on=( + frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) + + kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_init_barrier".format(**format_kwargs)) + kernel = _add_global_barrier(kernel, + source=(nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id), + sink=nonlocal_scan_insn_id, + barrier_id=barrier_id) + + # }}} + + # {{{ replace scan with local + nonlocal + + updated_depends_on = (insn.depends_on + | frozenset([nonlocal_scan_insn_id, local_scan_insn_id])) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_scan_barrier".format(**format_kwargs)) + kernel = (_add_global_barrier(kernel, + source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) + updated_depends_on |= frozenset([barrier_id]) + + nonlocal_part = var(nonlocal_storage_name)[var(outer_iname)] + + local_part = var(local_storage_name)[ + pick_out_relevant_axes( + (var(outer_iname), var(inner_iname)), strip_scalar=True)] + + updated_insn = insn.copy( + no_sync_with=( + 
insn.no_sync_with + | frozenset([(nonlocal_scan_insn_id, "any")])), + depends_on=updated_depends_on, + # XXX: scan binary op + expression=nonlocal_part + local_part) + + kernel = _update_instructions(kernel, (updated_insn,), copy=False) + + # }}} + + return kernel + + +# vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 8afc1695a38a37baf165f6ec6ef6567e2012173b..fa98f478d7e09bd77a3dafcd0327c5d6dce737d0 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -32,7 +32,7 @@ from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) -from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) +from loopy.schedule.tools import get_block_boundaries import logging @@ -193,13 +193,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. + The name of original temporary variable object. .. attribute:: hw_dims @@ -207,6 +203,10 @@ class TemporarySaver(object): of the promoted temporary value, corresponding to hardware dimensions + .. attribute:: hw_tags + + The tags for the inames associated with hw_dims + .. 
attribute:: non_hw_dims A list of expressions, to be added in front of the shape @@ -214,9 +214,11 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", + "non_hw_dims"] + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -230,16 +232,172 @@ class TemporarySaver(object): def __init__(self, kernel): self.kernel = kernel - self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - self.saves_or_reloads_added = {} + + # temporary name -> save or reload insn ids + from collections import defaultdict + self.temporary_to_save_ids = defaultdict(set) + self.temporary_to_reload_ids = defaultdict(set) + self.subkernel_to_newly_added_insn_ids = defaultdict(set) + + # Maps names of base_storage to the name of the temporary + # representative chosen for saves/reloads + self.base_storage_to_representative = {} + + from loopy.kernel.data import ValueArg + import islpy as isl + self.new_subdomain = ( + isl.BasicSet.universe( + isl.Space.create_from_names( + isl.DEFAULT_CONTEXT, + set=[], + params=set( + arg.name for arg in kernel.args + if isinstance(arg, ValueArg))))) + + @property + @memoize_method + def subkernel_to_slice_indices(self): + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + start_idx = sched_item_idx + elif isinstance(sched_item, ReturnFromKernel): + result[sched_item.kernel_name] = (start_idx, 1 + 
sched_item_idx) + + return result + + @property + @memoize_method + def subkernel_to_surrounding_inames(self): + current_outer_inames = set() + within_subkernel = False + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + within_subkernel = True + result[sched_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + elif isinstance(sched_item, EnterLoop): + if not within_subkernel: + current_outer_inames.add(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + current_outer_inames.discard(sched_item.iname) + + return result + + @memoize_method + def get_enclosing_global_barrier_pair(self, subkernel): + subkernel_start, subkernel_end = ( + self.subkernel_to_slice_indices[subkernel]) + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): + """ + This is used for determining the amount of global storage needed for saving + and restoring the temporary across kernel calls, due to hardware + parallel inames (the inferred axes get prefixed to the number of + dimensions in the temporary). + + In the case of local temporaries, inames that are tagged + hw-local do not contribute to the global storage shape. 
+ """ + accessor_insn_ids = frozenset( + self.kernel.reader_map()[temporary.name] + | self.kernel.writer_map()[temporary.name]) + + group_tags = None + local_tags = None + + def _sortedtags(tags): + return sorted(tags, key=lambda tag: tag.axis) + + for insn_id in accessor_insn_ids: + insn = self.kernel.id_to_insn[insn_id] + + my_group_tags = [] + my_local_tags = [] + + for iname in insn.within_inames: + tag = self.kernel.iname_to_tag.get(iname) + + if tag is None: + continue + + from loopy.kernel.data import ( + GroupIndexTag, LocalIndexTag, ParallelTag) + + if isinstance(tag, GroupIndexTag): + my_group_tags.append(tag) + elif isinstance(tag, LocalIndexTag): + my_local_tags.append(tag) + elif isinstance(tag, ParallelTag): + raise ValueError( + "iname '%s' is tagged with '%s' - only " + "group and local tags are supported for " + "auto save/reload of temporaries" % + (iname, tag)) + + if group_tags is None: + group_tags = _sortedtags(my_group_tags) + local_tags = _sortedtags(my_local_tags) + group_tags_originating_insn_id = insn_id + + if ( + group_tags != _sortedtags(my_group_tags) + or local_tags != _sortedtags(my_local_tags)): + raise ValueError( + "inconsistent parallel tags across instructions that access " + "'%s' (specifically, instruction '%s' has tags '%s' but " + "instruction '%s' has tags '%s')" + % (temporary.name, + group_tags_originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) + + if group_tags is None: + assert local_tags is None + return (), () + + group_sizes, local_sizes = ( + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + + if temporary.scope == lp.temp_var_scope.LOCAL: + # Elide local axes in the save slot for local temporaries. + del local_tags[:] + local_sizes = () + + # We set hw_dims to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... 
+ return (group_sizes + local_sizes), tuple(group_tags + local_tags) @memoize_method def auto_promote_temporary(self, temporary_name): @@ -255,52 +413,16 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = self.insn_query.common_hw_inames( - self.insn_query.insns_reading_or_writing(temporary.name)) - - # We want hw_inames to be arranged according to the order: - # g.0 < g.1 < ... < l.0 < l.1 < ... - # Sorting lexicographically accomplishes this. - hw_inames = sorted(hw_inames, - key=lambda iname: str(self.kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - hw_dims = [] - - backing_hw_inames = [] - - for iname in hw_inames: - tag = self.kernel.iname_to_tag[iname] - from loopy.kernel.data import LocalIndexTag - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. 
- continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - hw_dims.append( - aff_to_expr( - static_max_of_pw_aff( - self.kernel.get_iname_bounds(iname).size, False))) + base_storage_conflict = ( + self.base_storage_to_representative.get( + temporary.base_storage, temporary) is not temporary) + + if base_storage_conflict: + raise NotImplementedError( + "tried to save/reload multiple temporaries with the " + "same base_storage; this is currently not supported") + hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape if len(non_hw_dims) == 0 and len(hw_dims) == 0: @@ -308,11 +430,15 @@ class TemporarySaver(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, - hw_dims=tuple(hw_dims), - non_hw_dims=non_hw_dims, - hw_inames=backing_hw_inames) + name=self.var_name_gen(temporary.name + "__save_slot"), + orig_temporary_name=temporary.name, + hw_dims=hw_dims, + hw_tags=hw_tags, + non_hw_dims=non_hw_dims) + + if temporary.base_storage is not None: + self.base_storage_to_representative[temporary.base_storage] = ( + backing_temporary) return backing_temporary @@ -326,23 +452,16 @@ class TemporarySaver(object): if promoted_temporary is None: return - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger( - self.kernel, - frozenset( - self.insn_query.inames_in_subkernel(subkernel) | - set(promoted_temporary.hw_inames))) - - domain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = ( self.augment_domain_for_save_or_reload( - dchg.domain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel)) - self.kernel = dchg.get_kernel_with(domain) + self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( 
"{name}.{mode}".format(name=temporary, mode=mode)) - def subscript_or_var(agg, subscript=()): + def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -351,20 +470,24 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( - subscript_or_var( - temporary, dim_inames_trunc), - subscript_or_var( - promoted_temporary.name, hw_inames + dim_inames)) + add_subscript_if_subscript_nonempty( + temporary, subscript=dim_inames_trunc), + add_subscript_if_subscript_nonempty( + promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": args = reversed(args) - accessing_insns_in_subkernel = ( - self.insn_query.insns_reading_or_writing(temporary) & - self.insn_query.insns_in_subkernel(subkernel)) + accessing_insns_in_subkernel = (frozenset( + self.kernel.reader_map()[temporary] + | self.kernel.writer_map()[temporary]) + & self.kernel.subkernel_to_insn_ids[subkernel]) if mode == "save": depends_on = accessing_insns_in_subkernel @@ -373,7 +496,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) @@ -387,16 +510,19 @@ class TemporarySaver(object): *args, id=save_or_load_insn_id, within_inames=( - self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames)), + self.subkernel_to_surrounding_inames[subkernel] + | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, 
boostable=False, boostable_into=frozenset()) - if temporary not in self.saves_or_reloads_added: - self.saves_or_reloads_added[temporary] = set() - self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + if mode == "save": + self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) + else: + self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id) + + self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id) self.insns_to_insert.append(save_or_load_insn) @@ -405,8 +531,8 @@ class TemporarySaver(object): self.insns_to_update[insn_id] = insn.copy( depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) - self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + self.updated_temporary_variables[promoted_temporary.name] = ( + promoted_temporary.as_kernel_temporary(self.kernel)) self.updated_iname_to_tag.update(iname_to_tag) @@ -416,15 +542,6 @@ class TemporarySaver(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and saves - from six import iteritems - for temporary, added_insns in iteritems(self.saves_or_reloads_added): - for insn_id in added_insns: - insn = insns_to_insert[insn_id] - insns_to_insert[insn_id] = insn.copy( - no_sync_with=frozenset( - (added_insn, "global") for added_insn in added_insns)) - for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) @@ -436,12 +553,30 @@ class TemporarySaver(object): self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) + new_domains = list(self.kernel.domains) + import islpy as isl + if self.new_subdomain.dim(isl.dim_type.set) > 0: + new_domains.append(self.new_subdomain) + kernel = self.kernel.copy( + domains=new_domains, instructions=new_instructions, 
iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) + # Add nosync directives to any saves or reloads that were added with a + # potential dependency chain. + for subkernel in self.kernel.subkernels: + relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel] + + from itertools import product + for temporary in self.temporary_to_reload_ids: + for source, sink in product( + relevant_insns & self.temporary_to_reload_ids[temporary], + relevant_insns & self.temporary_to_save_ids[temporary]): + kernel = lp.add_nosync(kernel, "global", source, sink) + from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) @@ -456,22 +591,28 @@ class TemporarySaver(object): """ Add new axes to the domain corresponding to the dimensions of `promoted_temporary`. These axes will be used in the save/ - reload stage. + reload stage. These get prefixed onto the already existing axes. """ assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames iname_to_tag = {} + from loopy.symbolic import aff_from_expr + # FIXME: Restrict size of new inames to access footprint. # Add dimension-dependent inames. dim_inames = [] - domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + domain = domain.add(isl.dim_type.set, + len(promoted_temporary.non_hw_dims) + + len(promoted_temporary.hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". @@ -493,25 +634,31 @@ class TemporarySaver(object): # Add size information. 
aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) - # FIXME: Use promoted_temporary.hw_inames - hw_inames = [] + dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + hw_inames = [] + # Add hardware dims. + for hw_iname_idx, (hw_tag, dim) in enumerate( + zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)): new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=t_idx, + dim=hw_iname_idx, sk=subkernel)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + domain = domain.set_dim_name( + isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) + aff = isl.affs_from_space(domain.space) + domain = (domain + & + aff[0].le_set(aff[new_iname]) + & + aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) + + self.updated_iname_to_tag[new_iname] = hw_tag + hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the # underlying domain should be expressible as a single BasicSet. 
@@ -551,7 +698,8 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - insn_query = InstructionQuery(knl) + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, temporaries_written_in_subkernel) for sched_idx, sched_item in enumerate(knl.schedule): @@ -562,9 +710,10 @@ def save_and_reload_temporaries(knl): # Kernel entry: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -576,9 +725,9 @@ def save_and_reload_temporaries(knl): # Kernel exit: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {0} before return of {1}" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4c1e423e93e104fecd0b49a2b1ef2b4a261e38e7..da7a5b52d227a1b9262ac5755d38c6e66c42dfe8 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -352,28 +352,30 @@ class TypeInferenceMapper(CombineMapper): return [self.kernel.index_dtype] def map_reduction(self, expr, return_tuple=False): - rec_result = self.rec(expr.expr) - - if rec_result: - rec_result, = rec_result - result = expr.operation.result_dtypes( - self.kernel, rec_result, expr.inames) - else: - result = expr.operation.result_dtypes( - self.kernel, None, expr.inames) + """ + :arg return_tuple: If *True*, treat the type of the reduction expression + as a tuple type. 
Otherwise, the number of expressions being reduced over + must equal 1, and the type of the first expression is returned. + """ + rec_results = tuple(self.rec(sub_expr) for sub_expr in expr.exprs) - if result is None: + if any(len(rec_result) == 0 for rec_result in rec_results): return [] if return_tuple: - return [result] + from itertools import product + return list( + expr.operation.result_dtypes(self.kernel, *product_element) + for product_element in product(*rec_results)) else: - if len(result) != 1 and not return_tuple: + if len(rec_results) != 1: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct assignments") - return [result[0]] + return list( + expr.operation.result_dtypes(self.kernel, rec_result)[0] + for rec_result in rec_results[0]) # }}} diff --git a/loopy/version.py b/loopy/version.py index 5c6ad47f8571ceb9100f4f7f8dece9d80a35d10c..77d0e21bdd2ef5383c5f874656c25fe1ede21a70 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v59-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index 851a7f0762fcec3ccbb55399e183f5fb51322ac1..19a00108449b3be6905ce10905a7f313a3194b7d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1987,19 +1987,28 @@ def test_integer_reduction(ctx_factory): dtype=to_loopy_type(vtype), shape=lp.auto) - reductions = [('max', lambda x: x == np.max(var_int)), - ('min', lambda x: x == np.min(var_int)), - ('sum', lambda x: x == np.sum(var_int)), - ('product', lambda x: x == np.prod(var_int)), - ('argmax', lambda x: (x[0] == np.max(var_int) and - var_int[out[1]] == np.max(var_int))), - ('argmin', lambda x: (x[0] == np.min(var_int) and - var_int[out[1]] == np.min(var_int)))] - - for reduction, function in reductions: + from collections import namedtuple + ReductionTest = 
namedtuple('ReductionTest', 'kind, check, args') + + reductions = [ + ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'), + ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'), + ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'), + ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'), + ReductionTest('argmax', + lambda x: ( + x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), + args='var[k], k'), + ReductionTest('argmin', + lambda x: ( + x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), + args='var[k], k') + ] + + for reduction, function, args in reductions: kstr = ("out" if 'arg' not in reduction else "out[0], out[1]") - kstr += ' = {0}(k, var[k])'.format(reduction) + kstr += ' = {0}(k, {1})'.format(reduction, args) knl = lp.make_kernel('{[k]: 0<=k z[i] = z[i+1] + z[i] {id=wr_z,dep=top} + <> v[i] = 11 {id=wr_v,dep=top} + ... gbarrier {dep=wr_z:wr_v,id=yoink} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} + end + ... nop {id=nop} + ... gbarrier {dep=iupd,id=postloop} + z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop} + end + """) + + assert knl.global_barrier_order == ("top", "yoink", "postloop") + + for insn, barrier in ( + ("nop", None), + ("top", None), + ("wr_z", "top"), + ("wr_v", "top"), + ("yoink", "top"), + ("postloop", "yoink"), + ("zzzv", "postloop")): + assert knl.find_most_recent_global_barrier(insn) == barrier + + +def test_global_barrier_error_if_unordered(): + # FIXME: Should be illegal to declare this + knl = lp.make_kernel("{[i]: 0 <= i < 10}", + """ + ... gbarrier + ... gbarrier + """) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + knl.global_barrier_order + + +def test_multi_base_storage_save_and_reload_not_supported(): + # FIXME: This ought to work, change the test when it does. + knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>a[0] = 1 + <>b[0] = 2 + ... 
gbarrier + out = a[0] + b[0] + """, + seq_dependencies=True) + + knl = lp.alias_temporaries(knl, ("a", "b"), synchronize_for_exclusive_use=False) + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + + with pytest.raises(NotImplementedError): + lp.save_and_reload_temporaries(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_reduction.py b/test/test_reduction.py index 5887df7a628c46fbf09539fdd48c08aaacd8e409..96d85beb6831bca7369223decfc6ff1863976d35 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -240,12 +240,38 @@ def test_global_parallel_reduction(ctx_factory, size): knl = lp.add_dependency( knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") - lp.auto_test_vs_ref( ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) +def test_global_parallel_reduction_2(): + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + # Using z[0] instead of z works around a bug in ancient PyOpenCL. + z[0] = sum(i, i/13) {id=reduce} + """) + + gsize = 128 + knl = lp.make_two_level_reduction(knl, + "reduce", + inner_length=gsize * 20, + nonlocal_tag="g.0", + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + outer_tag=None, + inner_tag=None) + + print(knl) + + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + + knl = lp.realize_reduction(knl) + + print(knl) + + @pytest.mark.parametrize("size", [1000]) def test_global_mc_parallel_reduction(ctx_factory, size): ctx = ctx_factory() @@ -297,7 +323,7 @@ def test_argmax(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, """ - max_val, max_idx = argmax(i, fabs(a[i])) + max_val, max_idx = argmax(i, fabs(a[i]), i) """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) @@ -397,7 +423,7 @@ def test_parallel_multi_output_reduction(): knl = lp.make_kernel( "{[i]: 0<=i<128}", """ - max_val, max_indices = argmax(i, fabs(a[i])) + max_val, max_indices = argmax(i, 
fabs(a[i]), i) """) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl) diff --git a/test/test_scan.py b/test/test_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..5c84d6e4d62f18693b1352d47bf78b0a01bb6508 --- /dev/null +++ b/test/test_scan.py @@ -0,0 +1,550 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2016, 2017 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest +from pytools import memoize + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# More things to test. +# - scan(a) + scan(b) +# - test for badly tagged inames +# - global parallel scan + +# TO DO: +# segmented(...) syntax + + +@pytest.mark.parametrize("n", [1, 2, 3, 16]) +@pytest.mark.parametrize("stride", [1, 2]) +def test_sequential_scan(ctx_factory, n, stride): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i " + "{[i,j]: sweep_lbound<=i {[i]: 0<=i {[i,j,k]: 0<=k {[i]: 0<=i {[i]: 0 <= i < n}", + "[i] -> {[j]: 0 <= j <= i}", + "[i] -> {[k]: 0 <= k <= i}" + ], + """ + <>tmp[i] = sum(k, 1) + out[i] = sum(j, tmp[j]) + """) + + knl = lp.fix_parameters(knl, n=10) + knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag)) + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + evt, (out,) = knl(queue) + + print(out) + + +def test_scan_not_triangular(): + knl = lp.make_kernel( + "{[i,j]: 0<=i<100 and 1<=j<=2*i}", + """ + a[i] = sum(j, j**2) + """ + ) + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction(knl, force_scan=True) + + +@pytest.mark.parametrize("n", [1, 2, 3, 16, 17]) +def test_local_parallel_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i {[i,j]: 1<=i {[i,j]: 0<=i_ = segmented_sum(j, arr[j], segflag[j])", + [ + lp.GlobalArg("arr", np.float32, shape=("n",)), + lp.GlobalArg("segflag", np.int32, shape=("n",)), + "..." 
+ ]) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i=iname_tag)) + knl = lp.realize_reduction(knl, force_scan=True) + + (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries) + + class SegmentGrouper(object): + + def __init__(self): + self.seg_idx = 0 + self.idx = 0 + + def __call__(self, key): + if self.idx in segment_boundaries_indices: + self.seg_idx += 1 + self.idx += 1 + return self.seg_idx + + from itertools import groupby + + expected = [np.cumsum(list(group)) + for _, group in groupby(arr, SegmentGrouper())] + actual = [np.array(list(group)) + for _, group in groupby(out, SegmentGrouper())] + + assert len(expected) == len(actual) == len(segment_boundaries_indices) + assert [(e == a).all() for e, a in zip(expected, actual)] + + +# {{{ two and three level scan getters + +@memoize +def _get_two_level_scan_kernel(g_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag="l.0", + outer_local_tag="g.0") + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + + +@memoize +def _get_three_level_scan_kernel(g_size, p_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = 
sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag=None, + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag=None, + outer_local_tag="g.0") + + knl = make_two_level_scan( + knl, "insn__l1", inner_length=p_size, + scan_iname="j__l1", + sweep_iname="i__l1", + inner_tag="for", + outer_tag="l.0", + nonlocal_tag="l.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.LOCAL, + inner_local_tag="for", + outer_local_tag="l.0") + + knl = lp.tag_inames(knl, dict(i__l0="l.0", + i__l0_nltail_inner="l.0")) + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + knl = lp.alias_temporaries(knl, + ("insn__l1", "insn__l2"), + synchronize_for_exclusive_use=False) + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + +# }}} + + +# TODO: Test everything from the matrix +# (l.0, seq) x (l.0, seq) +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +def test_two_level_scan(ctx_getter, input_len, g_size): + knl = _get_two_level_scan_kernel(g_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 
15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +@pytest.mark.parametrize("p_size", (4,)) +def test_three_level_scan(ctx_getter, g_size, p_size, input_len): + knl = _get_three_level_scan_kernel(g_size, p_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +def test_scan_extra_constraints_on_domain(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py index ac5a26f6a5683bf9e86055a2729ea5ee995dee1e..b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,6 +402,42 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_add_nosync(): + orig_knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>tmp[i] = 10 {id=insn1} + <>tmp2[i] = 10 {id=insn2} + + <>tmp3[2*i] = 0 {id=insn3} + <>tmp4 = 1 + tmp3[2*i] {id=insn4} + + <>tmp5[i] = 0 {id=insn5,groups=g1} + tmp5[i] = 1 {id=insn6,conflicts=g1} + """) + + orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") + orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + + # No dependency present - don't add nosync + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + + # Dependency present + knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Bidirectional + knl = lp.add_nosync( + orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with + assert 
frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Groups + knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])