diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 8674434084077ba1f791c46123a083346715209e..9138d9a41d7b33db956fd8aba55c0b3b788db064 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -5,8 +5,8 @@ Reference: Loopy's Model of a Kernel
 
 .. _domain-tree:
 
-Loop Domain Tree
-----------------
+Loop Domain Forest
+------------------
 
 .. {{{
 
@@ -29,10 +29,29 @@ Note that *n* in the example is not an iname. It is a
 :ref:`domain-parameters` that is passed to the kernel by the user.
 
 To accommodate some data-dependent control flow, there is not actually
-a single loop domain, but rather a *tree of loop domains*,
-allowing more deeply nested domains to depend on inames
+a single loop domain, but rather a *forest of loop domains* (a collection
+of trees), allowing more deeply nested domains to depend on inames
 introduced by domains closer to the root.
 
+Here is an example::
+
+    { [l] : 0 <= l <= 2 }
+      { [i] : start <= i < end }
+      { [j] : start <= j < end }
+
+The ``i`` and ``j`` domains are "children" of the ``l`` domain, as the
+indentation shows. This is also how :mod:`loopy` prints the domain forest,
+to make the parent/child relationship visible. In the example, the
+parameters *start* and *end* might be read inside of the ``l`` loop.
+
+The idea is that domains form a forest (a collection of trees), and a
+"sub-forest" is extracted that covers all the inames for each
+instruction. Each individual sub-tree is then checked for branching,
+which is ill-formed. Branching is declared ill-formed because, in the
+above case, intersecting the l, i, and j domains could result in
+restrictions from the i domain affecting the j domain by way of how i
+affects l, which would be counterintuitive to say the least.
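+
+As a sketch (the array names *lstarts*, *lends*, *x*, *y*, *a*, and *b* are
+purely illustrative), a kernel with such a domain forest might be built as
+follows, with *start* and *end* read inside the ``l`` loop::
+
+    knl = lp.make_kernel(
+        [
+            "{ [l]: 0 <= l <= 2 }",
+            "[start, end] -> { [i]: start <= i < end }",
+            "[start, end] -> { [j]: start <= j < end }",
+        ],
+        """
+        <> start = lstarts[l]
+        <> end = lends[l]
+        a[l, i] = 2*x[l, i]
+        b[l, j] = 3*y[l, j]
+        """)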
+
 .. _inames:
 
 Inames
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2..92ec799f7045cf63dc75d1386d8a51fd7d42954c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution.
 
 By default, :mod:`loopy` inserts local barriers between two instructions when it
 detects that a dependency involving local memory may occur across work items. To
-see this in action, take a look at the section on :ref:`local_temporaries`. 
+see this in action, take a look at the section on :ref:`local_temporaries`.
 
 In contrast, :mod:`loopy` will *not* insert global barriers automatically.
 Global barriers require manual intervention along with some special
@@ -1308,7 +1308,7 @@ tagged, as in the following example::
             assumptions="n>0")
 
 .. [#global-barrier-note] In particular, this is *not* the same as a call to
- ``barrier(CLK_GLOBAL_MEM_FENCE)``. 
+ ``barrier(CLK_GLOBAL_MEM_FENCE)``.
 
 .. }}}
 
@@ -1533,12 +1533,12 @@ information provided. Now we will count the operations:
 
     >>> op_map = lp.get_op_map(knl)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1596,9 +1596,9 @@ together into keys containing only the specified fields:
 
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(lp.stringify_stats_mapping(op_map_dtype))
-    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                           ].eval_with_dict(param_dict)
@@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
-    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 82ff2e60dee345fe16771d09cb39d2d56e9f493d..7c5de5a1b1d7042498a12204959a59021ac5e0d8 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -26,5 +26,5 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 evt, (out,) = knl(queue, a=a)
 # ENDEXAMPLE
 
-cknl = lp.CompiledKernel(ctx, knl)
-print(cknl.get_highlighted_code({"a": np.float32}))
+knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
+print(lp.generate_code_v2(knl).device_code())
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 0860caa5be8345b40531bdec59746248769c2039..a91eb51a0a90e987ec05a8ebf71a6759e047f5d3 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -519,9 +519,13 @@ def auto_test_vs_ref(
 
     args = None
     from loopy.kernel import kernel_state
+    from loopy.target.pyopencl import PyOpenCLTarget
     if test_knl.state not in [
             kernel_state.PREPROCESSED,
             kernel_state.SCHEDULED]:
+        if isinstance(test_knl.target, PyOpenCLTarget):
+            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
+
         test_knl = lp.preprocess_kernel(test_knl)
 
     if not test_knl.schedule:
diff --git a/loopy/check.py b/loopy/check.py
index d1ba1ab1aec0dd72acc4c268417e9b0e6653e55f..741195ae6ac87d01de3a4ac620ce510fd62ff470 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -354,7 +354,7 @@ def check_has_schedulable_iname_nesting(kernel):
 
 def pre_schedule_checks(kernel):
     try:
-        logger.info("%s: pre-schedule check: start" % kernel.name)
+        logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_orphaned_user_hardware_axes(kernel)
         check_for_double_use_of_hw_axes(kernel)
@@ -367,7 +367,7 @@ def pre_schedule_checks(kernel):
         check_write_destinations(kernel)
         check_has_schedulable_iname_nesting(kernel)
 
-        logger.info("%s: pre-schedule check: done" % kernel.name)
+        logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
         raise
     except:
@@ -505,22 +505,23 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
 # {{{ check that temporaries are defined in subkernels where used
 
 def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
-    from loopy.schedule.tools import InstructionQuery
     from loopy.kernel.data import temp_var_scope
+    from loopy.kernel.tools import get_subkernels
+    from loopy.schedule.tools import (
+            temporaries_written_in_subkernel, temporaries_read_in_subkernel)
 
-    insn_query = InstructionQuery(kernel)
-
-    for subkernel in insn_query.subkernels():
+    for subkernel in get_subkernels(kernel):
         defined_base_storage = set()
 
-        for temporary in insn_query.temporaries_written_in_subkernel(subkernel):
+        for temporary in temporaries_written_in_subkernel(kernel, subkernel):
             tval = kernel.temporary_variables[temporary]
             if tval.base_storage is not None:
                 defined_base_storage.add(tval.base_storage)
 
         for temporary in (
-                insn_query.temporaries_read_in_subkernel(subkernel) -
-                insn_query.temporaries_written_in_subkernel(subkernel)):
+                temporaries_read_in_subkernel(kernel, subkernel) -
+                temporaries_written_in_subkernel(kernel, subkernel)):
             tval = kernel.temporary_variables[temporary]
 
             if tval.initializer is not None:
@@ -530,16 +531,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
             if tval.base_storage is not None:
                 if tval.base_storage not in defined_base_storage:
                     from loopy.diagnostic import MissingDefinitionError
-                    raise MissingDefinitionError("temporary variable '%s' gets used "
-                        "in subkernel '%s' and neither it nor its aliases have a "
-                        "definition" % (temporary, subkernel))
+                    raise MissingDefinitionError("temporary variable '%s' gets "
+                            "used in subkernel '%s' and neither it nor its "
+                            "aliases have a definition" % (temporary, subkernel))
                 continue
 
             if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL):
                 from loopy.diagnostic import MissingDefinitionError
-                raise MissingDefinitionError("temporary variable '%s' gets used in "
-                    "subkernel '%s' without a definition (maybe you forgot to call "
-                    "loopy.save_and_reload_temporaries?)" % (temporary, subkernel))
+                raise MissingDefinitionError("temporary variable '%s' gets used "
+                        "in subkernel '%s' without a definition (maybe you forgot "
+                        "to call loopy.save_and_reload_temporaries?)"
+                        % (temporary, subkernel))
 
 # }}}
 
@@ -616,7 +618,7 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 
 def pre_codegen_checks(kernel):
     try:
-        logger.info("pre-codegen check %s: start" % kernel.name)
+        logger.debug("pre-codegen check %s: start" % kernel.name)
 
         check_for_unused_hw_axes_in_insns(kernel)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
@@ -625,7 +627,7 @@ def pre_codegen_checks(kernel):
         kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
 
-        logger.info("pre-codegen check %s: done" % kernel.name)
+        logger.debug("pre-codegen check %s: done" % kernel.name)
     except:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 7cc381f11d1239cba5656a9dc7a04cddaa14a368..61f4b3a9b8c38dfc25ebc81243812aa963423f8a 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -63,13 +63,20 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     result = find_active_inames_at(kernel, sched_index)
     crosses_barrier = has_barrier_within(kernel, sched_index)
 
-    # Find our containing subkernel, grab inames for all insns from there.
-
-    subkernel_index = sched_index
-    from loopy.schedule import CallKernel
-
-    while not isinstance(kernel.schedule[subkernel_index], CallKernel):
-        subkernel_index -= 1
+    # Find our containing subkernel. Grab inames for all insns from there.
+    within_subkernel = False
+
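+    # Walk the schedule up to sched_index, tracking CallKernel and
+    # ReturnFromKernel markers; the most recent CallKernel we are still
+    # inside (if any) is the containing subkernel.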
+    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
+        from loopy.schedule import CallKernel, ReturnFromKernel
+        if isinstance(sched_item, CallKernel):
+            within_subkernel = True
+            subkernel_index = sched_item_index
+        elif isinstance(sched_item, ReturnFromKernel):
+            within_subkernel = False
+
+    if not within_subkernel:
+        # Outside all subkernels - use only inames available to host.
+        return frozenset(result)
 
     insn_ids_for_subkernel = get_insn_ids_for_block_at(
         kernel.schedule, subkernel_index)
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 3ef7c8f6ad6c8af09dd01bf9e1341179d2be0be7..e590502fb5813af0a820d45228de8e11c35a46c8 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -200,7 +200,7 @@ def generate_assignment_instruction_code(codegen_state, insn):
                     "(%s).y" % lhs_code])
 
         if printf_args:
-            printf_args_str = ", " + ", ".join(printf_args)
+            printf_args_str = ", " + ", ".join(str(v) for v in printf_args)
         else:
             printf_args_str = ""
 
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError):
 class UnscheduledInstructionError(LoopyError):
     pass
 
+
+class ReductionIsNotTriangularError(LoopyError):
+    pass
+
 # }}}
 
 
diff --git a/loopy/execution.py b/loopy/execution.py
new file mode 100644
index 0000000000000000000000000000000000000000..dac5b2ff80767ae00c126aba31c2851cfe3769ef
--- /dev/null
+++ b/loopy/execution.py
@@ -0,0 +1,234 @@
+from __future__ import division, with_statement, absolute_import
+
+__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import six
+import numpy as np
+from pytools import ImmutableRecord, memoize_method
+from loopy.diagnostic import LoopyError
+
+import logging
+logger = logging.getLogger(__name__)
+
+from pytools.persistent_dict import PersistentDict
+from loopy.tools import LoopyKeyBuilder
+from loopy.version import DATA_MODEL_VERSION
+
+
+# {{{ object array argument packing
+
+class _PackingInfo(ImmutableRecord):
+    """
+    .. attribute:: name
+    .. attribute:: sep_shape
+
+    .. attribute:: subscripts_and_names
+
+        A list of tuples of the form ``[(index, unpacked_name), ...]``.
+    """
+
+
+class SeparateArrayPackingController(object):
+    """For argument arrays with axes tagged to be implemented as separate
+    arrays, this class provides preprocessing of the incoming arguments so that
+    all sub-arrays may be passed in one object array (under the original,
+    un-split argument name) and are unpacked into separate arrays before being
+    passed to the kernel.
+
+    It also repacks outgoing arrays of this type back into an object array.
+    """
+
+    def __init__(self, kernel):
+        # map from arg name
+        self.packing_info = {}
+
+        from loopy.kernel.array import ArrayBase
+        for arg in kernel.args:
+            if not isinstance(arg, ArrayBase):
+                continue
+
+            if arg.shape is None or arg.dim_tags is None:
+                continue
+
+            subscripts_and_names = arg.subscripts_and_names()
+
+            if subscripts_and_names is None:
+                continue
+
+            self.packing_info[arg.name] = _PackingInfo(
+                    name=arg.name,
+                    sep_shape=arg.sep_shape(),
+                    subscripts_and_names=subscripts_and_names,
+                    is_written=arg.name in kernel.get_written_variables())
+
+    def unpack(self, kernel_kwargs):
+        if not self.packing_info:
+            return kernel_kwargs
+
+        kernel_kwargs = kernel_kwargs.copy()
+
+        for packing_info in six.itervalues(self.packing_info):
+            arg_name = packing_info.name
+            if packing_info.name in kernel_kwargs:
+                arg = kernel_kwargs[arg_name]
+                for index, unpacked_name in packing_info.subscripts_and_names:
+                    assert unpacked_name not in kernel_kwargs
+                    kernel_kwargs[unpacked_name] = arg[index]
+                del kernel_kwargs[arg_name]
+
+        return kernel_kwargs
+
+    def pack(self, outputs):
+        if not self.packing_info:
+            return outputs
+
+        for packing_info in six.itervalues(self.packing_info):
+            if not packing_info.is_written:
+                continue
+
+            result = outputs[packing_info.name] = \
+                    np.zeros(packing_info.sep_shape, dtype=np.object)
+
+            for index, unpacked_name in packing_info.subscripts_and_names:
+                result[index] = outputs.pop(unpacked_name)
+
+        return outputs
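+
+# Usage sketch (hypothetical names): for an argument "a" whose leading
+# axis is implemented as two separate arrays, unpack({"a": obj}) yields
+# {"a_s0": obj[0], "a_s1": obj[1]}; for written arguments, pack() gathers
+# the sub-arrays back into an object array under the name "a".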
+
+# }}}
+
+
+# {{{ KernelExecutorBase
+
+typed_and_scheduled_cache = PersistentDict(
+        "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
+        key_builder=LoopyKeyBuilder())
+
+
+class KernelExecutorBase(object):
+    """An object connecting a kernel to a :class:`pyopencl.Context`
+    for execution.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, kernel):
+        """
+        :arg kernel: a loopy.LoopKernel
+        """
+
+        self.kernel = kernel
+
+        self.packing_controller = SeparateArrayPackingController(kernel)
+
+        self.output_names = tuple(arg.name for arg in self.kernel.args
+                if arg.name in self.kernel.get_written_variables())
+
+        self.has_runtime_typed_args = any(
+                arg.dtype is None
+                for arg in kernel.args)
+
+    def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
+        from loopy.kernel.tools import add_dtypes
+
+        kernel = self.kernel
+
+        if arg_to_dtype_set:
+            var_to_dtype = {}
+            for var, dtype in arg_to_dtype_set:
+                try:
+                    dest_name = kernel.impl_arg_to_arg[var].name
+                except KeyError:
+                    dest_name = var
+
+                try:
+                    var_to_dtype[dest_name] = dtype
+                except KeyError:
+                    raise LoopyError("cannot set type for '%s': "
+                            "no known variable/argument with that name"
+                            % var)
+
+            kernel = add_dtypes(kernel, var_to_dtype)
+
+            from loopy.type_inference import infer_unknown_types
+            kernel = infer_unknown_types(kernel, expect_completion=True)
+
+        if kernel.schedule is None:
+            from loopy.preprocess import preprocess_kernel
+            kernel = preprocess_kernel(kernel)
+
+            from loopy.schedule import get_one_scheduled_kernel
+            kernel = get_one_scheduled_kernel(kernel)
+
+        return kernel
+
+    @memoize_method
+    def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
+        from loopy import CACHING_ENABLED
+
+        cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set)
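+        # The class name is part of the key so that different executor
+        # subclasses (e.g. for different targets) do not share entries.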
+        if CACHING_ENABLED:
+            try:
+                return typed_and_scheduled_cache[cache_key]
+            except KeyError:
+                pass
+
+        logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name)
+
+        kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
+
+        if CACHING_ENABLED:
+            typed_and_scheduled_cache[cache_key] = kernel
+
+        return kernel
+
+    def arg_to_dtype_set(self, kwargs):
+        if not self.has_runtime_typed_args:
+            return None
+
+        from loopy.types import NumpyType
+        target = self.kernel.target
+
+        impl_arg_to_arg = self.kernel.impl_arg_to_arg
+        arg_to_dtype = {}
+        for arg_name, val in six.iteritems(kwargs):
+            arg = impl_arg_to_arg.get(arg_name, None)
+
+            if arg is None:
+                # offsets, strides and such
+                continue
+
+            if arg.dtype is None and val is not None:
+                try:
+                    dtype = val.dtype
+                except AttributeError:
+                    pass
+                else:
+                    arg_to_dtype[arg_name] = NumpyType(dtype, target)
+
+        return frozenset(six.iteritems(arg_to_dtype))
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 0ebe90fbca0d31c05eaee64321e2b73709292331..5f0884fd44ed5064f3f195d103b164f2163d1d19 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -594,37 +594,67 @@ def get_simple_strides(bset, key_by="name"):
     """
     result = {}
 
-    lspace = bset.get_local_space()
-    for idiv in range(lspace.dim(dim_type.div)):
-        div = lspace.get_div(idiv)
+    comp_div_set_pieces = convexify(bset.compute_divs()).get_basic_sets()
+    assert len(comp_div_set_pieces) == 1
+    bset, = comp_div_set_pieces
+
+    def _get_indices_and_coeffs(obj, dts):
+        result = []
+        for dt in dts:
+            for dim_idx in range(obj.dim(dt)):
+                coeff_val = obj.get_coefficient_val(dt, dim_idx)
+                if not coeff_val.is_zero():
+                    result.append((dt, dim_idx, coeff_val))
 
-        # check for sub-divs
-        supported = True
-        for dim_idx in range(div.dim(dim_type.div)):
-            coeff_val = div.get_coefficient_val(dim_type.div, dim_idx)
-            if not coeff_val.is_zero():
-                # sub-divs not supported
-                supported = False
-                break
+        return result
+
+    for cns in bset.get_constraints():
+        if not cns.is_equality():
+            continue
+        aff = cns.get_aff()
 
-        if not supported:
+        # recognizes constraints of the form
+        #  -i0 + 2*floor((i0)/2) == 0
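+        # i.e. i0 is a multiple of 2 and hence has stride 2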
+
+        if aff.dim(dim_type.div) != 1:
+            continue
+
+        idiv = 0
+        div = aff.get_div(idiv)
+
+        # check for sub-divs
+        if _get_indices_and_coeffs(div, [dim_type.div]):
+            # found one -> not supported
             continue
 
         denom = div.get_denominator_val().to_python()
 
-        inames_and_coeffs = []
-        for dt in [dim_type.param, dim_type.in_]:
-            for dim_idx in range(div.dim(dt)):
-                coeff_val = div.get_coefficient_val(dt, dim_idx) * denom
-                if not coeff_val.is_zero():
-                    inames_and_coeffs.append((dt, dim_idx, coeff_val))
+        # if the coefficient in front of the div is not the same as the denominator
+        if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                div, [dim_type.param, dim_type.in_])
 
         if len(inames_and_coeffs) != 1:
             continue
 
         (dt, dim_idx, coeff), = inames_and_coeffs
 
-        if coeff != 1:
+        if not (coeff * denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                aff, [dim_type.param, dim_type.in_])
+
+        if len(inames_and_coeffs) != 1:
+            continue
+
+        (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs
+        if (not outer_coeff.neg().is_one()
+                or (outer_dt, outer_dim_idx) != (dt, dim_idx)):
             # not supported
             continue
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 324f7da1a21de0115ea060ff7ef55e52ab0913d4..622f5e49be1e40b4156113d92907fe8b1d9fb859 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -44,33 +44,49 @@ from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 
 # {{{ unique var names
 
-def _is_var_name_conflicting_with_longer(name_a, name_b):
-    # Array dimensions implemented as separate arrays generate
-    # names by appending '_s<NUMBER>'. Make sure that no
-    # conflicts can arise from these names.
+class _UniqueVarNameGenerator(UniqueNameGenerator):
 
-    # Only deal with the case of b longer than a.
-    if not name_b.startswith(name_a):
-        return False
+    def __init__(self, existing_names=set(), forced_prefix=""):
+        super(_UniqueVarNameGenerator, self).__init__(existing_names, forced_prefix)
+        array_prefix_pattern = re.compile("(.*)_s[0-9]+$")
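+        # Matches names like "a_s0" that are generated when array axes are
+        # implemented as separate arrays; group(1) is the base name ("a").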
 
-    return re.match("^%s_s[0-9]+" % re.escape(name_b), name_a) is not None
+        array_prefixes = set()
+        for name in existing_names:
+            match = array_prefix_pattern.match(name)
+            if match is None:
+                continue
 
+            array_prefixes.add(match.group(1))
 
-def _is_var_name_conflicting(name_a, name_b):
-    if name_a == name_b:
-        return True
+        self.conflicting_array_prefixes = array_prefixes
+        self.array_prefix_pattern = array_prefix_pattern
 
-    return (
-            _is_var_name_conflicting_with_longer(name_a, name_b)
-            or _is_var_name_conflicting_with_longer(name_b, name_a))
+    def _name_added(self, name):
+        match = self.array_prefix_pattern.match(name)
+        if match is None:
+            return
 
+        self.conflicting_array_prefixes.add(match.group(1))
 
-class _UniqueVarNameGenerator(UniqueNameGenerator):
     def is_name_conflicting(self, name):
-        from pytools import any
-        return any(
-                _is_var_name_conflicting(name, other_name)
-                for other_name in self.existing_names)
+        if name in self.existing_names:
+            return True
+
+        # Array dimensions implemented as separate arrays generate
+        # names by appending '_s<NUMBER>'. Make sure that no
+        # conflicts can arise from these names.
+
+        # Case 1: a_s0 is already a name; we are trying to insert a
+        # Case 2: a is already a name; we are trying to insert a_s0
+
+        if name in self.conflicting_array_prefixes:
+            return True
+
+        match = self.array_prefix_pattern.match(name)
+        if match is None:
+            return False
+
+        return match.group(1) in self.existing_names
 
 # }}}
 
@@ -599,8 +615,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 # nothin' new
                 continue
 
-            domain_parents = [home_domain_index] + ppd[home_domain_index]
-            current_root = domain_parents[-1]
+            domain_path_to_root = [home_domain_index] + ppd[home_domain_index]
+            current_root = domain_path_to_root[-1]
             previous_leaf = root_to_leaf.get(current_root)
 
             if previous_leaf is not None:
@@ -610,8 +626,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 # it can introduce artificial restrictions on variables
                 # further up the tree.
 
-                prev_parents = set(ppd[previous_leaf])
-                if not prev_parents <= set(domain_parents):
+                prev_path_to_root = set([previous_leaf] + ppd[previous_leaf])
+                if not prev_path_to_root <= set(domain_path_to_root):
                     raise CannotBranchDomainTree("iname set '%s' requires "
                             "branch in domain tree (when adding '%s')"
                             % (", ".join(inames), iname))
@@ -620,7 +636,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 pass
 
             root_to_leaf[current_root] = home_domain_index
-            domain_indices.update(domain_parents)
+            domain_indices.update(domain_path_to_root)
 
         return list(root_to_leaf.values())
 
@@ -1095,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return embedding
 
-    def stringify(self, what=None, with_dependencies=False):
+    def stringify(self, what=None, with_dependencies=False, use_separators=True,
+            show_labels=True):
         all_what = set([
             "name",
             "arguments",
@@ -1134,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         kernel = self
 
-        sep = 75*"-"
+        if use_separators:
+            sep = [75*"-"]
+        else:
+            sep = []
 
         def natorder(key):
             # Return natural ordering for strings, as opposed to dictionary order.
@@ -1151,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             return sorted(seq, key=lambda y: natorder(key(y)))
 
         if "name" in what:
-            lines.append(sep)
+            lines.extend(sep)
             lines.append("KERNEL: " + kernel.name)
 
         if "arguments" in what:
-            lines.append(sep)
-            lines.append("ARGUMENTS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("ARGUMENTS:")
             for arg_name in natsorted(kernel.arg_dict):
                 lines.append(str(kernel.arg_dict[arg_name]))
 
         if "domains" in what:
-            lines.append(sep)
-            lines.append("DOMAINS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DOMAINS:")
             for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()):
                 lines.append(len(parents)*"  " + str(dom))
 
         if "tags" in what:
-            lines.append(sep)
-            lines.append("INAME IMPLEMENTATION TAGS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INAME IMPLEMENTATION TAGS:")
             for iname in natsorted(kernel.all_inames()):
                 line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
-            lines.append(sep)
-            lines.append("TEMPORARIES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("TEMPORARIES:")
             for tv in natsorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
         if "rules" in what and kernel.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SUBSTIUTION RULES:")
             for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
                 lines.append(str(kernel.substitutions[rule_name]))
 
         if "instructions" in what:
-            lines.append(sep)
-            lines.append("INSTRUCTIONS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INSTRUCTIONS:")
             loop_list_width = 35
 
             # {{{ topological sort
@@ -1303,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on)))
 
         if "Dependencies" in what and dep_lines:
-            lines.append(sep)
-            lines.append("DEPENDENCIES: "
-                    "(use loopy.show_dependency_graph to visualize)")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DEPENDENCIES: "
+                        "(use loopy.show_dependency_graph to visualize)")
             lines.extend(dep_lines)
 
         if "schedule" in what and kernel.schedule is not None:
-            lines.append(sep)
-            lines.append("SCHEDULE:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SCHEDULE:")
             from loopy.schedule import dump_schedule
             lines.append(dump_schedule(kernel, kernel.schedule))
 
-        lines.append(sep)
+        lines.extend(sep)
 
         return "\n".join(lines)
 
@@ -1384,17 +1412,35 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         result.pop("cache_manager", None)
 
-        return result
+        # make sure that kernels are pickled with a cached hash key in place
+        from loopy.tools import LoopyKeyBuilder
+        LoopyKeyBuilder()(self)
+
+        return (result, self._pytools_persistent_hash_digest)
 
     def __setstate__(self, state):
+        attribs, p_hash_digest = state
+
         new_fields = set()
 
-        for k, v in six.iteritems(state):
+        for k, v in six.iteritems(attribs):
             setattr(self, k, v)
             new_fields.add(k)
 
         self.register_fields(new_fields)
 
+        if 0:
+            # {{{ check that 'reconstituted' object has same hash
+
+            from loopy.tools import LoopyKeyBuilder
+            LoopyKeyBuilder()(self)
+
+            assert p_hash_digest == self._pytools_persistent_hash_digest
+
+            # }}}
+        else:
+            self._pytools_persistent_hash_digest = p_hash_digest
+
         from loopy.kernel.tools import SetOperationCacheManager
         self.cache_manager = SetOperationCacheManager()
         self._kernel_executor_cache = {}
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0d22dbb88ed99c7c92480d1d39b924cc2198cc3f..08268ca9f27623a6d17a195d3c04acb55e5ec68a 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase):
 
                 z[i] = z[i+1-1] + a {atomic}
 
-        :mod:`loopy` may to evaluate the right-hand side *multiple times*
+        :mod:`loopy` may choose to evaluate the right-hand side *multiple times*
         as part of a single assignment. It is up to the user to ensure that
         this retains correct semantics.
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 17226b63addb9e2e30d556730aa326d2ed59128c..ced1aaaa13ed8275c1e3a376d1c24895287b3239 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -48,20 +48,22 @@ def prepare_for_caching(kernel):
     import loopy as lp
     new_args = []
 
+    tgt = kernel.target
+
     for arg in kernel.args:
         dtype = arg.dtype
-        if dtype is not None and dtype is not lp.auto:
-            dtype = dtype.with_target(kernel.target)
+        if dtype is not None and dtype is not lp.auto and dtype.target is not tgt:
+            arg = arg.copy(dtype=dtype.with_target(kernel.target))
 
-        new_args.append(arg.copy(dtype=dtype))
+        new_args.append(arg)
 
     new_temporary_variables = {}
     for name, temp in six.iteritems(kernel.temporary_variables):
         dtype = temp.dtype
-        if dtype is not None and dtype is not lp.auto:
-            dtype = dtype.with_target(kernel.target)
+        if dtype is not None and dtype is not lp.auto and dtype.target is not tgt:
+            temp = temp.copy(dtype=dtype.with_target(tgt))
 
-        new_temporary_variables[name] = temp.copy(dtype=dtype)
+        new_temporary_variables[name] = temp
 
     kernel = kernel.copy(
             args=new_args,
@@ -274,7 +276,425 @@ def find_temporary_scope(kernel):
 # {{{ rewrite reduction to imperative form
 
 
-# {{{ reduction utils
+# {{{ utils (not stateful)
+
+from collections import namedtuple
+
+
+_InameClassification = namedtuple("_InameClassification",
+                                  "sequential, local_parallel, nonlocal_parallel")
+
+
+def _classify_reduction_inames(kernel, inames):
+    sequential = []
+    local_par = []
+    nonlocal_par = []
+
+    from loopy.kernel.data import (
+            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+            ParallelTag)
+
+    for iname in inames:
+        iname_tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
+            # These are nominally parallel, but we can live with
+            # them as sequential.
+            sequential.append(iname)
+
+        elif isinstance(iname_tag, LocalIndexTagBase):
+            local_par.append(iname)
+
+        elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+            nonlocal_par.append(iname)
+
+        else:
+            sequential.append(iname)
+
+    return _InameClassification(
+            tuple(sequential), tuple(local_par), tuple(nonlocal_par))
+
+
+def _add_params_to_domain(domain, param_names):
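+    """Append *param_names* as named parameter dimensions to *domain*."""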
+    dim_type = isl.dim_type
+    nparams_orig = domain.dim(dim_type.param)
+    domain = domain.add_dims(dim_type.param, len(param_names))
+
+    for param_idx, param_name in enumerate(param_names):
+        domain = domain.set_dim_name(
+                dim_type.param, param_idx + nparams_orig, param_name)
+
+    return domain
+
+
+def _move_set_to_param_dims_except(domain, except_dims):
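+    """Move all set dimensions of *domain* whose names are not in
+    *except_dims* into the parameter space."""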
+    dim_type = isl.dim_type
+
+    iname_idx = 0
+    for iname in domain.get_var_names(dim_type.set):
+        if iname not in except_dims:
+            domain = domain.move_dims(
+                    dim_type.param, 0,
+                    dim_type.set, iname_idx, 1)
+            iname_idx -= 1
+        iname_idx += 1
+
+    return domain
+
+
+def _domain_depends_on_given_set_dims(domain, set_dim_names):
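+    """Return *True* if any constraint of *domain* involves one of the
+    dimensions named in *set_dim_names*."""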
+    set_dim_names = frozenset(set_dim_names)
+
+    return any(
+            set_dim_names & set(constr.get_coefficients_by_name())
+            for constr in domain.get_constraints())
+
+
+def _check_reduction_is_triangular(kernel, expr, scan_param):
+    """Check whether the reduction within `expr` with scan parameters described by
+    the structure `scan_param` is triangular. This attempts to verify that the
+    domain for the scan and sweep inames is as follows:
+
+    [params] -> {
+        [other inames..., scan_iname, sweep_iname]:
+            (sweep_min_value
+                <= sweep_iname
+                <= sweep_max_value)
+            and
+            (scan_min_value
+                <= scan_iname
+                <= stride * (sweep_iname - sweep_min_value) + scan_min_value)
+            and
+            (irrelevant constraints)
+    }
+    """
+
+    orig_domain = kernel.get_inames_domain(
+            frozenset((scan_param.sweep_iname, scan_param.scan_iname)))
+
+    sweep_iname = scan_param.sweep_iname
+    scan_iname = scan_param.scan_iname
+    affs = isl.affs_from_space(orig_domain.space)
+
+    sweep_lower_bound = isl.align_spaces(
+            scan_param.sweep_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    sweep_upper_bound = isl.align_spaces(
+            scan_param.sweep_upper_bound,
+            affs[0],
+            across_dim_types=True)
+
+    scan_lower_bound = isl.align_spaces(
+            scan_param.scan_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    from itertools import product
+
+    for (sweep_lb_domain, sweep_lb_aff), \
+        (sweep_ub_domain, sweep_ub_aff), \
+        (scan_lb_domain, scan_lb_aff) in \
+            product(sweep_lower_bound.get_pieces(),
+                    sweep_upper_bound.get_pieces(),
+                    scan_lower_bound.get_pieces()):
+
+        # Assumptions inherited from the domains of the pwaffs
+        assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain
+
+        # Sweep iname constraints
+        hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff)
+        hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff)
+
+        # Scan iname constraints
+        hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff)
+        hyp_domain &= affs[scan_iname].le_set(
+                scan_param.stride * (affs[sweep_iname] - sweep_lb_aff)
+                + scan_lb_aff)
+
+        hyp_domain, = (hyp_domain & assumptions).get_basic_sets()
+        test_domain, = (orig_domain & assumptions).get_basic_sets()
+
+        hyp_gist_against_test = hyp_domain.gist(test_domain)
+        if _domain_depends_on_given_set_dims(hyp_gist_against_test,
+                (sweep_iname, scan_iname)):
+            return False, (
+                    "gist of hypothesis against test domain "
+                    "has sweep or scan dependent constraints: '%s'"
+                    % hyp_gist_against_test)
+
+        test_gist_against_hyp = test_domain.gist(hyp_domain)
+        if _domain_depends_on_given_set_dims(test_gist_against_hyp,
+                (sweep_iname, scan_iname)):
+            return False, (
+                   "gist of test against hypothesis domain "
+                   "has sweep or scan dependent constraint: '%s'"
+                   % test_gist_against_hyp)
+
+    return True, "ok"
+
+
+_ScanCandidateParameters = namedtuple(
+        "_ScanCandidateParameters",
+        "sweep_iname, scan_iname, sweep_lower_bound, "
+        "sweep_upper_bound, scan_lower_bound, stride")
+
+
+def _try_infer_scan_candidate_from_expr(
+        kernel, expr, within_inames, sweep_iname=None):
+    """Analyze `expr` and determine if it can be implemented as a scan.
+    """
+    from loopy.symbolic import Reduction
+    assert isinstance(expr, Reduction)
+
+    if len(expr.inames) != 1:
+        raise ValueError(
+                "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),))
+
+    scan_iname, = expr.inames
+
+    from loopy.kernel.tools import DomainChanger
+    dchg = DomainChanger(kernel, (scan_iname,))
+    domain = dchg.get_original_domain()
+
+    if sweep_iname is None:
+        try:
+            sweep_iname = _try_infer_sweep_iname(
+                    domain, scan_iname, kernel.all_inames())
+        except ValueError as v:
+            raise ValueError(
+                    "Couldn't determine a sweep iname for the scan "
+                    "expression '%s': %s" % (expr, v))
+
+    try:
+        sweep_lower_bound, sweep_upper_bound, scan_lower_bound = (
+                _try_infer_scan_and_sweep_bounds(
+                    kernel, scan_iname, sweep_iname, within_inames))
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine bounds for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    try:
+        stride = _try_infer_scan_stride(
+                kernel, scan_iname, sweep_iname, sweep_lower_bound)
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine a scan stride for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
+            sweep_upper_bound, scan_lower_bound, stride)
+
+
+def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
+    """The sweep iname is the outer iname which guides the scan.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname.
+    """
+    constrs = domain.get_constraints()
+    sweep_iname_candidate = None
+
+    for constr in constrs:
+        candidate_vars = set([
+                var for var in constr.get_var_dict()
+                if var in candidate_inames])
+
+        # Irrelevant constraint - skip
+        if scan_iname not in candidate_vars:
+            continue
+
+        # No additional inames - skip
+        if len(candidate_vars) == 1:
+            continue
+
+        candidate_vars.remove(scan_iname)
+
+        # Depends on more than one iname - error
+        if len(candidate_vars) > 1:
+            raise ValueError(
+                    "More than one sweep iname candidate for scan iname '%s' found "
+                    "(via constraint '%s')" % (scan_iname, constr))
+
+        next_candidate = candidate_vars.pop()
+
+        if sweep_iname_candidate is None:
+            sweep_iname_candidate = next_candidate
+            defining_constraint = constr
+        else:
+            # Check next_candidate consistency
+            if sweep_iname_candidate != next_candidate:
+                raise ValueError(
+                        "More than one sweep iname candidate for scan iname '%s' "
+                        "found (via constraints '%s', '%s')" %
+                        (scan_iname, defining_constraint, constr))
+
+    if sweep_iname_candidate is None:
+        raise ValueError(
+                "Couldn't find any sweep iname candidates for "
+                "scan iname '%s'" % scan_iname)
+
+    return sweep_iname_candidate
+
+
+def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames):
+    domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname)))
+    domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname))
+
+    var_dict = domain.get_var_dict()
+    sweep_idx = var_dict[sweep_iname][1]
+    scan_idx = var_dict[scan_iname][1]
+
+    domain = domain.project_out_except(
+            within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
+
+    try:
+        with isl.SuppressedWarnings(domain.get_ctx()):
+            sweep_lower_bound = domain.dim_min(sweep_idx)
+            sweep_upper_bound = domain.dim_max(sweep_idx)
+            scan_lower_bound = domain.dim_min(scan_idx)
+    except isl.Error as e:
+        raise ValueError("isl error: %s" % e)
+
+    return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound)
+
+
+def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
+    """The stride is the number of steps the scan iname takes per iteration
+    of the sweep iname. This is allowed to be an integer constant.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6.
+    """
+    dim_type = isl.dim_type
+
+    domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname]))
+    domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,))
+
+    domain_with_sweep_param = domain_with_sweep_param.project_out_except(
+            (sweep_iname, scan_iname), (dim_type.set, dim_type.param))
+
+    scan_iname_idx = domain_with_sweep_param.find_dim_by_name(
+            dim_type.set, scan_iname)
+
+    # Should be equal to k * sweep_iname, where k is the stride.
+
+    try:
+        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
+            scan_iname_range = (
+                    domain_with_sweep_param.dim_max(scan_iname_idx)
+                    - domain_with_sweep_param.dim_min(scan_iname_idx)
+                    ).gist(domain_with_sweep_param.params())
+    except isl.Error as e:
+        raise ValueError("isl error: '%s'" % e)
+
+    scan_iname_pieces = scan_iname_range.get_pieces()
+
+    if len(scan_iname_pieces) > 1:
+        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
+    elif len(scan_iname_pieces) == 0:
+        raise ValueError("empty range found for iname '%s'" % scan_iname)
+
+    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
+
+    if not scan_iname_constr.plain_is_universe():
+        raise ValueError("found constraints: %s" % scan_iname_constr)
+
+    if scan_iname_aff.dim(dim_type.div):
+        raise ValueError("aff has div: %s" % scan_iname_aff)
+
+    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
+
+    if len(coeffs) == 0:
+        try:
+            scan_iname_aff.get_constant_val()
+        except Exception:
+            raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
+
+        # If this point is reached we're assuming the domain is of the form
+        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
+        # this function returns will be verified later by
+        # _check_reduction_is_triangular().
+        return 1
+
+    if sweep_iname not in coeffs:
+        raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname)
+
+    stride = coeffs[sweep_iname]
+
+    if not stride.is_int():
+        raise ValueError("stride not an integer: %s" % stride)
+
+    if not stride.is_pos():
+        raise ValueError("stride not positive: %s" % stride)
+
+    return stride.to_python()
+
+
+def _get_domain_with_iname_as_param(domain, iname):
+    dim_type = isl.dim_type
+
+    if domain.find_dim_by_name(dim_type.param, iname) >= 0:
+        return domain
+
+    iname_idx = domain.find_dim_by_name(dim_type.set, iname)
+
+    assert iname_idx >= 0, (iname, domain)
+
+    return domain.move_dims(
+        dim_type.param, domain.dim(dim_type.param),
+        dim_type.set, iname_idx, 1)
+
+
+def _create_domain_for_sweep_tracking(orig_domain,
+        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
+    dim_type = isl.dim_type
+
+    subd = isl.BasicSet.universe(orig_domain.params().space)
+
+    # Add tracking_iname and sweep iname.
+
+    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
+
+    # Here we realize the domain:
+    #
+    # [..., i] -> {
+    #  [j]: 0 <= j - l
+    #       and
+    #       j - l <= k * (i - m)
+    #       and
+    #       k * (i - m - 1) < j - l }
+    # where
+    #   * i is the sweep iname
+    #   * j is the tracking iname
+    #   * k is the stride for the scan
+    #   * l is the lower bound for the scan
+    #   * m is the lower bound for the sweep iname
+    #
+    affs = isl.affs_from_space(subd.space)
+
+    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
+
+    # Move tracking_iname into a set dim (NOT sweep iname).
+    subd = subd.move_dims(
+            dim_type.set, 0,
+            dim_type.param, subd.dim(dim_type.param) - 1, 1)
+
+    # Simplify against the parameter context of the original domain.
+    orig_domain_with_sweep_param = (
+            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
+    subd = subd.gist_params(orig_domain_with_sweep_param.params())
+
+    subd, = subd.get_basic_sets()
+
+    return subd
+
 
 def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     """
@@ -455,10 +875,22 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     return kernel.copy(temporary_variables=new_temporary_variables,
                        instructions=new_instructions)
 
+
+def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
+    # Intersect with inames, because we could have captured some kernel params
+    # in here too...
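+    #
+    # The subdomain is inserted directly after the (unique) leaf domain
+    # that defines the inames it depends on, making it a child of that
+    # domain in the domain forest.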
+    dependent_inames = (
+            frozenset(subdomain.get_var_names(isl.dim_type.param))
+            & kernel.all_inames())
+    idx, = kernel.get_leaf_domain_indices(dependent_inames)
+    domains.insert(idx + 1, subdomain)
+
 # }}}
 
 
-def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
+def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
+                      automagic_scans_ok=False, force_scan=False,
+                      force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
     specified, operate only on the instruction with an instruction id matching
     *insn_id_filter*.
@@ -469,6 +901,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     If *insn_id_filter* is not given, all reductions in all instructions will
     be realized.
+
+    If *automagic_scans_ok* is *True*, this function will attempt to rewrite
+    triangular reductions as scans automatically.
+
+    If *force_scan* is *True*, this function will attempt to rewrite *all*
+    candidate reductions as scans and raise an error if this is not possible
+    (this is most useful combined with *insn_id_filter*).
+
+    If *force_outer_iname_for_scan* is not *None*, this function will attempt
+    to realize candidate reductions as scans using the specified iname as the
+    outer (sweep) iname.
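+
+    A hypothetical example of a triangular reduction that may be realized
+    as a scan (kernel and names are illustrative)::
+
+        knl = lp.make_kernel(
+                "{[i,j]: 0 <= i < n and 0 <= j <= i}",
+                "out[i] = sum(j, a[j])")
+        knl = realize_reduction(knl, force_scan=True)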
     """
 
     logger.debug("%s: realize reduction" % kernel.name)
@@ -480,6 +923,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
+    inames_added_for_scan = set()
+    inames_to_remove = set()
 
     # {{{ helpers
 
@@ -489,8 +934,44 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         else:
             return val
 
+    def preprocess_scan_arguments(
+                insn, expr, nresults, scan_iname, track_iname,
+                newly_generated_insn_id_set):
+        """Does iname substitution within scan arguments and returns a set of values
+        suitable to be passed to the binary op. Returns a tuple."""
+
+        if nresults > 1:
+            inner_expr = expr
+
+            # In the case of a multi-argument scan, we need a name for each of
+            # the arguments in order to pass them to the binary op - so we expand
+            # items that are not "plain" tuples here.
+            if not isinstance(inner_expr, tuple):
+                get_args_insn_id = insn_id_gen(
+                        "%s_%s_get" % (insn.id, "_".join(expr.inames)))
+
+                inner_expr = expand_inner_reduction(
+                        id=get_args_insn_id,
+                        expr=inner_expr,
+                        nresults=nresults,
+                        depends_on=insn.depends_on,
+                        within_inames=insn.within_inames | expr.inames,
+                        within_inames_is_final=insn.within_inames_is_final)
+
+                newly_generated_insn_id_set.add(get_args_insn_id)
+
+            updated_inner_exprs = tuple(
+                    replace_var_within_expr(sub_expr, scan_iname, track_iname)
+                    for sub_expr in inner_expr)
+        else:
+            updated_inner_exprs = (
+                    replace_var_within_expr(expr, scan_iname, track_iname),)
+
+        return updated_inner_exprs
+
     def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
             within_inames_is_final):
+        # FIXME: use make_temporaries
         from pymbolic.primitives import Call
         from loopy.symbolic import Reduction
         assert isinstance(expr, (Call, Reduction))
@@ -530,20 +1011,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from pymbolic import var
-        acc_var_names = [
-                var_name_gen("acc_"+"_".join(expr.inames))
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+"_".join(expr.inames),
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
 
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
+        init_insn_depends_on = frozenset()
 
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
 
         init_id = insn_id_gen(
                 "%s_%s_init" % (insn.id, "_".join(expr.inames)))
@@ -553,7 +1037,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 assignees=acc_vars,
                 within_inames=outer_insn_inames - frozenset(expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(),
+                depends_on=init_insn_depends_on,
                 expression=expr.operation.neutral_element(*arg_dtypes))
 
         generated_insns.append(init_insn)
@@ -567,17 +1051,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         reduction_insn_depends_on = set([init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
         if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, "_".join(expr.inames)))
 
             reduction_expr = expand_inner_reduction(
-                id=get_args_insn_id,
-                expr=expr.expr,
-                nresults=nresults,
-                depends_on=insn.depends_on,
-                within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final)
+                    id=get_args_insn_id,
+                    expr=expr.expr,
+                    nresults=nresults,
+                    depends_on=insn.depends_on,
+                    within_inames=update_insn_iname_deps,
+                    within_inames_is_final=insn.within_inames_is_final)
 
             reduction_insn_depends_on.add(get_args_insn_id)
         else:
@@ -626,6 +1113,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 v[iname].lt_set(v[0] + size)).get_basic_sets()
         return bs
 
+    def _make_slab_set_from_range(iname, lbound, ubound):
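+        # Builds { [iname] : lbound <= iname < ubound }; for instance,
+        # _make_slab_set_from_range("j", 1, 8) gives { [j] : 1 <= j < 8 }.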
+        v = isl.make_zero_and_vars([iname])
+        bs, = (
+                v[iname].ge_set(v[0] + lbound)
+                &
+                v[iname].lt_set(v[0] + ubound)).get_basic_sets()
+        return bs
+
     def map_reduction_local(expr, rec, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
@@ -650,6 +1145,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 _get_int_iname_size(oiname)
                 for oiname in outer_local_inames)
 
+        from loopy.kernel.data import temp_var_scope
+
+        neutral_var_names = make_temporaries(
+                name_based_on="neutral_"+red_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+red_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
         # {{{ add separate iname to carry out the reduction
 
         # Doing this sheds any odd conditionals that may be active
@@ -661,32 +1174,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         # }}}
 
-        neutral_var_names = [
-                var_name_gen("neutral_"+red_iname)
-                for i in range(nresults)]
-        acc_var_names = [
-                var_name_gen("acc_"+red_iname)
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=outer_local_iname_sizes + (size,),
-                    dtype=dtype,
-                    scope=temp_var_scope.LOCAL)
-        for name, dtype in zip(neutral_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
-
         base_iname_deps = outer_insn_inames - frozenset(expr.inames)
 
         neutral = expr.operation.neutral_element(*arg_dtypes)
-
         init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
         init_insn = make_assignment(
                 id=init_id,
@@ -711,6 +1201,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         transfer_depends_on = set([init_neutral_id, init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
         if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, red_iname))
@@ -745,9 +1238,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     (outer_insn_inames - frozenset(expr.inames))
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(transfer_depends_on) | insn.depends_on,
-                no_sync_with=frozenset(
-                    [(insn_id, "any") for insn_id in transfer_depends_on]))
+                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
+                no_sync_with=frozenset([(init_id, "any")]))
         generated_insns.append(transfer_insn)
 
         cur_size = 1
@@ -759,6 +1251,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         istage = 0
         while cur_size > 1:
+
             new_size = cur_size // 2
             assert new_size * 2 == cur_size
 
@@ -807,6 +1300,351 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
     # }}}
 
+    # {{{ utils (stateful)
+
+    from pytools import memoize
+
+    @memoize
+    def get_or_add_sweep_tracking_iname_and_domain(
+            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+            tracking_iname):
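+        # Memoized: if an identical scan/sweep/tracking configuration is
+        # requested again, the subdomain is not inserted a second time.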
+        domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
+
+        inames_added_for_scan.add(tracking_iname)
+
+        new_domain = _create_domain_for_sweep_tracking(domain,
+                tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
+
+        _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain)
+
+        return tracking_iname
+
+    def replace_var_within_expr(expr, from_var, to_var):
+        from pymbolic.mapper.substitutor import make_subst_func
+
+        from loopy.symbolic import (
+            SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
+
+        rule_mapping_context = SubstitutionRuleMappingContext(
+            temp_kernel.substitutions, var_name_gen)
+
+        from pymbolic import var
+        mapper = RuleAwareSubstitutionMapper(
+            rule_mapping_context,
+            make_subst_func({from_var: var(to_var)}),
+            within=lambda *args: True)
+
+        return mapper(expr, temp_kernel, None)
+
+    def make_temporaries(name_based_on, nvars, shape, dtypes, scope):
+        var_names = [
+                var_name_gen(name_based_on.format(index=i))
+                for i in range(nvars)]
+
+        from loopy.kernel.data import TemporaryVariable
+
+        for name, dtype in zip(var_names, dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=shape,
+                    dtype=dtype,
+                    scope=scope)
+
+        return var_names
+
+    # }}}
+
+    # {{{ sequential scan
+
+    def map_scan_seq(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+            scan_min_value, stride):
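+        # A sequential scan is realized as an init instruction (accumulator
+        # := neutral element) plus one update per sweep iteration that folds
+        # in the newly swept scan elements, selected via the tracking iname.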
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+        inames_to_remove.add(scan_iname)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__seq_scan"
+                .format(sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value,
+                stride, track_iname)
+
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_" + scan_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        init_id = insn_id_gen(
+                "%s_%s_init" % (insn.id, "_".join(expr.inames)))
+
+        init_insn_depends_on = frozenset()
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=acc_vars,
+                within_inames=outer_insn_inames - frozenset(
+                    (sweep_iname,) + expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on,
+                expression=expr.operation.neutral_element(*arg_dtypes))
+
+        generated_insns.append(init_insn)
+
+        update_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, update_insn_depends_on))
+
+        update_id = insn_id_gen(
+                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
+
+        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname])
+        if insn.within_inames_is_final:
+            update_insn_iname_deps = insn.within_inames | set([track_iname])
+
+        scan_insn = make_assignment(
+                id=update_id,
+                assignees=acc_vars,
+                expression=expr.operation(
+                    arg_dtypes,
+                    _strip_if_scalar(acc_vars, acc_vars),
+                    _strip_if_scalar(acc_vars, updated_inner_exprs)),
+                depends_on=frozenset(update_insn_depends_on),
+                within_inames=update_insn_iname_deps,
+                no_sync_with=insn.no_sync_with,
+                within_inames_is_final=insn.within_inames_is_final)
+
+        generated_insns.append(scan_insn)
+
+        new_insn_add_depends_on.add(scan_insn.id)
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0]
+        else:
+            return acc_vars
+
+    # }}}
+
+    # {{{ local-parallel scan
+
+    def map_scan_local(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname,
+            sweep_min_value, scan_min_value, stride):
+
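+        # The local-parallel scan: initialize a local accumulator array to
+        # the neutral element, transfer each work item's contribution into
+        # it, then combine entries in log2(scan_size) doubling stages.
+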
+        scan_size = _get_int_iname_size(sweep_iname)
+
+        assert scan_size > 0
+
+        if scan_size == 1:
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+
+        from loopy.kernel.data import LocalIndexTagBase
+        outer_local_inames = tuple(
+                oiname
+                for oiname in outer_insn_inames
+                if isinstance(
+                    kernel.iname_to_tag.get(oiname),
+                    LocalIndexTagBase)
+                and oiname != sweep_iname)
+
+        from pymbolic import var
+        outer_local_iname_vars = tuple(
+                var(oiname) for oiname in outer_local_inames)
+
+        outer_local_iname_sizes = tuple(
+                _get_int_iname_size(oiname)
+                for oiname in outer_local_inames)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__pre_scan"
+                .format(sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+                track_iname)
+
+        # {{{ add separate iname to carry out the scan
+
+        # Doing this sheds any odd conditionals that may be active
+        # on our scan_iname.
+
+        base_exec_iname = var_name_gen(sweep_iname + "__scan")
+        domains.append(_make_slab_set(base_exec_iname, scan_size))
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+        # }}}
+
+        from loopy.kernel.data import temp_var_scope
+
+        read_var_names = make_temporaries(
+                name_based_on="read_"+scan_iname+"_arg_{index}",
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+scan_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (scan_size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+        read_vars = tuple(var(n) for n in read_var_names)
+
+        base_iname_deps = (outer_insn_inames
+                - frozenset(expr.inames) - frozenset([sweep_iname]))
+
+        neutral = expr.operation.neutral_element(*arg_dtypes)
+
+        init_insn_depends_on = insn.depends_on
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname))
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=neutral,
+                within_inames=base_iname_deps | frozenset([base_exec_iname]),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on)
+        generated_insns.append(init_insn)
+
+        transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, transfer_insn_depends_on))
+
+        from loopy.symbolic import Reduction
+
+        from loopy.symbolic import pw_aff_to_expr
+        sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
+
+        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname))
+        transfer_insn = make_assignment(
+                id=transfer_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars
+                            + (var(sweep_iname) - sweep_min_value_expr,)]
+                    for acc_var in acc_vars),
+                expression=Reduction(
+                    operation=expr.operation,
+                    inames=(track_iname,),
+                    expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
+                    allow_simultaneous=False,
+                    ),
+                within_inames=outer_insn_inames - frozenset(expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset(transfer_insn_depends_on),
+                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+
+        generated_insns.append(transfer_insn)
+
+        prev_id = transfer_id
+
+        istage = 0
+        cur_size = 1
+
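+        # Doubling stages (Hillis-Steele): in stage s, every entry at index
+        # t >= cur_size (= 2**s) reads the entry at t - cur_size and combines
+        # it with its own, so all prefixes are complete after
+        # ceil(log2(scan_size)) stages.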
+        while cur_size < scan_size:
+            stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
+            domains.append(
+                    _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+            for read_var, acc_var in zip(read_vars, acc_vars):
+                read_stage_id = insn_id_gen(
+                        "scan_%s_read_stage_%d" % (scan_iname, istage))
+
+                read_stage_insn = make_assignment(
+                        id=read_stage_id,
+                        assignees=(read_var,),
+                        expression=(
+                                acc_var[
+                                    outer_local_iname_vars
+                                    + (var(stage_exec_iname) - cur_size,)]),
+                        within_inames=(
+                            base_iname_deps | frozenset([stage_exec_iname])),
+                        within_inames_is_final=insn.within_inames_is_final,
+                        depends_on=frozenset([prev_id]))
+
+                if cur_size == 1:
+                    # Performance hack: don't add a barrier here with transfer_insn.
+                    # NOTE: This won't work if the way that local inames
+                    # are lowered changes.
+                    read_stage_insn = read_stage_insn.copy(
+                            no_sync_with=(
+                                read_stage_insn.no_sync_with
+                                | frozenset([(transfer_id, "any")])))
+
+                generated_insns.append(read_stage_insn)
+                prev_id = read_stage_id
+
+            write_stage_id = insn_id_gen(
+                    "scan_%s_write_stage_%d" % (scan_iname, istage))
+            write_stage_insn = make_assignment(
+                    id=write_stage_id,
+                    assignees=tuple(
+                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                        for acc_var in acc_vars),
+                    expression=expr.operation(
+                        arg_dtypes,
+                        _strip_if_scalar(acc_vars, read_vars),
+                        _strip_if_scalar(acc_vars, tuple(
+                            acc_var[
+                                outer_local_iname_vars + (var(stage_exec_iname),)]
+                            for acc_var in acc_vars))
+                        ),
+                    within_inames=(
+                        base_iname_deps | frozenset([stage_exec_iname])),
+                    within_inames_is_final=insn.within_inames_is_final,
+                    depends_on=frozenset([prev_id]),
+                    )
+
+            generated_insns.append(write_stage_insn)
+            prev_id = write_stage_id
+
+            cur_size *= 2
+            istage += 1
+
+        new_insn_add_depends_on.add(prev_id)
+        new_insn_add_within_inames.add(sweep_iname)
+
+        output_idx = var(sweep_iname) - sweep_min_value_expr
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0][outer_local_iname_vars + (output_idx,)]
+        else:
+            return [acc_var[outer_local_iname_vars + (output_idx,)]
+                    for acc_var in acc_vars]
+
+    # }}}
+
     # {{{ seq/par dispatch
 
     def map_reduction(expr, rec, nresults=1):
@@ -825,31 +1663,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
-        n_sequential = 0
-        n_local_par = 0
+        iname_classes = _classify_reduction_inames(temp_kernel, expr.inames)
 
-        from loopy.kernel.data import (
-                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-                ParallelTag)
-        for iname in expr.inames:
-            iname_tag = kernel.iname_to_tag.get(iname)
+        n_sequential = len(iname_classes.sequential)
+        n_local_par = len(iname_classes.local_parallel)
+        n_nonlocal_par = len(iname_classes.nonlocal_parallel)
+
+        really_force_scan = force_scan and (
+                len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan)
+
+        def _error_if_force_scan_on(cls, msg):
+            if really_force_scan:
+                raise cls(msg)
 
-            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
-                # These are nominally parallel, but we can live with
-                # them as sequential.
-                n_sequential += 1
+        may_be_implemented_as_scan = False
+        if force_scan or automagic_scans_ok:
+            from loopy.diagnostic import ReductionIsNotTriangularError
 
-            elif isinstance(iname_tag, LocalIndexTagBase):
-                n_local_par += 1
+            try:
+                # Try to determine scan candidate information (sweep iname, scan
+                # iname, etc.).
+                scan_param = _try_infer_scan_candidate_from_expr(
+                        temp_kernel, expr, outer_insn_inames,
+                        sweep_iname=force_outer_iname_for_scan)
 
-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
-                        "tagged '%s'"
-                        % (iname, type(iname_tag).__name__))
+            except ValueError as v:
+                error = str(v)
 
             else:
-                n_sequential += 1
+                # Check whether the reduction is triangular (somewhat expensive).
+                may_be_implemented_as_scan, error = (
+                        _check_reduction_is_triangular(
+                            temp_kernel, expr, scan_param))
+
+            if not may_be_implemented_as_scan:
+                _error_if_force_scan_on(ReductionIsNotTriangularError, error)
+
+        # {{{ sanity checks
 
         if n_local_par and n_sequential:
             raise LoopyError("Reduction over '%s' contains both parallel and "
@@ -865,21 +1715,87 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))
 
-        if n_sequential:
-            assert n_local_par == 0
-            return map_reduction_seq(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        elif n_local_par:
-            return map_reduction_local(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        else:
+        if n_nonlocal_par:
+            bad_inames = iname_classes.nonlocal_parallel
+            raise LoopyError("the only form of parallelism supported "
+                    "by reductions is 'local'--found iname(s) '%s' "
+                    "respectively tagged '%s'"
+                    % (", ".join(bad_inames),
+                       ", ".join(kernel.iname_to_tag[iname]
+                                 for iname in bad_inames)))
+
+        if n_local_par == 0 and n_sequential == 0:
             from loopy.diagnostic import warn_with_kernel
             warn_with_kernel(kernel, "empty_reduction",
                     "Empty reduction found (no inames to reduce over). "
                     "Eliminating.")
 
+            # We're not supposed to reduce/sum at all. (Note how this is distinct
+            # from an empty reduction--there is an element here, just no inames
+            # to reduce over. It's rather similar to an array with shape ()
+            # (as opposed to an empty array) in numpy.)
+
             return expr.expr
 
+        # }}}
+
+        if may_be_implemented_as_scan:
+            assert force_scan or automagic_scans_ok
+
+            # We require the "scan" iname to be tagged sequential.
+            if n_sequential:
+                sweep_iname = scan_param.sweep_iname
+                sweep_class = _classify_reduction_inames(kernel, (sweep_iname,))
+
+                sequential = sweep_iname in sweep_class.sequential
+                parallel = sweep_iname in sweep_class.local_parallel
+                bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
+
+                if sweep_iname not in outer_insn_inames:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' was detected, but is not an iname "
+                            "for the instruction." % sweep_iname)
+                elif bad_parallel:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' has an unsupported parallel tag '%s' "
+                            "- the only parallelism allowed is 'local'." %
+                            (sweep_iname, temp_kernel.iname_to_tag[sweep_iname]))
+                elif parallel:
+                    return map_scan_local(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+                elif sequential:
+                    return map_scan_seq(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+
+                # fallthrough to reduction implementation
+
+            else:
+                assert n_local_par > 0
+                scan_iname, = expr.inames
+                _error_if_force_scan_on(LoopyError,
+                        "Scan iname '%s' is parallel tagged: this is not allowed "
+                        "(only the sweep iname should be tagged if parallelism "
+                        "is desired)." % scan_iname)
+
+                # fallthrough to reduction implementation
+
+        if n_sequential:
+            assert n_local_par == 0
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+        else:
+            assert n_local_par > 0
+            return map_reduction_local(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
     # }}}
 
     from loopy.symbolic import ReductionCallbackMapper
@@ -985,6 +1901,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     kernel = lp.tag_inames(kernel, new_iname_tags)
 
+    # TODO: remove unused inames...
+
     kernel = (
             _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                 kernel))
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index c078da2ec58dabbbf646bfcf593ea0138941cc85..57cf74b808ae1a7107e76a18a3876785ab8baabd 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1908,14 +1908,14 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 
                 if (gsize or lsize):
                     if not kernel.options.disable_global_barriers:
-                        logger.info("%s: barrier insertion: global" % kernel.name)
+                        logger.debug("%s: barrier insertion: global" % kernel.name)
                         gen_sched = insert_barriers(kernel, gen_sched,
                                 kind="global", verify_only=True)
 
-                    logger.info("%s: barrier insertion: local" % kernel.name)
+                    logger.debug("%s: barrier insertion: local" % kernel.name)
                     gen_sched = insert_barriers(kernel, gen_sched, kind="local",
                             verify_only=False)
-                    logger.info("%s: barrier insertion: done" % kernel.name)
+                    logger.debug("%s: barrier insertion: done" % kernel.name)
 
                 new_kernel = kernel.copy(
                         schedule=gen_sched,
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index 5de677e72708be844a5276b3d40ace8b1dad9da0..f9b08d3434556f912107726f125abbfa110f5676 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -23,10 +23,6 @@ THE SOFTWARE.
 """
 
 from loopy.kernel.data import temp_var_scope
-from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem,
-                            RunInstruction, Barrier)
-
-from pytools import memoize_method
 
 
 # {{{ block boundary finder
@@ -37,6 +33,7 @@ def get_block_boundaries(schedule):
     :class:`loopy.schedule.BlockBeginItem`s to
     :class:`loopy.schedule.BlockEndItem`s and vice versa.
     """
+    from loopy.schedule import (BeginBlockItem, EndBlockItem)
     block_bounds = {}
     active_blocks = []
     for idx, sched_item in enumerate(schedule):
@@ -51,109 +48,24 @@ def get_block_boundaries(schedule):
 # }}}
 
 
-# {{{ instruction query utility
-
-class InstructionQuery(object):
-
-    def __init__(self, kernel):
-        self.kernel = kernel
-        block_bounds = get_block_boundaries(kernel.schedule)
-        subkernel_slices = {}
-        from six import iteritems
-        for start, end in iteritems(block_bounds):
-            sched_item = kernel.schedule[start]
-            if isinstance(sched_item, CallKernel):
-                subkernel_slices[sched_item.kernel_name] = slice(start, end + 1)
-        self.subkernel_slices = subkernel_slices
-
-    @memoize_method
-    def subkernels(self):
-        return frozenset(self.subkernel_slices.keys())
-
-    @memoize_method
-    def insns_reading_or_writing(self, var):
-        return frozenset(insn.id for insn in self.kernel.instructions
-            if var in insn.read_dependency_names()
-                or var in insn.assignee_var_names())
-
-    @memoize_method
-    def insns_in_subkernel(self, subkernel):
-        return frozenset(sched_item.insn_id for sched_item
-            in self.kernel.schedule[self.subkernel_slices[subkernel]]
-            if isinstance(sched_item, RunInstruction))
-
-    @memoize_method
-    def temporaries_read_in_subkernel(self, subkernel):
-        return frozenset(
-            var
-            for insn in self.insns_in_subkernel(subkernel)
-            for var in self.kernel.id_to_insn[insn].read_dependency_names()
-            if var in self.kernel.temporary_variables)
-
-    @memoize_method
-    def temporaries_written_in_subkernel(self, subkernel):
-        return frozenset(
-            var
-            for insn in self.insns_in_subkernel(subkernel)
-            for var in self.kernel.id_to_insn[insn].assignee_var_names()
-            if var in self.kernel.temporary_variables)
-
-    @memoize_method
-    def temporaries_read_or_written_in_subkernel(self, subkernel):
-        return (
-            self.temporaries_read_in_subkernel(subkernel) |
-            self.temporaries_written_in_subkernel(subkernel))
-
-    @memoize_method
-    def inames_in_subkernel(self, subkernel):
-        subkernel_start = self.subkernel_slices[subkernel].start
-        return frozenset(self.kernel.schedule[subkernel_start].extra_inames)
-
-    @memoize_method
-    def pre_and_post_barriers(self, subkernel):
-        subkernel_start = self.subkernel_slices[subkernel].start
-        subkernel_end = self.subkernel_slices[subkernel].stop
-
-        def is_global_barrier(item):
-            return isinstance(item, Barrier) and item.kind == "global"
-
-        try:
-            pre_barrier = next(item for item in
-                    self.kernel.schedule[subkernel_start::-1]
-                    if is_global_barrier(item)).originating_insn_id
-        except StopIteration:
-            pre_barrier = None
-
-        try:
-            post_barrier = next(item for item in
-                    self.kernel.schedule[subkernel_end:]
-                    if is_global_barrier(item)).originating_insn_id
-        except StopIteration:
-            post_barrier = None
-
-        return (pre_barrier, post_barrier)
-
-    @memoize_method
-    def hw_inames(self, insn_id):
-        """
-        Return the inames that insn runs in and that are tagged as hardware
-        parallel.
-        """
-        from loopy.kernel.data import HardwareParallelTag
-        return set(iname for iname in self.kernel.insn_inames(insn_id)
-                   if isinstance(self.kernel.iname_to_tag.get(iname),
-                                 HardwareParallelTag))
-
-    @memoize_method
-    def common_hw_inames(self, insn_ids):
-        """
-        Return the common set of hardware parallel tagged inames among
-        the list of instructions.
-        """
-        # Get the list of hardware inames in which the temporary is defined.
-        if len(insn_ids) == 0:
-            return set()
-        return set.intersection(*(self.hw_inames(id) for id in insn_ids))
+# {{{ subkernel tools
+
+def temporaries_read_in_subkernel(kernel, subkernel):
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
+    return frozenset(tv
+            for insn_id in insn_ids
+            for tv in kernel.id_to_insn[insn_id].read_dependency_names()
+            if tv in kernel.temporary_variables)
+
+
+def temporaries_written_in_subkernel(kernel, subkernel):
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
+    return frozenset(tv
+            for insn_id in insn_ids
+            for tv in kernel.id_to_insn[insn_id].write_dependency_names()
+            if tv in kernel.temporary_variables)
 
 # }}}
 
@@ -166,23 +78,27 @@ def add_extra_args_to_schedule(kernel):
     instructions in the schedule with global temporaries.
     """
     new_schedule = []
-
-    insn_query = InstructionQuery(kernel)
+    from loopy.schedule import CallKernel
 
     for sched_item in kernel.schedule:
         if isinstance(sched_item, CallKernel):
-            subrange_temporaries = (insn_query
-                .temporaries_read_or_written_in_subkernel(sched_item.kernel_name))
+            subkernel = sched_item.kernel_name
+
+            used_temporaries = (
+                    temporaries_read_in_subkernel(kernel, subkernel)
+                    | temporaries_written_in_subkernel(kernel, subkernel))
+
             more_args = set(tv
-                for tv in subrange_temporaries
-                if
-                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
-                and
-                kernel.temporary_variables[tv].initializer is None
-                and
-                tv not in sched_item.extra_args)
+                    for tv in used_temporaries
+                    if
+                    kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
+                    and
+                    kernel.temporary_variables[tv].initializer is None
+                    and
+                    tv not in sched_item.extra_args)
+
             new_schedule.append(sched_item.copy(
-                extra_args=sched_item.extra_args + sorted(more_args)))
+                    extra_args=sched_item.extra_args + sorted(more_args)))
         else:
             new_schedule.append(sched_item)
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index fde8643bf92b7ad56bb47975fa7ede1bda9b399c..9b15ec471fb681698b85c1dd2f92376fbc731f00 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,6 +38,7 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
+.. autoclass:: GuardedPwQPolynomial
 .. autoclass:: ToCountMap
 .. autoclass:: Op
 .. autoclass:: MemAccess
@@ -52,6 +53,66 @@ __doc__ = """
 """
 
 
+# {{{ GuardedPwQPolynomial
+
+class GuardedPwQPolynomial(object):
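+    """A :class:`islpy.PwQPolynomial` together with the domain over which
+    it is valid. Sums and products intersect the validity domains of their
+    operands, and :meth:`eval_with_dict` raises an error for evaluation
+    points outside the resulting domain.
+    """
+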
+    def __init__(self, pwqpolynomial, valid_domain):
+        self.pwqpolynomial = pwqpolynomial
+        self.valid_domain = valid_domain
+
+    def __add__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other,
+                    self.valid_domain)
+
+    __radd__ = __add__
+
+    def __mul__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other,
+                    self.valid_domain)
+
+    __rmul__ = __mul__
+
+    def eval_with_dict(self, value_dict):
+        space = self.pwqpolynomial.space
+        pt = isl.Point.zero(space.params())
+
+        for i in range(space.dim(dim_type.param)):
+            par_name = space.get_dim_name(dim_type.param, i)
+            pt = pt.set_coordinate_val(
+                dim_type.param, i, value_dict[par_name])
+
+        if not (isl.Set.from_point(pt) <= self.valid_domain):
+            raise ValueError("evaluation point outside of domain of "
+                    "definition of piecewise quasipolynomial")
+
+        return self.pwqpolynomial.eval(pt).to_python()
+
+    @staticmethod
+    def zero():
+        p = isl.PwQPolynomial('{ 0 }')
+        return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space))
+
+    def __str__(self):
+        return str(self.pwqpolynomial)
+
+    def __repr__(self):
+        return repr(self.pwqpolynomial)
+
+# }}}
+
+
 # {{{ ToCountMap
 
 class ToCountMap(object):
@@ -66,16 +127,17 @@ class ToCountMap(object):
 
     """
 
-    def __init__(self, init_dict=None):
+    def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
+        self.val_type = val_type
 
     def __add__(self, other):
         result = self.count_map.copy()
         for k, v in six.iteritems(other.count_map):
             result[k] = self.count_map.get(k, 0) + v
-        return ToCountMap(result)
+        return ToCountMap(result, self.val_type)
 
     def __radd__(self, other):
         if other != 0:
@@ -86,7 +148,7 @@ class ToCountMap(object):
         return self
 
     def __mul__(self, other):
-        if isinstance(other, isl.PwQPolynomial):
+        if isinstance(other, GuardedPwQPolynomial):
             return ToCountMap(dict(
                 (index, self.count_map[index]*other)
                 for index in self.keys()))
@@ -101,7 +163,11 @@ class ToCountMap(object):
         try:
             return self.count_map[index]
         except KeyError:
-            return isl.PwQPolynomial('{ 0 }')
+            # TODO: what is the best way to handle this?
+            if self.val_type is GuardedPwQPolynomial:
+                return GuardedPwQPolynomial.zero()
+            else:
+                return 0
 
     def __setitem__(self, index, value):
         self.count_map[index] = value
@@ -112,6 +178,9 @@ class ToCountMap(object):
     def __len__(self):
         return len(self.count_map)
 
+    def get(self, key, default=None):
+        return self.count_map.get(key, default)
+
     def items(self):
         return self.count_map.items()
 
@@ -122,14 +191,20 @@ class ToCountMap(object):
         return self.count_map.pop(item)
 
     def copy(self):
-        return ToCountMap(dict(self.count_map))
+        return ToCountMap(dict(self.count_map), self.val_type)
+
+    def with_set_attributes(self, **kwargs):
+        return ToCountMap(dict(
+            (key.copy(**kwargs), val)
+            for key, val in six.iteritems(self.count_map)),
+            self.val_type)
 
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
 
-        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
-                             the :class:`ToCountMap`, each given a list of
-                             allowable values for that key field.
+        :arg kwargs: Keyword arguments matching fields in the keys of
+                 the :class:`ToCountMap`, each given a list of
+                 allowable values for that key field.
 
         :return: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` that match the field values
@@ -149,7 +224,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         from loopy.types import to_loopy_type
         if 'dtype' in kwargs.keys():
@@ -175,10 +250,10 @@ class ToCountMap(object):
     def filter_by_func(self, func):
         """Keep items that pass a test.
 
-        :parameter func: A function that takes a map key a parameter and
-                         returns a :class:`bool`.
+        :arg func: A function that takes a map key as a parameter and
+             returns a :class:`bool`.
 
-        :return: A :class:`ToCountMap` containing the subset of the items in
+        :return: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` for which func(key) is true.
 
         Example usage::
@@ -197,7 +272,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # for each item in self.count_map, call func on the key
         for self_key, self_val in self.items():
@@ -210,7 +285,7 @@ class ToCountMap(object):
         """Group map items together, distinguishing by only the key fields
            passed in args.
 
-        :parameter \*args: Zero or more :class:`str` fields of map keys.
+        :arg args: Zero or more :class:`str` fields of map keys.
 
         :return: A :class:`ToCountMap` containing the same total counts
                  grouped together by new keys that only contain the fields
@@ -252,7 +327,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # make sure all item keys have same type
         if self.count_map:
@@ -315,23 +390,36 @@ class ToCountMap(object):
             bytes_processed = int(key.dtype.itemsize) * val
             result[key] = bytes_processed
 
+        # TODO: again, is this okay?
+        result.val_type = int
+
         return result
 
     def sum(self):
         """Add all counts in ToCountMap.
 
-        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
+        :return: A :class:`GuardedPwQPolynomial` or :class:`int` containing
+                 the sum of counts.
 
         """
-        total = isl.PwQPolynomial('{ 0 }')
+
+        if self.val_type is GuardedPwQPolynomial:
+            total = GuardedPwQPolynomial.zero()
+        else:
+            total = 0
+
         for k, v in self.items():
-            if not isinstance(v, isl.PwQPolynomial):
-                raise ValueError("ToCountMap: sum() encountered type {0} but "
-                                 "may only be used on PwQPolynomials."
-                                 .format(type(v)))
             total += v
         return total
 
+    # TODO: test and document
+    def eval(self, params):
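+        """Evaluate each count in the map with the parameter values in
+        *params* (a dict mapping parameter names to values) and return a
+        :class:`ToCountMap` of integers."""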
+        result = self.copy()
+        for key, val in self.items():
+            result[key] = val.eval_with_dict(params)
+        result.val_type = int
+        return result
+
     def eval_and_sum(self, params):
         """Add all counts in :class:`ToCountMap` and evaluate with provided
         parameter dict.
@@ -364,8 +452,10 @@ def stringify_stats_mapping(m):
     return result
 
 
+# {{{ Op descriptor
+
 class Op(object):
-    """An arithmetic operation.
+    """A descriptor for a type of arithmetic operation.
 
     .. attribute:: dtype
 
@@ -379,6 +469,8 @@ class Op(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, dtype=None, name=None):
         self.name = name
         if dtype is None:
@@ -398,19 +490,15 @@ class Op(object):
         return hash(str(self))
 
     def __repr__(self):
-        if self.dtype is None:
-            dtype = 'None'
-        else:
-            dtype = str(self.dtype)
-        if self.name is None:
-            name = 'None'
-        else:
-            name = self.name
-        return "Op("+dtype+", "+name+")"
+        return "Op(%s, %s)" % (self.dtype, self.name)
+
+# }}}
 
 
+# {{{ MemAccess descriptor
+
 class MemAccess(object):
-    """A memory access.
+    """A descriptor for a type of memory access.
 
     .. attribute:: mtype
 
@@ -439,6 +527,8 @@ class MemAccess(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
                  variable=None):
         self.mtype = mtype
@@ -461,6 +551,16 @@ class MemAccess(object):
             raise NotImplementedError("MemAccess: variable must be None when "
                                       "mtype is 'local'")
 
+    def copy(self, mtype=None, dtype=None, stride=None, direction=None,
+            variable=None):
+        return MemAccess(
+                mtype=mtype if mtype is not None else self.mtype,
+                dtype=dtype if dtype is not None else self.dtype,
+                stride=stride if stride is not None else self.stride,
+                direction=direction if direction is not None else self.direction,
+                variable=variable if variable is not None else self.variable,
+                )
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 (self.mtype is None or other.mtype is None or
@@ -501,11 +601,70 @@ class MemAccess(object):
         return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \
                + direction + ", " + variable + ")"
 
+# }}}
 
-# {{{ ExpressionOpCounter
 
-class ExpressionOpCounter(CombineMapper):
+# {{{ counter base
+
+class CounterBase(CombineMapper):
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.type_inference import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    def map_if(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_common_subexpression(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+    map_substitution = map_common_subexpression
+    map_derivative = map_common_subexpression
+    map_slice = map_common_subexpression
+
+    # preprocessing should have removed these
+    def map_reduction(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+# }}}
+
+
+# {{{ ExpressionOpCounter
 
+class ExpressionOpCounter(CounterBase):
     def __init__(self, knl):
         self.knl = knl
         from loopy.type_inference import TypeInferenceMapper
@@ -620,106 +779,59 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
-# {{{ LocalSubscriptCounter
-
-class LocalSubscriptCounter(CombineMapper):
-
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-
-    def combine(self, values):
-        return sum(values)
-
-    def map_constant(self, expr):
-        return ToCountMap()
+class MemAccessCounter(CounterBase):
+    pass
 
-    map_tagged_variable = map_constant
-    map_variable = map_constant
 
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
+# {{{ LocalMemAccessCounter
 
-    def map_subscript(self, expr):
+class LocalMemAccessCounter(MemAccessCounter):
+    def count_var_access(self, dtype, name, subscript):
         sub_map = ToCountMap()
-        name = expr.aggregate.name  # name of array
         if name in self.knl.temporary_variables:
             array = self.knl.temporary_variables[name]
             if array.is_local:
-                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
-        return sub_map + self.rec(expr.index)
-
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
-
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
+                sub_map[MemAccess(mtype='local', dtype=dtype)] = 1
+        return sub_map
 
-    def map_substitution(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
+    def map_variable(self, expr):
+        return self.count_var_access(
+                    self.type_inf(expr), expr.name, None)
 
-    def map_derivative(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
+    map_tagged_variable = map_variable
 
-    def map_slice(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
+    def map_subscript(self, expr):
+        return (
+                self.count_var_access(
+                    self.type_inf(expr), expr.aggregate.name, expr.index)
+                + self.rec(expr.index))
 
 # }}}
 
 
-# {{{ GlobalSubscriptCounter
-
-class GlobalSubscriptCounter(CombineMapper):
+# {{{ GlobalMemAccessCounter
 
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-
-    def combine(self, values):
-        return sum(values)
+class GlobalMemAccessCounter(MemAccessCounter):
+    def map_variable(self, expr):
+        name = expr.name
 
-    def map_constant(self, expr):
-        return ToCountMap()
+        if name in self.knl.arg_dict:
+            array = self.knl.arg_dict[name]
+        else:
+            # this is a temporary variable
+            return ToCountMap()
 
-    map_tagged_variable = map_constant
-    map_variable = map_constant
+        if not isinstance(array, lp.GlobalArg):
+            # this array is not in global memory
+            return ToCountMap()
 
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
+        return ToCountMap({MemAccess(mtype='global',
+                                     dtype=self.type_inf(expr), stride=0,
+                                     variable=name): 1})
 
     def map_subscript(self, expr):
-        name = expr.aggregate.name  # name of array
+        name = expr.aggregate.name
 
         if name in self.knl.arg_dict:
             array = self.knl.arg_dict[name]
@@ -806,47 +918,6 @@ class GlobalSubscriptCounter(CombineMapper):
                                      stride=total_stride, variable=name): 1}
                           ) + self.rec(expr.index)
 
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
-    def map_derivative(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
-
-    def map_slice(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
-
 # }}}
 
 
@@ -919,9 +990,13 @@ class AccessFootprintGatherer(CombineMapper):
 
 # {{{ count
 
-def count(kernel, set):
+def add_assumptions_guard(kernel, pwqpolynomial):
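+    """Wrap *pwqpolynomial* in a :class:`GuardedPwQPolynomial` that is only
+    claimed valid under the kernel's assumptions."""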
+    return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions)
+
+
+def count(kernel, set, space=None):
     try:
-        return set.card()
+        return add_assumptions_guard(kernel, set.card())
     except AttributeError:
         pass
 
@@ -948,7 +1023,11 @@ def count(kernel, set):
             if stride is None:
                 stride = 1
 
-            length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride)
+            length_pwaff = dmax - dmin + stride
+            if space is not None:
+                length_pwaff = length_pwaff.align_params(space)
+
+            length = isl.PwQPolynomial.from_pw_aff(length_pwaff)
             length = length.scale_down_val(stride)
 
             if bset_count is None:
@@ -1008,46 +1087,102 @@ def count(kernel, set):
                         "number of integer points in your loop "
                         "domain.")
 
-    return count
+    return add_assumptions_guard(kernel, count)
 
-# }}}
 
+def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
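+    """Return a :class:`GuardedPwQPolynomial` for the product of the sizes of
+    all hardware axes that *insn* does not use. Counts are multiplied by this
+    factor to account for work performed redundantly along unused axes."""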
+    # FIXME: Multi-kernel support
+    gsize, lsize = knl.get_grid_size_upper_bounds()
 
-# {{{ get_op_poly
+    g_used = set()
+    l_used = set()
 
-def get_op_poly(knl, numpy_types=True):
+    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+    for iname in knl.insn_inames(insn):
+        tag = knl.iname_to_tag.get(iname)
 
-    """Count the number of operations in a loopy kernel.
+        if isinstance(tag, LocalIndexTag):
+            l_used.add(tag.axis)
+        elif isinstance(tag, GroupIndexTag):
+            g_used.add(tag.axis)
 
-    get_op_poly is deprecated. Use get_op_map instead.
+    def mult_grid_factor(used_axes, sizes):
+        result = 1
+        for iaxis, size in enumerate(sizes):
+            if iaxis not in used_axes:
+                if not isinstance(size, int):
+                    if space is not None:
+                        size = size.align_params(space)
 
-    """
-    warn_with_kernel(knl, "depricated_get_op_poly",
-                     "get_op_poly is deprecated. Use get_op_map instead.")
-    return get_op_map(knl, numpy_types)
+                    size = isl.PwQPolynomial.from_pw_aff(size)
+
+                result = result * size
+
+        return result
+
+    if disregard_local_axes:
+        result = mult_grid_factor(g_used, gsize)
+    else:
+        result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize)
+
+    return add_assumptions_guard(knl, result)
+
+
+def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False):
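+    """Return the number of times *insn* runs, as a guarded polynomial in the
+    kernel parameters. Local (work-item) axes may be disregarded, and the
+    unused-axis factor for redundant work may be folded in via
+    *count_redundant_work*."""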
+    insn_inames = knl.insn_inames(insn)
+
+    if disregard_local_axes:
+        from loopy.kernel.data import LocalIndexTag
+        insn_inames = [iname for iname in insn_inames if not
+                       isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+
+    inames_domain = knl.get_inames_domain(insn_inames)
+    domain = (inames_domain.project_out_except(
+                            insn_inames, [dim_type.set]))
+
+    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
+            set=[], params=knl.outer_params())
+
+    c = count(knl, domain, space=space)
+
+    if count_redundant_work:
+        unused_fac = get_unused_hw_axes_factor(knl, insn,
+                        disregard_local_axes=disregard_local_axes,
+                        space=space)
+        return c * unused_fac
+    else:
+        return c
 
 # }}}
 
 
-def get_op_map(knl, numpy_types=True):
+# {{{ get_op_map
+
+def get_op_map(knl, numpy_types=True, count_redundant_work=False):
 
     """Count the number of operations in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+
+    :arg numpy_types: A :class:`bool` specifying whether the types
+         in the returned mapping should be numpy types
+         instead of :class:`loopy.LoopyType`.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`Op` specifies the characteristics of the arithmetic
-               operation.
+        - The :class:`Op` specifies the characteristics of the arithmetic
+          operation.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of operations of
-               the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` parameter *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of operations of
+          the kind specified in the key (in terms of the
+          :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
@@ -1069,14 +1204,10 @@ def get_op_map(knl, numpy_types=True):
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
-        # how many times is this instruction executed?
-        # check domain size:
-        insn_inames = knl.insn_inames(insn)
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_map = op_map + ops*count(knl, domain)
+        op_map = op_map + ops*count_insn_runs(
+                knl, insn,
+                count_redundant_work=count_redundant_work)
 
     if numpy_types:
         op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
@@ -1085,73 +1216,36 @@ def get_op_map(knl, numpy_types=True):
 
     return op_map
 
-
-#TODO test deprecated functions?
-def get_lmem_access_poly(knl):
-    """Count the number of local memory accesses in a loopy kernel.
-
-    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['local'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
-                     "get_lmem_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['local'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-# {{{ get_gmem_access_poly
-
-def get_gmem_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
 # }}}
 
 
-def get_mem_access_map(knl, numpy_types=True):
+# {{{ get_mem_access_map
+
+def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False):
     """Count the number of memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
-                    counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+        counted.
+
+    :arg numpy_types: A :class:`bool` specifying whether the types
+        in the returned mapping should be numpy types
+        instead of :class:`loopy.LoopyType`.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`MemAccess` specifies the characteristics of the
-               memory access.
+        - The :class:`MemAccess` specifies the characteristics of the
+          memory access.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of memory
-               accesses with the characteristics specified in the key (in terms
-               of the :class:`loopy.LoopKernel` *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of memory
+          accesses with the characteristics specified in the key (in terms
+          of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
@@ -1196,102 +1290,74 @@ def get_mem_access_map(knl, numpy_types=True):
     cache_holder = CacheHolder()
 
     @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_inames, uniform=False):
-        if uniform:
-            from loopy.kernel.data import LocalIndexTag
-            insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                insn_inames, [dim_type.set]))
-        return count(knl, domain)
+    def get_insn_count(knl, insn_id, uniform=False):
+        insn = knl.id_to_insn[insn_id]
+        return count_insn_runs(
+                knl, insn, disregard_local_axes=uniform,
+                count_redundant_work=count_redundant_work)
 
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_map = ToCountMap()
-    subs_counter_g = GlobalSubscriptCounter(knl)
-    subs_counter_l = LocalSubscriptCounter(knl)
+    access_map = ToCountMap()
+    access_counter_g = GlobalMemAccessCounter(knl)
+    access_counter_l = LocalMemAccessCounter(knl)
 
     for insn in knl.instructions:
-        # count subscripts
-        subs_expr = subs_counter_g(insn.expression) \
-                    + subs_counter_l(insn.expression)
-
-        # distinguish loads and stores
-        for key in subs_expr.count_map:
-            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                stride=key.stride, direction='load',
-                                variable=key.variable)
-                      ] = subs_expr.pop(key)
-
-        subs_assignee_g = subs_counter_g(insn.assignee)
-        for key in subs_assignee_g.count_map:
-            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                      stride=key.stride,
-                                      direction='store',
-                                      variable=key.variable)
-                            ] = subs_assignee_g.pop(key)
-        # for now, don't count writes to local mem
-
-        insn_inames = knl.insn_inames(insn)
+        access_expr = (
+                access_counter_g(insn.expression)
+                + access_counter_l(insn.expression)
+                ).with_set_attributes(direction="load")
+
+        access_assignee_g = access_counter_g(insn.assignee).with_set_attributes(
+                direction="store")
+
+        # FIXME: (!!!!) for now, don't count writes to local mem
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.count_map:
-            map = ToCountMap({key: subs_expr[key]})
-            if (key.mtype == 'global' and
+        for key, val in six.iteritems(access_expr.count_map):
+            is_uniform = (key.mtype == 'global' and
                     isinstance(key.stride, int) and
-                    key.stride == 0):
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
-                #currently not counting stride of local mem access
-
-        for key in subs_assignee_g.count_map:
-            map = ToCountMap({key: subs_assignee_g[key]})
-            if isinstance(key.stride, int) and key.stride == 0:
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
+            # currently not counting stride of local mem access
+
+        for key, val in six.iteritems(access_assignee_g.count_map):
+            is_uniform = (key.mtype == 'global' and
+                    isinstance(key.stride, int) and
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
             # for now, don't count writes to local mem
 
     if numpy_types:
-        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+        # FIXME: Don't modify in-place
+        access_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
                                              dtype=mem_access.dtype.numpy_dtype,
                                              stride=mem_access.stride,
                                              direction=mem_access.direction,
                                              variable=mem_access.variable),
                                   count)
-                      for mem_access, count in six.iteritems(subs_map.count_map))
-
-    return subs_map
-
-
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
-    """Count the number of synchronization events each thread encounters in a
-    loopy kernel.
+                      for mem_access, count in six.iteritems(access_map.count_map))
 
-    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
-
-    """
-    warn_with_kernel(knl, "depricated_get_synchronization_poly",
-                     "get_synchronization_poly is deprecated. Use "
-                     "get_synchronization_map instead.")
-    return get_synchronization_map(knl)
+    return access_map
 
 # }}}
 
 
+# {{{ get_synchronization_map
+
 def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :return: A dictionary mapping each type of synchronization event to a
             :class:`islpy.PwQPolynomial` holding the number of events per
@@ -1358,9 +1424,10 @@ def get_synchronization_map(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    #return result.count_map #TODO is this change okay?
     return result
 
+# }}}
+
 
 # {{{ gather_access_footprints
 
@@ -1456,4 +1523,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
 
 # }}}
 
+
+# {{{ compat goop
+
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.
+
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_gmem_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
+
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
+
+
+def get_op_poly(knl, numpy_types=True):
+    """Count the number of operations in a loopy kernel.
+
+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 68cc32e56be077c7e45d11b9e2aade86b04494cc..8f924d3aee3b9f2982006fdb7b558cccac6785e3 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -167,6 +167,13 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         def base_impl(expr, type_context):
             return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]
 
+        def make_var(name):
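+            # Propagate the tag if the subscripted aggregate is a
+            # TaggedVariable, rather than dropping it.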
+            from loopy import TaggedVariable
+            if isinstance(expr.aggregate, TaggedVariable):
+                return TaggedVariable(name, expr.aggregate.tag)
+            else:
+                return var(name)
+
         from pymbolic.primitives import Variable
         if not isinstance(expr.aggregate, Variable):
             return base_impl(expr, type_context)
@@ -224,16 +231,16 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         (isinstance(ary, (ConstantArg, GlobalArg)) or
                          (isinstance(ary, TemporaryVariable) and ary.base_storage))):
                     # unsubscripted global args are pointers
-                    result = var(access_info.array_name)[0]
+                    result = make_var(access_info.array_name)[0]
 
                 else:
                     # unsubscripted temp vars are scalars
                     # (unless they use base_storage)
-                    result = var(access_info.array_name)
+                    result = make_var(access_info.array_name)
 
             else:
                 subscript, = access_info.subscripts
-                result = var(access_info.array_name)[self.rec(subscript, 'i')]
+                result = make_var(access_info.array_name)[self.rec(subscript, 'i')]
 
             if access_info.vector_index is not None:
                 return self.codegen_state.ast_builder.add_vector_access(
@@ -716,6 +723,8 @@ class CExpressionToCodeMapper(RecursiveMapper):
     def map_variable(self, expr, enclosing_prec):
         return expr.name
 
+    map_tagged_variable = map_variable
+
     def map_lookup(self, expr, enclosing_prec):
         return self.parenthesize_if_needed(
                 "%s.%s" % (
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 80a69bd00c99258b709ea18b2a716c339b888b02..35dade90494906b61aad9eb66e7271f2c5d1e180 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -32,6 +32,7 @@ from loopy.diagnostic import LoopyError
 from loopy.symbolic import Literal
 from pymbolic import var
 import pymbolic.primitives as p
+from loopy.kernel.data import temp_var_scope
 from pymbolic.mapper.stringifier import PREC_NONE
 
 from pytools import memoize_method
@@ -81,7 +82,6 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
     def map_variable(self, expr, type_context):
         tv = self.kernel.temporary_variables.get(expr.name)
 
-        from loopy.kernel.data import temp_var_scope
         if tv is not None and tv.scope == temp_var_scope.PRIVATE:
             # FIXME: This is a pretty coarse way of deciding what
             # private temporaries get duplicated. Refine? (See also
@@ -101,7 +101,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
 
         ary = self.find_array(expr)
 
-        if isinstance(ary, TemporaryVariable):
+        if (isinstance(ary, TemporaryVariable)
+                and ary.scope == temp_var_scope.PRIVATE):
+            # generate access code for access to private temporaries
+
             gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
                 lsize, = lsize
@@ -305,7 +308,6 @@ class ISPCASTBuilder(CASTBuilder):
 
         shape = decl_info.shape
 
-        from loopy.kernel.data import temp_var_scope
         if temp_var.scope == temp_var_scope.PRIVATE:
             # FIXME: This is a pretty coarse way of deciding what
             # private temporaries get duplicated. Refine? (See also
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 467bc8ee801a090641c7e7f8d7f2e7c12a921232..f24b115fd5a35af94e4a6d437550bccf86b5bee0 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -335,6 +335,8 @@ class PyOpenCLTarget(OpenCLTarget):
                         % dev_id)
 
     def preprocess(self, kernel):
+        if self.device is not None:
+            kernel = adjust_local_temp_var_storage(kernel, self.device)
         return kernel
 
     def pre_codegen_check(self, kernel):
@@ -363,14 +365,26 @@ class PyOpenCLTarget(OpenCLTarget):
             raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor)
 
     def is_vector_dtype(self, dtype):
-        from pyopencl.array import vec
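+        # Prefer pyopencl.cltypes (available in newer pyopencl releases) and
+        # fall back to the older pyopencl.array.vec namespace.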
+        try:
+            import pyopencl.cltypes as cltypes
+            vec_types = cltypes.vec_types
+        except ImportError:
+            from pyopencl.array import vec
+            vec_types = vec.types
+
         return (isinstance(dtype, NumpyType)
-                and dtype.numpy_dtype in list(vec.types.values()))
+                and dtype.numpy_dtype in list(vec_types.values()))
 
     def vector_dtype(self, base, count):
-        from pyopencl.array import vec
+        try:
+            import pyopencl.cltypes as cltypes
+            vec_types = cltypes.vec_types
+        except ImportError:
+            from pyopencl.array import vec
+            vec_types = vec.types
+
         return NumpyType(
-                vec.types[base.numpy_dtype, count],
+                vec_types[base.numpy_dtype, count],
                 target=self)
 
     def alignment_requirement(self, type_decl):
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index df2067bfa352c9e6a66e74d280c3c14b8924036c..0da502fbab8aa45ed58a0491e0f43323ecf9aeff 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -26,8 +26,8 @@ from six.moves import range, zip
 
 from pytools import memoize_method
 from pytools.py_codegen import Indentation
-from loopy.target.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase,
-                             _KernelInfo, _Kernels)
+from loopy.target.execution import (
+    KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels)
 import logging
 logger = logging.getLogger(__name__)
 
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 99ec42f44b49f546cda324dfdb3c6a5b001d2222..11951abcf17e94c0fdba51042e3060735215b423 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -133,15 +133,17 @@ class ExpressionToPythonMapper(StringifyMapper):
     def map_if(self, expr, enclosing_prec):
         # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the
         # operator precedence hierarchy (right above "or").
-        from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE
+        from pymbolic.mapper.stringifier import PREC_LOGICAL_OR
         PREC_IFTHENELSE = PREC_LOGICAL_OR - 1  # noqa
 
         return self.parenthesize_if_needed(
             "{then} if {cond} else {else_}".format(
-                then=self.rec(expr.then, PREC_IFTHENELSE),
-                cond=self.rec(expr.condition, PREC_IFTHENELSE),
-                else_=self.rec(expr.else_, PREC_IFTHENELSE)),
-            enclosing_prec, PREC_NONE)
+                # "1 if 0 if 1 else 2 else 3" is not valid Python.
+                # So force parens by using an artificially higher precedence.
+                then=self.rec(expr.then, PREC_LOGICAL_OR),
+                cond=self.rec(expr.condition, PREC_LOGICAL_OR),
+                else_=self.rec(expr.else_, PREC_LOGICAL_OR)),
+            enclosing_prec, PREC_IFTHENELSE)
 
 # }}}
 
@@ -257,7 +259,7 @@ class PythonASTBuilderBase(ASTBuilderBase):
             lbound, ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
 
-        from pymbolic.mapper.stringifier import PREC_NONE
+        from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM
         from genpy import For
 
         return For(
@@ -265,7 +267,7 @@ class PythonASTBuilderBase(ASTBuilderBase):
                 "range(%s, %s + 1)"
                 % (
                     ecm(lbound, PREC_NONE, "i"),
-                    ecm(ubound, PREC_NONE, "i"),
+                    ecm(ubound, PREC_SUM, "i"),
                     ),
                 inner)
 
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index a19e06ecdf7c9966501ebb9600ea4e01614363f4..6077332c4fc4322ac7ffb02ade4a0e24c7066245 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -681,12 +681,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                 mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)
 
+        def add_assumptions(d):
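+            # Intersect *d* with the kernel's assumptions so that the checks
+            # below only need to hold for parameter values permitted by those
+            # assumptions.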
+            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
+            assumptions, domain = isl.align_two(assumption_non_param, d)
+            return assumptions & domain
+
         # {{{ check that we got the desired domain
 
-        check_domain = check_domain.project_out_except(
-                primed_non1_saxis_names, [isl.dim_type.set])
+        check_domain = add_assumptions(
+            check_domain.project_out_except(
+                primed_non1_saxis_names, [isl.dim_type.set]))
 
-        mod_check_domain = mod_domain
+        mod_check_domain = add_assumptions(mod_domain)
 
         # re-add the prime from the new variable
         var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)
@@ -716,10 +722,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         # project out the new names from the modified domain
         orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
-        mod_check_domain = mod_domain.project_out_except(
-                orig_domain_inames, [isl.dim_type.set])
+        mod_check_domain = add_assumptions(
+                mod_domain.project_out_except(
+                    orig_domain_inames, [isl.dim_type.set]))
 
-        check_domain = domch.domain
+        check_domain = add_assumptions(domch.domain)
 
         mod_check_domain, check_domain = isl.align_two(
                 mod_check_domain, check_domain)
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 8afc1695a38a37baf165f6ec6ef6567e2012173b..3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 from loopy.diagnostic import LoopyError
 import loopy as lp
+import six
 
 from loopy.kernel.data import auto, temp_var_scope
 from pytools import memoize_method, Record
@@ -32,7 +33,7 @@ from loopy.schedule import (
             EnterLoop, LeaveLoop, RunInstruction,
             CallKernel, ReturnFromKernel, Barrier)
 
-from loopy.schedule.tools import (get_block_boundaries, InstructionQuery)
+from loopy.schedule.tools import get_block_boundaries
 
 
 import logging
@@ -135,7 +136,7 @@ class LivenessAnalysis(object):
 
     @memoize_method
     def liveness(self):
-        logging.info("running liveness analysis")
+        logger.info("running liveness analysis")
         successors = self.get_successor_relation()
         gen, kill = self.get_gen_and_kill_sets()
 
@@ -152,7 +153,7 @@ class LivenessAnalysis(object):
                     lr[idx].live_out.update(lr[succ].live_in)
                 lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx])
 
-        logging.info("done running liveness analysis")
+        logger.info("done running liveness analysis")
 
         return lr
 
@@ -193,13 +194,9 @@ class TemporarySaver(object):
 
             The name of the new temporary.
 
-        .. attribute:: orig_temporary
+        .. attribute:: orig_temporary_name
 
-            The original temporary variable object.
-
-        .. attribute:: hw_inames
-
-            The common list of hw axes that define the original object.
+            The name of the original temporary variable.
 
         .. attribute:: hw_dims
 
@@ -207,6 +204,10 @@ class TemporarySaver(object):
             of the promoted temporary value, corresponding to
             hardware dimensions
 
+        .. attribute:: hw_tags
+
+            The tags for the inames associated with hw_dims
+
         .. attribute:: non_hw_dims
 
             A list of expressions, to be added in front of the shape
@@ -214,9 +215,15 @@ class TemporarySaver(object):
             non-hardware dimensions
         """
 
-        @memoize_method
-        def as_variable(self):
-            temporary = self.orig_temporary
+        __slots__ = """
+                name
+                orig_temporary_name
+                hw_dims
+                hw_tags
+                non_hw_dims""".split()
+
+        def as_kernel_temporary(self, kernel):
+            temporary = kernel.temporary_variables[self.orig_temporary_name]
             from loopy.kernel.data import TemporaryVariable
             return TemporaryVariable(
                 name=self.name,
@@ -230,16 +237,215 @@ class TemporarySaver(object):
 
     def __init__(self, kernel):
         self.kernel = kernel
-        self.insn_query = InstructionQuery(kernel)
         self.var_name_gen = kernel.get_var_name_generator()
         self.insn_name_gen = kernel.get_instruction_id_generator()
+
         # These fields keep track of updates to the kernel.
         self.insns_to_insert = []
         self.insns_to_update = {}
         self.extra_args_to_add = {}
         self.updated_iname_to_tag = {}
         self.updated_temporary_variables = {}
-        self.saves_or_reloads_added = {}
+
+        # temporary name -> save or reload insn ids
+        from collections import defaultdict
+        self.temporary_to_save_ids = defaultdict(set)
+        self.temporary_to_reload_ids = defaultdict(set)
+        self.subkernel_to_newly_added_insn_ids = defaultdict(set)
+
+        # Maps names of base_storage to the PromotedTemporary chosen as the
+        # representative for saves/reloads
+        self.base_storage_to_representative = {}
+
+        from loopy.kernel.data import ValueArg
+        import islpy as isl
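+        # A zero-dimensional universe set over the kernel's value arguments;
+        # inames for the save/reload axes get added to it incrementally.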
+        self.new_subdomain = (
+                isl.BasicSet.universe(
+                    isl.Space.create_from_names(
+                        isl.DEFAULT_CONTEXT,
+                        set=[],
+                        params=set(
+                            arg.name for arg in kernel.args
+                            if isinstance(arg, ValueArg)))))
+
+    def find_accessing_instructions_in_subkernel(self, temporary, subkernel):
+        # Find all accessing instructions in the subkernel. If base_storage is
+        # present, this includes instructions that access aliasing memory.
+
+        aliasing_names = set([temporary])
+        base_storage = self.kernel.temporary_variables[temporary].base_storage
+
+        if base_storage is not None:
+            aliasing_names |= self.base_storage_to_temporary_map[base_storage]
+
+        from loopy.kernel.tools import get_subkernel_to_insn_id_map
+        accessing_insns_in_subkernel = set()
+        subkernel_insns = get_subkernel_to_insn_id_map(self.kernel)[subkernel]
+
+        for name in aliasing_names:
+            try:
+                accessing_insns_in_subkernel |= (
+                        self.kernel.reader_map()[name] & subkernel_insns)
+            except KeyError:
+                pass
+
+            try:
+                accessing_insns_in_subkernel |= (
+                        self.kernel.writer_map()[name] & subkernel_insns)
+            except KeyError:
+                pass
+
+        return frozenset(accessing_insns_in_subkernel)
+
+    @property
+    @memoize_method
+    def base_storage_to_temporary_map(self):
+        from collections import defaultdict
+
+        result = defaultdict(set)
+
+        for temporary in six.itervalues(self.kernel.temporary_variables):
+            if temporary.base_storage is None:
+                continue
+            result[temporary.base_storage].add(temporary.name)
+
+        return result
+
+    @property
+    @memoize_method
+    def subkernel_to_slice_indices(self):
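+        """Map each subkernel name to the half-open range of schedule indices
+        spanned by its CallKernel/ReturnFromKernel pair."""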
+        result = {}
+
+        for sched_item_idx, sched_item in enumerate(self.kernel.schedule):
+            if isinstance(sched_item, CallKernel):
+                start_idx = sched_item_idx
+            elif isinstance(sched_item, ReturnFromKernel):
+                result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx)
+
+        return result
+
+    @property
+    @memoize_method
+    def subkernel_to_surrounding_inames(self):
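+        """Map each subkernel name to the set of inames whose loops are
+        entered outside of (and hence surround) that subkernel."""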
+        current_outer_inames = set()
+        within_subkernel = False
+        result = {}
+
+        for sched_item in self.kernel.schedule:
+            if isinstance(sched_item, CallKernel):
+                within_subkernel = True
+                result[sched_item.kernel_name] = frozenset(current_outer_inames)
+            elif isinstance(sched_item, ReturnFromKernel):
+                within_subkernel = False
+            elif isinstance(sched_item, EnterLoop):
+                if not within_subkernel:
+                    current_outer_inames.add(sched_item.iname)
+            elif isinstance(sched_item, LeaveLoop):
+                current_outer_inames.discard(sched_item.iname)
+
+        return result
+
+    @memoize_method
+    def get_enclosing_global_barrier_pair(self, subkernel):
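+        """Return ``(pre_barrier, post_barrier)``, the ids of the
+        instructions originating the global barriers immediately preceding
+        and following *subkernel* in the schedule. Either entry may be
+        *None* if no such barrier exists."""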
+        subkernel_start, subkernel_end = (
+            self.subkernel_to_slice_indices[subkernel])
+
+        def is_global_barrier(item):
+            return isinstance(item, Barrier) and item.kind == "global"
+
+        try:
+            pre_barrier = next(item for item in
+                self.kernel.schedule[subkernel_start::-1]
+                if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            pre_barrier = None
+
+        try:
+            post_barrier = next(item for item in
+                self.kernel.schedule[subkernel_end:]
+                if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            post_barrier = None
+
+        return (pre_barrier, post_barrier)
+
+    def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary):
+        """
+        Determine the amount of global storage needed for saving and
+        restoring the temporary across kernel calls, due to hardware-parallel
+        inames (the inferred hardware axes get prefixed to the temporary's
+        existing dimensions).
+
+        In the case of local temporaries, inames that are tagged
+        hw-local do not contribute to the global storage shape.
+        """
+        accessor_insn_ids = frozenset(
+            self.kernel.reader_map()[temporary.name]
+            | self.kernel.writer_map()[temporary.name])
+
+        group_tags = None
+        local_tags = None
+
+        def _sortedtags(tags):
+            return sorted(tags, key=lambda tag: tag.axis)
+
+        for insn_id in accessor_insn_ids:
+            insn = self.kernel.id_to_insn[insn_id]
+
+            my_group_tags = []
+            my_local_tags = []
+
+            for iname in insn.within_inames:
+                tag = self.kernel.iname_to_tag.get(iname)
+
+                if tag is None:
+                    continue
+
+                from loopy.kernel.data import (
+                    GroupIndexTag, LocalIndexTag, ParallelTag)
+
+                if isinstance(tag, GroupIndexTag):
+                    my_group_tags.append(tag)
+                elif isinstance(tag, LocalIndexTag):
+                    my_local_tags.append(tag)
+                elif isinstance(tag, ParallelTag):
+                    raise LoopyError(
+                        "iname '%s' is tagged with '%s' - only "
+                        "group and local tags are supported for "
+                        "auto save/reload of temporaries" %
+                        (iname, tag))
+
+            if group_tags is None:
+                group_tags = _sortedtags(my_group_tags)
+                local_tags = _sortedtags(my_local_tags)
+                group_tags_originating_insn_id = insn_id
+
+            if (
+                    group_tags != _sortedtags(my_group_tags)
+                    or local_tags != _sortedtags(my_local_tags)):
+                raise LoopyError(
+                    "inconsistent parallel tags across instructions that access "
+                    "'%s' (specifically, instruction '%s' has tags '%s' but "
+                    "instruction '%s' has tags '%s')"
+                    % (temporary.name,
+                       group_tags_originating_insn_id, group_tags + local_tags,
+                       insn_id, my_group_tags + my_local_tags))
+
+        if group_tags is None:
+            assert local_tags is None
+            return (), ()
+
+        group_sizes, local_sizes = (
+            self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids))
+
+        if temporary.scope == lp.temp_var_scope.LOCAL:
+            # Elide local axes in the save slot for local temporaries.
+            del local_tags[:]
+            local_sizes = ()
+
+        # We set hw_dims to be arranged according to the order:
+        #    g.0 < g.1 < ... < l.0 < l.1 < ...
+        return (group_sizes + local_sizes), tuple(group_tags + local_tags)
 
     @memoize_method
     def auto_promote_temporary(self, temporary_name):
@@ -255,52 +461,16 @@ class TemporarySaver(object):
             assert temporary.read_only
             return None
 
-        if temporary.base_storage is not None:
-            raise ValueError(
-                "Cannot promote temporaries with base_storage to global")
-
-        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
-        # is associated with. This is used for determining the shape of the
-        # global storage needed for saving and restoring the temporary across
-        # kernel calls.
-        #
-        # TODO: Make a policy decision about which dimensions to use. Currently,
-        # the code looks at each instruction that defines or uses the temporary,
-        # and takes the common set of hw-parallel tagged inames associated with
-        # these instructions.
-        #
-        # Furthermore, in the case of local temporaries, inames that are tagged
-        # hw-local do not contribute to the global storage shape.
-        hw_inames = self.insn_query.common_hw_inames(
-            self.insn_query.insns_reading_or_writing(temporary.name))
-
-        # We want hw_inames to be arranged according to the order:
-        #    g.0 < g.1 < ... < l.0 < l.1 < ...
-        # Sorting lexicographically accomplishes this.
-        hw_inames = sorted(hw_inames,
-            key=lambda iname: str(self.kernel.iname_to_tag[iname]))
-
-        # Calculate the sizes of the dimensions that get added in front for
-        # the global storage of the temporary.
-        hw_dims = []
-
-        backing_hw_inames = []
-
-        for iname in hw_inames:
-            tag = self.kernel.iname_to_tag[iname]
-            from loopy.kernel.data import LocalIndexTag
-            is_local_iname = isinstance(tag, LocalIndexTag)
-            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
-                # Restrict shape to that of group inames for locals.
-                continue
-            backing_hw_inames.append(iname)
-            from loopy.isl_helpers import static_max_of_pw_aff
-            from loopy.symbolic import aff_to_expr
-            hw_dims.append(
-                aff_to_expr(
-                    static_max_of_pw_aff(
-                        self.kernel.get_iname_bounds(iname).size, False)))
+        base_storage_conflict = (
+            self.base_storage_to_representative.get(
+                temporary.base_storage, temporary) is not temporary)
 
+        if base_storage_conflict:
+            raise NotImplementedError(
+                "tried to save/reload multiple temporaries with the "
+                "same base_storage; this is currently not supported")
+
+        hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary)
         non_hw_dims = temporary.shape
 
         if len(non_hw_dims) == 0 and len(hw_dims) == 0:
@@ -309,10 +479,14 @@ class TemporarySaver(object):
 
         backing_temporary = self.PromotedTemporary(
             name=self.var_name_gen(temporary.name + "_save_slot"),
-            orig_temporary=temporary,
-            hw_dims=tuple(hw_dims),
-            non_hw_dims=non_hw_dims,
-            hw_inames=backing_hw_inames)
+            orig_temporary_name=temporary.name,
+            hw_dims=hw_dims,
+            hw_tags=hw_tags,
+            non_hw_dims=non_hw_dims)
+
+        if temporary.base_storage is not None:
+            self.base_storage_to_representative[temporary.base_storage] = (
+                    backing_temporary)
 
         return backing_temporary
 
@@ -326,23 +500,16 @@ class TemporarySaver(object):
         if promoted_temporary is None:
             return
 
-        from loopy.kernel.tools import DomainChanger
-        dchg = DomainChanger(
-            self.kernel,
-            frozenset(
-                self.insn_query.inames_in_subkernel(subkernel) |
-                set(promoted_temporary.hw_inames)))
-
-        domain, hw_inames, dim_inames, iname_to_tag = \
+        new_subdomain, hw_inames, dim_inames, iname_to_tag = (
             self.augment_domain_for_save_or_reload(
-                dchg.domain, promoted_temporary, mode, subkernel)
+                self.new_subdomain, promoted_temporary, mode, subkernel))
 
-        self.kernel = dchg.get_kernel_with(domain)
+        self.new_subdomain = new_subdomain
 
         save_or_load_insn_id = self.insn_name_gen(
             "{name}.{mode}".format(name=temporary, mode=mode))
 
-        def subscript_or_var(agg, subscript=()):
+        def add_subscript_if_subscript_nonempty(agg, subscript=()):
             from pymbolic.primitives import Subscript, Variable
             if len(subscript) == 0:
                 return Variable(agg)
@@ -351,20 +518,22 @@ class TemporarySaver(object):
                     Variable(agg),
                     tuple(map(Variable, subscript)))
 
-        dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)]
+        orig_temporary = (
+            self.kernel.temporary_variables[
+                promoted_temporary.orig_temporary_name])
+        dim_inames_trunc = dim_inames[:len(orig_temporary.shape)]
 
         args = (
-            subscript_or_var(
-                temporary, dim_inames_trunc),
-            subscript_or_var(
-                promoted_temporary.name, hw_inames + dim_inames))
+            add_subscript_if_subscript_nonempty(
+                temporary, subscript=dim_inames_trunc),
+            add_subscript_if_subscript_nonempty(
+                promoted_temporary.name, subscript=hw_inames + dim_inames))
 
         if mode == "save":
             args = reversed(args)
 
-        accessing_insns_in_subkernel = (
-            self.insn_query.insns_reading_or_writing(temporary) &
-            self.insn_query.insns_in_subkernel(subkernel))
+        accessing_insns_in_subkernel = self.find_accessing_instructions_in_subkernel(
+                temporary, subkernel)
 
         if mode == "save":
             depends_on = accessing_insns_in_subkernel
@@ -373,7 +542,7 @@ class TemporarySaver(object):
             depends_on = frozenset()
             update_deps = accessing_insns_in_subkernel
 
-        pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel)
+        pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel)
 
         if pre_barrier is not None:
             depends_on |= set([pre_barrier])
@@ -387,16 +556,19 @@ class TemporarySaver(object):
             *args,
             id=save_or_load_insn_id,
             within_inames=(
-                self.insn_query.inames_in_subkernel(subkernel) |
-                frozenset(hw_inames + dim_inames)),
+                self.subkernel_to_surrounding_inames[subkernel]
+                | frozenset(hw_inames + dim_inames)),
             within_inames_is_final=True,
             depends_on=depends_on,
             boostable=False,
             boostable_into=frozenset())
 
-        if temporary not in self.saves_or_reloads_added:
-            self.saves_or_reloads_added[temporary] = set()
-        self.saves_or_reloads_added[temporary].add(save_or_load_insn_id)
+        if mode == "save":
+            self.temporary_to_save_ids[temporary].add(save_or_load_insn_id)
+        else:
+            self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id)
+
+        self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id)
 
         self.insns_to_insert.append(save_or_load_insn)
 
@@ -405,8 +577,8 @@ class TemporarySaver(object):
             self.insns_to_update[insn_id] = insn.copy(
                 depends_on=insn.depends_on | frozenset([save_or_load_insn_id]))
 
-        self.updated_temporary_variables[promoted_temporary.name] = \
-            promoted_temporary.as_variable()
+        self.updated_temporary_variables[promoted_temporary.name] = (
+            promoted_temporary.as_kernel_temporary(self.kernel))
 
         self.updated_iname_to_tag.update(iname_to_tag)
 
@@ -416,15 +588,6 @@ class TemporarySaver(object):
 
         insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert)
 
-        # Add global no_sync_with between any added reloads and saves
-        from six import iteritems
-        for temporary, added_insns in iteritems(self.saves_or_reloads_added):
-            for insn_id in added_insns:
-                insn = insns_to_insert[insn_id]
-                insns_to_insert[insn_id] = insn.copy(
-                    no_sync_with=frozenset(
-                        (added_insn, "global") for added_insn in added_insns))
-
         for orig_insn in self.kernel.instructions:
             if orig_insn.id in self.insns_to_update:
                 new_instructions.append(self.insns_to_update[orig_insn.id])
@@ -436,12 +599,31 @@ class TemporarySaver(object):
         self.updated_iname_to_tag.update(self.kernel.iname_to_tag)
         self.updated_temporary_variables.update(self.kernel.temporary_variables)
 
+        new_domains = list(self.kernel.domains)
+        import islpy as isl
+        if self.new_subdomain.dim(isl.dim_type.set) > 0:
+            new_domains.append(self.new_subdomain)
+
         kernel = self.kernel.copy(
+            domains=new_domains,
             instructions=new_instructions,
             iname_to_tag=self.updated_iname_to_tag,
             temporary_variables=self.updated_temporary_variables,
             overridden_get_grid_sizes_for_insn_ids=None)
 
+        # Add nosync directives to any saves or reloads that were added with a
+        # potential dependency chain.
+        from loopy.kernel.tools import get_subkernels
+        for subkernel in get_subkernels(kernel):
+            relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel]
+
+            from itertools import product
+            for temporary in self.temporary_to_reload_ids:
+                for source, sink in product(
+                        relevant_insns & self.temporary_to_reload_ids[temporary],
+                        relevant_insns & self.temporary_to_save_ids[temporary]):
+                    kernel = lp.add_nosync(kernel, "global", source, sink)
+
         from loopy.kernel.tools import assign_automatic_axes
         return assign_automatic_axes(kernel)
 
@@ -456,22 +638,28 @@ class TemporarySaver(object):
         """
         Add new axes to the domain corresponding to the dimensions of
         `promoted_temporary`. These axes will be used in the save/
-        reload stage.
+        reload stage. These get prefixed onto the already existing axes.
         """
         assert mode in ("save", "reload")
         import islpy as isl
 
-        orig_temporary = promoted_temporary.orig_temporary
+        orig_temporary = (
+                self.kernel.temporary_variables[
+                    promoted_temporary.orig_temporary_name])
         orig_dim = domain.dim(isl.dim_type.set)
 
         # Tags for newly added inames
         iname_to_tag = {}
 
+        from loopy.symbolic import aff_from_expr
+
         # FIXME: Restrict size of new inames to access footprint.
 
         # Add dimension-dependent inames.
         dim_inames = []
-        domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims))
+        domain = domain.add(isl.dim_type.set,
+                            len(promoted_temporary.non_hw_dims)
+                            + len(promoted_temporary.hw_dims))
 
         for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims):
             new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}".
@@ -493,25 +681,31 @@ class TemporarySaver(object):
             # Add size information.
             aff = isl.affs_from_space(domain.space)
             domain &= aff[0].le_set(aff[new_iname])
-            from loopy.symbolic import aff_from_expr
             domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size))
 
-        # FIXME: Use promoted_temporary.hw_inames
-        hw_inames = []
+        dim_offset = orig_dim + len(promoted_temporary.non_hw_dims)
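+        # The hw axes were appended after the non-hw axes above, hence this
+        # offset.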
 
-        # Add hardware inames duplicates.
-        for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
+        hw_inames = []
+        # Add hardware dims.
+        for hw_iname_idx, (hw_tag, dim) in enumerate(
+                zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)):
             new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}".
                 format(name=orig_temporary.name,
                        mode=mode,
-                       dim=t_idx,
+                       dim=hw_iname_idx,
                        sk=subkernel))
-            hw_inames.append(new_iname)
-            iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname]
+            domain = domain.set_dim_name(
+                isl.dim_type.set, dim_offset + hw_iname_idx, new_iname)
 
-        from loopy.isl_helpers import duplicate_axes
-        domain = duplicate_axes(
-            domain, promoted_temporary.hw_inames, hw_inames)
+            aff = isl.affs_from_space(domain.space)
+            domain = (domain
+                & aff[0].le_set(aff[new_iname])
+                & aff[new_iname].lt_set(aff_from_expr(domain.space, dim)))
+
+            self.updated_iname_to_tag[new_iname] = hw_tag
+            hw_inames.append(new_iname)
 
         # The operations on the domain above return a Set object, but the
         # underlying domain should be expressible as a single BasicSet.
@@ -551,7 +745,8 @@ def save_and_reload_temporaries(knl):
     liveness = LivenessAnalysis(knl)
     saver = TemporarySaver(knl)
 
-    insn_query = InstructionQuery(knl)
+    from loopy.schedule.tools import (
+        temporaries_read_in_subkernel, temporaries_written_in_subkernel)
 
     for sched_idx, sched_item in enumerate(knl.schedule):
 
@@ -562,9 +757,10 @@ def save_and_reload_temporaries(knl):
                 # Kernel entry: nothing live
                 interesting_temporaries = set()
             else:
+                subkernel = sched_item.kernel_name
                 interesting_temporaries = (
-                    insn_query.temporaries_read_or_written_in_subkernel(
-                        sched_item.kernel_name))
+                    temporaries_read_in_subkernel(knl, subkernel)
+                    | temporaries_written_in_subkernel(knl, subkernel))
 
             for temporary in liveness[sched_idx].live_out & interesting_temporaries:
                 logger.info("reloading {0} at entry of {1}"
@@ -576,9 +772,9 @@ def save_and_reload_temporaries(knl):
                 # Kernel exit: nothing live
                 interesting_temporaries = set()
             else:
+                subkernel = sched_item.kernel_name
                 interesting_temporaries = (
-                    insn_query.temporaries_written_in_subkernel(
-                        sched_item.kernel_name))
+                    temporaries_written_in_subkernel(knl, subkernel))
 
             for temporary in liveness[sched_idx].live_in & interesting_temporaries:
                 logger.info("saving {0} before return of {1}"
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index b8b0cbcbf1236cdf712da998922ac238261a3e6e..78d817ce73724d90a6cc6f380b24290971f6c1e7 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -332,7 +332,20 @@ class TypeInferenceMapper(CombineMapper):
         if not agg_result:
             return agg_result
 
-        field = agg_result[0].numpy_dtype.fields[expr.name]
+        numpy_dtype = agg_result[0].numpy_dtype
+        fields = numpy_dtype.fields
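+        # Non-struct dtypes have no fields, so attribute lookup cannot
+        # succeed on them.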
+        if fields is None:
+            raise LoopyError("cannot look up attribute '%s' in "
+                    "non-aggregate expression '%s'"
+                    % (expr.name, expr.aggregate))
+
+        try:
+            field = fields[expr.name]
+        except KeyError:
+            raise LoopyError("cannot look up attribute '%s' in "
+                    "aggregate expression '%s' of dtype '%s'"
+                    % (expr.name, expr.aggregate, numpy_dtype))
+
         dtype = field[0]
         return [NumpyType(dtype)]
 
diff --git a/loopy/version.py b/loopy/version.py
index 77d0e21bdd2ef5383c5f874656c25fe1ede21a70..02244f55d0dbf207a4641c3ebf6cc33b536f0421 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index a941eecd2b58daf413830fc22500179d3e8a8cf1..67d943af3be4446834bf7262a91b8596b601ca85 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2016.2.6",
+          "pytools>=2017.3",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c9328a252b708770e6f83590275be92e0c5d8b05..2ac1026c0660573d97cd2f65c0502ec69b63803d 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1146,7 +1146,7 @@ def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
         1/0
 
     _, (out,) = knl(queue, out_host=True)
-    assert (out == out_expect).all()
+    assert (out == out_expect).all(), (out, out_expect)
 
 
 @pytest.mark.parametrize("hw_loop", [True, False])
@@ -1338,6 +1338,73 @@ def test_save_local_multidim_array(ctx_factory, debug=False):
     save_and_reload_temporaries_test(queue, knl, 1, debug)
 
 
+def test_save_with_base_storage(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            """
+            <>a[i] = 0
+            <>b[i] = i
+            ... gbarrier
+            out[i] = a[i]
+            """,
+            "...",
+            seq_dependencies=True)
+
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.set_temporary_scope(knl, "a", "local")
+    knl = lp.set_temporary_scope(knl, "b", "local")
+
+    knl = lp.alias_temporaries(knl, ["a", "b"],
+            synchronize_for_exclusive_use=False)
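+    # a and b alias a single block of base storage, so the later write to b
+    # also determines a's contents; out should read back i.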
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
+
+
+def test_save_ambiguous_storage_requirements():
+    knl = lp.make_kernel(
+            "{[i,j]: 0 <= i < 10 and 0 <= j < 10}",
+            """
+            <>a[j] = j
+            ... gbarrier
+            out[i,j] = a[j]
+            """,
+            seq_dependencies=True)
+
+    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
+    knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"})
+    knl = lp.set_temporary_scope(knl, "a", "local")
+
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+
+    from loopy.diagnostic import LoopyError
+    with pytest.raises(LoopyError):
+        lp.save_and_reload_temporaries(knl)
+
+
+def test_save_across_inames_with_same_tag(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            """
+            <>a[i] = i
+            ... gbarrier
+            out[i] = a[i]
+            """,
+            "...",
+            seq_dependencies=True)
+
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"})
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
+
+
 def test_missing_temporary_definition_detection():
     knl = lp.make_kernel(
             "{ [i]: 0<=i<10 }",
@@ -2117,6 +2184,41 @@ def test_barrier_insertion_near_bottom_of_loop():
     assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
+def test_multi_argument_reduction_type_inference():
+    from loopy.type_inference import TypeInferenceMapper
+    from loopy.library.reduction import SegmentedSumReductionOperation
+    from loopy.types import to_loopy_type
+    op = SegmentedSumReductionOperation()
+
+    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
+
+    int32 = to_loopy_type(np.int32)
+
+    expr = lp.symbolic.Reduction(
+            operation=op,
+            inames=("i",),
+            expr=lp.symbolic.Reduction(
+                operation=op,
+                inames="j",
+                expr=(1, 2),
+                allow_simultaneous=True),
+            allow_simultaneous=True)
+
+    t_inf_mapper = TypeInferenceMapper(knl)
+
+    assert (
+            t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
+            == [(int32, int32)])
+
+
+def test_multi_argument_reduction_parsing():
+    from loopy.symbolic import parse, Reduction
+
+    assert isinstance(
+            parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr,
+            Reduction)
+
+
 def test_global_barrier_order_finding():
     knl = lp.make_kernel(
             "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
@@ -2161,41 +2263,6 @@ def test_global_barrier_error_if_unordered():
         lp.get_global_barrier_order(knl)
 
 
-def test_multi_argument_reduction_type_inference():
-    from loopy.type_inference import TypeInferenceMapper
-    from loopy.library.reduction import SegmentedSumReductionOperation
-    from loopy.types import to_loopy_type
-    op = SegmentedSumReductionOperation()
-
-    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
-
-    int32 = to_loopy_type(np.int32)
-
-    expr = lp.symbolic.Reduction(
-            operation=op,
-            inames=("i",),
-            expr=lp.symbolic.Reduction(
-                operation=op,
-                inames="j",
-                expr=(1, 2),
-                allow_simultaneous=True),
-            allow_simultaneous=True)
-
-    t_inf_mapper = TypeInferenceMapper(knl)
-
-    assert (
-            t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
-            == [(int32, int32)])
-
-
-def test_multi_argument_reduction_parsing():
-    from loopy.symbolic import parse, Reduction
-
-    assert isinstance(
-            parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr,
-            Reduction)
-
-
 def test_struct_assignment(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -2231,6 +2298,43 @@ def test_struct_assignment(ctx_factory):
     knl(queue, N=200)
 
 
+def test_inames_conditional_generation(ctx_factory):
+    ctx = ctx_factory()
+    knl = lp.make_kernel(
+            "{[i,j,k]: 0 < k < i and 0 < j < 10 and 0 < i < 10}",
+            """
+            for k
+                ... gbarrier
+                <>tmp1 = 0
+            end
+            for j
+                ... gbarrier
+                <>tmp2 = i
+            end
+            """,
+            "...",
+            seq_dependencies=True)
+
+    knl = lp.tag_inames(knl, dict(i="g.0"))
+
+    with cl.CommandQueue(ctx) as queue:
+        knl(queue)
+
+
+def test_kernel_var_name_generator():
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i <= 10}",
+            """
+            <>a = 0
+            <>b_s0 = 0
+            """)
+
+    vng = knl.get_var_name_generator()
+
+    assert vng("a_s0") != "a_s0"
+    assert vng("b") != "b"
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_scan.py b/test/test_scan.py
new file mode 100644
index 0000000000000000000000000000000000000000..08754819c9a156403aba689cb3e9c238144e7905
--- /dev/null
+++ b/test/test_scan.py
@@ -0,0 +1,432 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2016, 2017 Matt Wala
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+# More things to test.
+# - scan(a) + scan(b)
+# - test for badly tagged inames
+
+@pytest.mark.parametrize("n", [1, 2, 3, 16])
+@pytest.mark.parametrize("stride", [1, 2])
+def test_sequential_scan(ctx_factory, n, stride):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    knl = lp.fix_parameters(knl, n=n)
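+    # The domain 0 <= j <= stride*i is triangular in (i, j), which is what
+    # permits realizing the reduction as a sequential scan.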
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    evt, (a,) = knl(queue)
+
+    assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all()
+
+
+@pytest.mark.parametrize("sweep_lbound, scan_lbound", [
+    (4, 0),
+    (3, 1),
+    (2, 2),
+    (1, 3),
+    (0, 4),
+    (5, -1),
+    ])
+def test_scan_with_different_lower_bound_from_sweep(
+        ctx_factory, sweep_lbound, scan_lbound):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n, sweep_lbound, scan_lbound] -> "
+        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
+        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
+        """
+        out[i-sweep_lbound] = sum(j, j**2)
+        """
+        )
+
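+    # The sweep iname i and the scan iname j have unrelated lower bounds;
+    # realize_reduction has to translate between the two index origins.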
+    n = 10
+
+    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (out,) = knl(queue, n=n)
+
+    assert (out.get()
+            == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()
+
+
+def test_automatic_scan_detection():
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0<=i<n}",
+            "{[j]: 0<=j<=2*i}"
+        ],
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    cgr = lp.generate_code_v2(knl)
+    assert "scan" not in cgr.device_code()
+
+
+def test_selective_scan_realization():
+    pass
+
+
+def test_force_outer_iname_for_scan():
+    knl = lp.make_kernel(
+        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
+        "out[i] = product(j, a[j]) {inames=i:k}")
+
+    knl = lp.add_dtypes(knl, dict(a=np.float32))
+
+    # TODO: Maybe this deserves to work?
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        lp.realize_reduction(knl, force_scan=True)
+
+    knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i")
+
+
+def test_dependent_domain_scan(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0<=i<n}",
+            "{[j]: 0<=j<=2*i}"
+        ],
+        """
+        a[i] = sum(j, j**2) {id=scan}
+        """
+        )
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (a,) = knl(queue, n=100)
+
+    assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()
+
+
+@pytest.mark.parametrize("i_tag, j_tag", [
+    ("for", "for")
+    ])
+def test_nested_scan(ctx_factory, i_tag, j_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0 <= i < n}",
+            "[i] -> {[j]: 0 <= j <= i}",
+            "[i] -> {[k]: 0 <= k <= i}"
+        ],
+        """
+        <>tmp[i] = sum(k, 1)
+        out[i] = sum(j, tmp[j])
+        """)
+
+    knl = lp.fix_parameters(knl, n=10)
+    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))
+
+    knl = lp.realize_reduction(knl, force_scan=True)
+
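+    # For now this is a smoke test: the nested scan is realized and run,
+    # but the output is not checked.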
+    print(knl)
+
+    evt, (out,) = knl(queue)
+
+    print(out)
+
+
+def test_scan_not_triangular():
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<100 and 1<=j<=2*i}",
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        knl = lp.realize_reduction(knl, force_scan=True)
+
+
+@pytest.mark.parametrize("n", [1, 2, 3, 16, 17])
+def test_local_parallel_scan(ctx_factory, n):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
+        """
+        out[i] = sum(j, a[j]**2)
+        """,
+        "..."
+        )
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.realize_reduction(knl, force_scan=True)
+
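+    # A second realize_reduction pass converts any reductions that remain
+    # after the scan realization above.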
+    knl = lp.realize_reduction(knl)
+
+    knl = lp.add_dtypes(knl, dict(a=int))
+
+    print(knl)
+
+    evt, (out,) = knl(queue, a=np.arange(n))
+    assert (out == np.cumsum(np.arange(n)**2)).all()
+
+
+def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
+        """
+        out[i-1] = sum(j, a[j]**2)
+        """,
+        "..."
+        )
+
+    knl = lp.fix_parameters(knl, n=16)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    knl = lp.realize_reduction(knl)
+
+    knl = lp.add_dtypes(knl, dict(a=int))
+    evt, (out,) = knl(queue, a=np.arange(1, 17))
+
+    assert (out == np.cumsum(np.arange(1, 17)**2)).all()
+
+
+def test_scan_extra_constraints_on_domain():
+    knl = lp.make_kernel(
+        "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}",
+        "out[i] = sum(j, a[j])")
+
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        knl = lp.realize_reduction(
+                knl, force_scan=True, force_outer_iname_for_scan="i")
+
+
+@pytest.mark.parametrize("sweep_iname_tag", ["for", "l.1"])
+def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "{[k]: 0<=k<=1}",
+            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
+        ],
+        "out[k,i] = k + sum(j, j**2)"
+        )
+
+    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
+    n = 10
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    evt, (out,) = knl(queue)
+
+    inner = np.cumsum(np.arange(n)**2)
+
+    assert (out.get() == np.array([inner, 1 + inner])).all()
+
+
+@pytest.mark.parametrize("dtype", [
+    np.int32, np.int64, np.float32, np.float64])
+def test_scan_data_types(ctx_factory, dtype):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<=i }",
+            "res[i] = reduce(sum, j, a[j])",
+            assumptions="n>=1")
+
+    a = np.random.randn(20).astype(dtype)
+    knl = lp.add_dtypes(knl, dict(a=dtype))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (res,) = knl(queue, a=a)
+
+    assert np.allclose(res, np.cumsum(a))
+
+
+@pytest.mark.parametrize(("op_name", "np_op"), [
+    ("sum", np.sum),
+    ("product", np.prod),
+    ("min", np.min),
+    ("max", np.max),
+    ])
+def test_scan_library(ctx_factory, op_name, np_op):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<=i }",
+            "res[i] = reduce(%s, j, a[j])" % op_name,
+            assumptions="n>=1")
+
+    a = np.random.randn(20)
+    knl = lp.add_dtypes(knl, dict(a=np.float64))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (res,) = knl(queue, a=a)
+
+    assert np.allclose(res, np.array(
+            [np_op(a[:i+1]) for i in range(len(a))]))
+
+
+def test_scan_unsupported_tags():
+    pass
+
+
+@pytest.mark.parametrize("i_tag", ["for", "l.0"])
+def test_argmax(ctx_factory, i_tag):
+    logging.basicConfig(level=logging.INFO)
+
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 128
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<%d and 0<=j<=i}" % n,
+            """
+            max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j)
+            """)
+
+    knl = lp.tag_inames(knl, dict(i=i_tag))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    a = np.random.randn(n).astype(dtype)
+    evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)
+
+    assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all()
+    assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all()
+
+
+def check_segmented_scan_output(arr, segment_boundaries_indices, out):
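+    # Split both input and output at the segment boundaries and compare each
+    # output segment against an independently computed cumulative sum.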
+    class SegmentGrouper(object):
+
+        def __init__(self):
+            self.seg_idx = 0
+            self.idx = 0
+
+        def __call__(self, key):
+            if self.idx in segment_boundaries_indices:
+                self.seg_idx += 1
+            self.idx += 1
+            return self.seg_idx
+
+    from itertools import groupby
+
+    expected = [np.cumsum(list(group))
+            for _, group in groupby(arr, SegmentGrouper())]
+    actual = [np.array(list(group))
+            for _, group in groupby(out, SegmentGrouper())]
+
+    assert len(expected) == len(actual) == len(segment_boundaries_indices)
+    assert all((e == a).all() for e, a in zip(expected, actual))
+
+
+@pytest.mark.parametrize("n, segment_boundaries_indices", [
+    (1, (0,)),
+    (2, (0,)),
+    (2, (0, 1)),
+    (3, (0,)),
+    (3, (0, 1)),
+    (3, (0, 2)),
+    (3, (0, 1, 2)),
+    (16, (0, 4, 8, 12))])
+@pytest.mark.parametrize("iname_tag", ("for", "l.0"))
+def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    arr = np.ones(n, dtype=np.float32)
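+    # A 1 in segflag marks the first element of a segment; the segmented
+    # sum restarts at each such position.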
+    segment_boundaries = np.zeros(n, dtype=np.int32)
+    segment_boundaries[(segment_boundaries_indices,)] = 1
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<=i}",
+        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
+        [
+            lp.GlobalArg("arr", np.float32, shape=("n",)),
+            lp.GlobalArg("segflag", np.int32, shape=("n",)),
+            "..."
+        ])
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.tag_inames(knl, dict(i=iname_tag))
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)
+
+    check_segmented_scan_output(arr, segment_boundaries_indices, out)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5e363f13594ee8e4cf170faa232b0783cca9d018..a72b62af90050008f837e144f1f28d4a4de1c730 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -49,7 +49,7 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
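+    # count_redundant_work=True counts work duplicated across parallel work
+    # items instead of counting it only once.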
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -74,7 +74,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -100,7 +100,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -130,7 +130,7 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -166,7 +166,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -205,7 +205,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
+    op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -229,7 +229,7 @@ def test_mem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -269,7 +269,7 @@ def test_mem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -307,7 +307,7 @@ def test_mem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -343,7 +343,7 @@ def test_mem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -437,11 +437,11 @@ def test_mem_access_counter_mixed():
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                 x=np.float32))
-    threads = 16
-    knl = lp.split_iname(knl, "j", threads)
+    bsize = 16
+    knl = lp.split_iname(knl, "j", bsize)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -463,8 +463,8 @@ def test_mem_access_counter_mixed():
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                             ].eval_with_dict(params)
-    assert f64uniform == 2*n*m
-    assert f32uniform == n*m*l/threads
+    assert f64uniform == 2*n*m*l/bsize
+    assert f32uniform == n*m*l/bsize
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
@@ -474,7 +474,7 @@ def test_mem_access_counter_mixed():
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                            ].eval_with_dict(params)
-    assert f64uniform == n*m
+    assert f64uniform == n*m*l/bsize
     assert f32nonconsec == n*m*l
 
 
@@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -545,7 +545,7 @@ def test_mem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -563,7 +563,7 @@ def test_mem_access_counter_consec():
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                          ].eval_with_dict(params)
-    assert f64consec == 2*n*m
+    assert f64consec == 2*n*m*l
     assert f32consec == 3*n*m*l
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
@@ -572,7 +572,7 @@ def test_mem_access_counter_consec():
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                         ].eval_with_dict(params)
-    assert f64consec == n*m
+    assert f64consec == n*m*l
     assert f32consec == n*m*l
 
 
@@ -628,6 +628,7 @@ def test_barrier_counter_barriers():
 
 def test_all_counters_parallel_matmul():
 
+    bsize = 16
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
@@ -635,9 +636,9 @@ def test_all_counters_parallel_matmul():
             ],
             name="matmul", assumptions="n,m,l >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
-    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", bsize)
     knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
@@ -649,9 +650,9 @@ def test_all_counters_parallel_matmul():
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
-    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     f32mul = op_map[
                         lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
@@ -667,16 +668,17 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    op_map = lp.get_mem_access_map(knl)
+    op_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
-    f32coal = op_map[lp.MemAccess('global', np.float32,
+    f32s1lb = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
-    f32coal += op_map[lp.MemAccess('global', np.float32,
-                      stride=1, direction='load', variable='a')
-                      ].eval_with_dict(params)
+    f32s1la = op_map[lp.MemAccess('global', np.float32,
+                     stride=1, direction='load', variable='a')
+                     ].eval_with_dict(params)
 
-    assert f32coal == n*m+m*l
+    assert f32s1lb == n*m*l/bsize
+    assert f32s1la == n*m*l/bsize
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
@@ -684,7 +686,8 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
+    local_mem_map = lp.get_mem_access_map(knl,
+                        count_redundant_work=True).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load')
                                 ].eval_with_dict(params)
@@ -742,7 +745,7 @@ def test_summations_and_filters():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a']
                                 ).eval_and_sum(params)
@@ -768,7 +771,7 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)