diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2..92ec799f7045cf63dc75d1386d8a51fd7d42954c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution.
 
 By default, :mod:`loopy` inserts local barriers between two instructions when it
 detects that a dependency involving local memory may occur across work items. To
-see this in action, take a look at the section on :ref:`local_temporaries`. 
+see this in action, take a look at the section on :ref:`local_temporaries`.
 
 In contrast, :mod:`loopy` will *not* insert global barriers automatically.
 Global barriers require manual intervention along with some special
@@ -1308,7 +1308,7 @@ tagged, as in the following example::
             assumptions="n>0")
 
 .. [#global-barrier-note] In particular, this is *not* the same as a call to
- ``barrier(CLK_GLOBAL_MEM_FENCE)``. 
+ ``barrier(CLK_GLOBAL_MEM_FENCE)``.
 
 .. }}}
 
@@ -1533,12 +1533,12 @@ information provided. Now we will count the operations:
 
     >>> op_map = lp.get_op_map(knl)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1596,9 +1596,9 @@ together into keys containing only the specified fields:
 
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(lp.stringify_stats_mapping(op_map_dtype))
-    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                           ].eval_with_dict(param_dict)
@@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
-    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 6a4d559758bc1d7ca52e9dc4da1b7e503e22cc29..56ed87176f891d362ac0555024ef0d8098cd843e 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -518,9 +518,13 @@ def auto_test_vs_ref(
 
     args = None
     from loopy.kernel import kernel_state
+    from loopy.target.pyopencl import PyOpenCLTarget
     if test_knl.state not in [
             kernel_state.PREPROCESSED,
             kernel_state.SCHEDULED]:
+        if isinstance(test_knl.target, PyOpenCLTarget):
+            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
+
         test_knl = lp.preprocess_kernel(test_knl)
 
     if not test_knl.schedule:
diff --git a/loopy/check.py b/loopy/check.py
index 54ab043d6a38f36852920eb3008d26e28b5cedfb..e72f9e3e6c4db797220729a5f282d4944b31d6ac 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -354,7 +354,7 @@ def check_has_schedulable_iname_nesting(kernel):
 
 def pre_schedule_checks(kernel):
     try:
-        logger.info("%s: pre-schedule check: start" % kernel.name)
+        logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_orphaned_user_hardware_axes(kernel)
         check_for_double_use_of_hw_axes(kernel)
@@ -367,7 +367,7 @@ def pre_schedule_checks(kernel):
         check_write_destinations(kernel)
         check_has_schedulable_iname_nesting(kernel)
 
-        logger.info("%s: pre-schedule check: done" % kernel.name)
+        logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
         raise
     except:
@@ -618,7 +618,7 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 
 def pre_codegen_checks(kernel):
     try:
-        logger.info("pre-codegen check %s: start" % kernel.name)
+        logger.debug("pre-codegen check %s: start" % kernel.name)
 
         check_for_unused_hw_axes_in_insns(kernel)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
@@ -627,7 +627,7 @@ def pre_codegen_checks(kernel):
         kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
 
-        logger.info("pre-codegen check %s: done" % kernel.name)
+        logger.debug("pre-codegen check %s: done" % kernel.name)
     except:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError):
 class UnscheduledInstructionError(LoopyError):
     pass
 
+
+class ReductionIsNotTriangularError(LoopyError):
+    pass
+
 # }}}
 
 
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 36fbb49f4bb77c959877fb0bd21e1de6fb49c74b..5f0884fd44ed5064f3f195d103b164f2163d1d19 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -598,37 +598,63 @@ def get_simple_strides(bset, key_by="name"):
     assert len(comp_div_set_pieces) == 1
     bset, = comp_div_set_pieces
 
-    lspace = bset.get_local_space()
-    for idiv in range(lspace.dim(dim_type.div)):
-        div = lspace.get_div(idiv)
+    def _get_indices_and_coeffs(obj, dts):
+        result = []
+        for dt in dts:
+            for dim_idx in range(obj.dim(dt)):
+                coeff_val = obj.get_coefficient_val(dt, dim_idx)
+                if not coeff_val.is_zero():
+                    result.append((dt, dim_idx, coeff_val))
 
-        # check for sub-divs
-        supported = True
-        for dim_idx in range(div.dim(dim_type.div)):
-            coeff_val = div.get_coefficient_val(dim_type.div, dim_idx)
-            if not coeff_val.is_zero():
-                # sub-divs not supported
-                supported = False
-                break
+        return result
+
+    for cns in bset.get_constraints():
+        if not cns.is_equality():
+            continue
+        aff = cns.get_aff()
 
-        if not supported:
+        # recognizes constraints of the form
+        #  -i0 + 2*floor((i0)/2) == 0
+
+        if aff.dim(dim_type.div) != 1:
+            continue
+
+        idiv = 0
+        div = aff.get_div(idiv)
+
+        # check for sub-divs
+        if _get_indices_and_coeffs(div, [dim_type.div]):
+            # found one -> not supported
             continue
 
         denom = div.get_denominator_val().to_python()
 
-        inames_and_coeffs = []
-        for dt in [dim_type.param, dim_type.in_]:
-            for dim_idx in range(div.dim(dt)):
-                coeff_val = div.get_coefficient_val(dt, dim_idx) * denom
-                if not coeff_val.is_zero():
-                    inames_and_coeffs.append((dt, dim_idx, coeff_val))
+        # if the coefficient in front of the div is not the same as the denominator
+        if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                div, [dim_type.param, dim_type.in_])
 
         if len(inames_and_coeffs) != 1:
             continue
 
         (dt, dim_idx, coeff), = inames_and_coeffs
 
-        if coeff != 1:
+        if not (coeff * denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                aff, [dim_type.param, dim_type.in_])
+
+        if len(inames_and_coeffs) != 1:
+            continue
+
+        (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs
+        if (not outer_coeff.neg().is_one()
+                or (outer_dt, outer_dim_idx) != (dt, dim_idx)):
             # not supported
             continue
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index dccaca2ec104a4749289f7cd89c491292f618e3d..622f5e49be1e40b4156113d92907fe8b1d9fb859 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1111,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return embedding
 
-    def stringify(self, what=None, with_dependencies=False):
+    def stringify(self, what=None, with_dependencies=False, use_separators=True,
+            show_labels=True):
         all_what = set([
             "name",
             "arguments",
@@ -1150,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         kernel = self
 
-        sep = 75*"-"
+        if use_separators:
+            sep = [75*"-"]
+        else:
+            sep = []
 
         def natorder(key):
             # Return natural ordering for strings, as opposed to dictionary order.
@@ -1167,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             return sorted(seq, key=lambda y: natorder(key(y)))
 
         if "name" in what:
-            lines.append(sep)
+            lines.extend(sep)
             lines.append("KERNEL: " + kernel.name)
 
         if "arguments" in what:
-            lines.append(sep)
-            lines.append("ARGUMENTS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("ARGUMENTS:")
             for arg_name in natsorted(kernel.arg_dict):
                 lines.append(str(kernel.arg_dict[arg_name]))
 
         if "domains" in what:
-            lines.append(sep)
-            lines.append("DOMAINS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DOMAINS:")
             for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()):
                 lines.append(len(parents)*"  " + str(dom))
 
         if "tags" in what:
-            lines.append(sep)
-            lines.append("INAME IMPLEMENTATION TAGS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INAME IMPLEMENTATION TAGS:")
             for iname in natsorted(kernel.all_inames()):
                 line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
-            lines.append(sep)
-            lines.append("TEMPORARIES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("TEMPORARIES:")
             for tv in natsorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
         if "rules" in what and kernel.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SUBSTIUTION RULES:")
             for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
                 lines.append(str(kernel.substitutions[rule_name]))
 
         if "instructions" in what:
-            lines.append(sep)
-            lines.append("INSTRUCTIONS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INSTRUCTIONS:")
             loop_list_width = 35
 
             # {{{ topological sort
@@ -1319,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on)))
 
         if "Dependencies" in what and dep_lines:
-            lines.append(sep)
-            lines.append("DEPENDENCIES: "
-                    "(use loopy.show_dependency_graph to visualize)")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DEPENDENCIES: "
+                        "(use loopy.show_dependency_graph to visualize)")
             lines.extend(dep_lines)
 
         if "schedule" in what and kernel.schedule is not None:
-            lines.append(sep)
-            lines.append("SCHEDULE:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SCHEDULE:")
             from loopy.schedule import dump_schedule
             lines.append(dump_schedule(kernel, kernel.schedule))
 
-        lines.append(sep)
+        lines.extend(sep)
 
         return "\n".join(lines)
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0d22dbb88ed99c7c92480d1d39b924cc2198cc3f..08268ca9f27623a6d17a195d3c04acb55e5ec68a 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase):
 
                 z[i] = z[i+1-1] + a {atomic}
 
-        :mod:`loopy` may to evaluate the right-hand side *multiple times*
+        :mod:`loopy` may choose to evaluate the right-hand side *multiple times*
         as part of a single assignment. It is up to the user to ensure that
         this retains correct semantics.
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 38499cb91c9b9b677aa5b65ebe6d18d6f1983559..de7f2b593db6d376338aea40171a99fd18778b1e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -276,7 +276,425 @@ def find_temporary_scope(kernel):
 # {{{ rewrite reduction to imperative form
 
 
-# {{{ reduction utils
+# {{{ utils (not stateful)
+
+from collections import namedtuple
+
+
+_InameClassification = namedtuple("_InameClassifiction",
+                                  "sequential, local_parallel, nonlocal_parallel")
+
+
+def _classify_reduction_inames(kernel, inames):
+    sequential = []
+    local_par = []
+    nonlocal_par = []
+
+    from loopy.kernel.data import (
+            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+            ParallelTag)
+
+    for iname in inames:
+        iname_tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
+            # These are nominally parallel, but we can live with
+            # them as sequential.
+            sequential.append(iname)
+
+        elif isinstance(iname_tag, LocalIndexTagBase):
+            local_par.append(iname)
+
+        elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+            nonlocal_par.append(iname)
+
+        else:
+            sequential.append(iname)
+
+    return _InameClassification(
+            tuple(sequential), tuple(local_par), tuple(nonlocal_par))
+
+
+def _add_params_to_domain(domain, param_names):
+    dim_type = isl.dim_type
+    nparams_orig = domain.dim(dim_type.param)
+    domain = domain.add_dims(dim_type.param, len(param_names))
+
+    for param_idx, param_name in enumerate(param_names):
+        domain = domain.set_dim_name(
+                dim_type.param, param_idx + nparams_orig, param_name)
+
+    return domain
+
+
+def _move_set_to_param_dims_except(domain, except_dims):
+    dim_type = isl.dim_type
+
+    iname_idx = 0
+    for iname in domain.get_var_names(dim_type.set):
+        if iname not in except_dims:
+            domain = domain.move_dims(
+                    dim_type.param, 0,
+                    dim_type.set, iname_idx, 1)
+            iname_idx -= 1
+        iname_idx += 1
+
+    return domain
+
+
+def _domain_depends_on_given_set_dims(domain, set_dim_names):
+    set_dim_names = frozenset(set_dim_names)
+
+    return any(
+            set_dim_names & set(constr.get_coefficients_by_name())
+            for constr in domain.get_constraints())
+
+
+def _check_reduction_is_triangular(kernel, expr, scan_param):
+    """Check whether the reduction within `expr` with scan parameters described by
+    the structure `scan_param` is triangular. This attempts to verify that the
+    domain for the scan and sweep inames is as follows:
+
+    [params] -> {
+        [other inames..., scan_iname, sweep_iname]:
+            (sweep_min_value
+                <= sweep_iname
+                <= sweep_max_value)
+            and
+            (scan_min_value
+                <= scan_iname
+                <= stride * (sweep_iname - sweep_min_value) + scan_min_value)
+            and
+            (irrelevant constraints)
+    }
+    """
+
+    orig_domain = kernel.get_inames_domain(
+            frozenset((scan_param.sweep_iname, scan_param.scan_iname)))
+
+    sweep_iname = scan_param.sweep_iname
+    scan_iname = scan_param.scan_iname
+    affs = isl.affs_from_space(orig_domain.space)
+
+    sweep_lower_bound = isl.align_spaces(
+            scan_param.sweep_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    sweep_upper_bound = isl.align_spaces(
+            scan_param.sweep_upper_bound,
+            affs[0],
+            across_dim_types=True)
+
+    scan_lower_bound = isl.align_spaces(
+            scan_param.scan_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    from itertools import product
+
+    for (sweep_lb_domain, sweep_lb_aff), \
+        (sweep_ub_domain, sweep_ub_aff), \
+        (scan_lb_domain, scan_lb_aff) in \
+            product(sweep_lower_bound.get_pieces(),
+                    sweep_upper_bound.get_pieces(),
+                    scan_lower_bound.get_pieces()):
+
+        # Assumptions inherited from the domains of the pwaffs
+        assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain
+
+        # Sweep iname constraints
+        hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff)
+        hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff)
+
+        # Scan iname constraints
+        hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff)
+        hyp_domain &= affs[scan_iname].le_set(
+                scan_param.stride * (affs[sweep_iname] - sweep_lb_aff)
+                + scan_lb_aff)
+
+        hyp_domain, = (hyp_domain & assumptions).get_basic_sets()
+        test_domain, = (orig_domain & assumptions).get_basic_sets()
+
+        hyp_gist_against_test = hyp_domain.gist(test_domain)
+        if _domain_depends_on_given_set_dims(hyp_gist_against_test,
+                (sweep_iname, scan_iname)):
+            return False, (
+                    "gist of hypothesis against test domain "
+                    "has sweep or scan dependent constraints: '%s'"
+                    % hyp_gist_against_test)
+
+        test_gist_against_hyp = test_domain.gist(hyp_domain)
+        if _domain_depends_on_given_set_dims(test_gist_against_hyp,
+                (sweep_iname, scan_iname)):
+            return False, (
+                   "gist of test against hypothesis domain "
+                   "has sweep or scan dependent constraint: '%s'"
+                   % test_gist_against_hyp)
+
+    return True, "ok"
+
+
+_ScanCandidateParameters = namedtuple(
+        "_ScanCandidateParameters",
+        "sweep_iname, scan_iname, sweep_lower_bound, "
+        "sweep_upper_bound, scan_lower_bound, stride")
+
+
+def _try_infer_scan_candidate_from_expr(
+        kernel, expr, within_inames, sweep_iname=None):
+    """Analyze `expr` and determine if it can be implemented as a scan.
+    """
+    from loopy.symbolic import Reduction
+    assert isinstance(expr, Reduction)
+
+    if len(expr.inames) != 1:
+        raise ValueError(
+                "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),))
+
+    scan_iname, = expr.inames
+
+    from loopy.kernel.tools import DomainChanger
+    dchg = DomainChanger(kernel, (scan_iname,))
+    domain = dchg.get_original_domain()
+
+    if sweep_iname is None:
+        try:
+            sweep_iname = _try_infer_sweep_iname(
+                    domain, scan_iname, kernel.all_inames())
+        except ValueError as v:
+            raise ValueError(
+                    "Couldn't determine a sweep iname for the scan "
+                    "expression '%s': %s" % (expr, v))
+
+    try:
+        sweep_lower_bound, sweep_upper_bound, scan_lower_bound = (
+                _try_infer_scan_and_sweep_bounds(
+                    kernel, scan_iname, sweep_iname, within_inames))
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine bounds for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    try:
+        stride = _try_infer_scan_stride(
+                kernel, scan_iname, sweep_iname, sweep_lower_bound)
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine a scan stride for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
+            sweep_upper_bound, scan_lower_bound, stride)
+
+
+def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
+    """The sweep iname is the outer iname which guides the scan.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname.
+    """
+    constrs = domain.get_constraints()
+    sweep_iname_candidate = None
+
+    for constr in constrs:
+        candidate_vars = set([
+                var for var in constr.get_var_dict()
+                if var in candidate_inames])
+
+        # Irrelevant constraint - skip
+        if scan_iname not in candidate_vars:
+            continue
+
+        # No additional inames - skip
+        if len(candidate_vars) == 1:
+            continue
+
+        candidate_vars.remove(scan_iname)
+
+        # Depends on more than one iname - error
+        if len(candidate_vars) > 1:
+            raise ValueError(
+                    "More than one sweep iname candidate for scan iname '%s' found "
+                    "(via constraint '%s')" % (scan_iname, constr))
+
+        next_candidate = candidate_vars.pop()
+
+        if sweep_iname_candidate is None:
+            sweep_iname_candidate = next_candidate
+            defining_constraint = constr
+        else:
+            # Check next_candidate consistency
+            if sweep_iname_candidate != next_candidate:
+                raise ValueError(
+                        "More than one sweep iname candidate for scan iname '%s' "
+                        "found (via constraints '%s', '%s')" %
+                        (scan_iname, defining_constraint, constr))
+
+    if sweep_iname_candidate is None:
+        raise ValueError(
+                "Couldn't find any sweep iname candidates for "
+                "scan iname '%s'" % scan_iname)
+
+    return sweep_iname_candidate
+
+
+def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames):
+    domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname)))
+    domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname))
+
+    var_dict = domain.get_var_dict()
+    sweep_idx = var_dict[sweep_iname][1]
+    scan_idx = var_dict[scan_iname][1]
+
+    domain = domain.project_out_except(
+            within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
+
+    try:
+        with isl.SuppressedWarnings(domain.get_ctx()):
+            sweep_lower_bound = domain.dim_min(sweep_idx)
+            sweep_upper_bound = domain.dim_max(sweep_idx)
+            scan_lower_bound = domain.dim_min(scan_idx)
+    except isl.Error as e:
+        raise ValueError("isl error: %s" % e)
+
+    return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound)
+
+
+def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
+    """The stride is the number of steps the scan iname takes per iteration
+    of the sweep iname. This is allowed to be an integer constant.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6.
+    """
+    dim_type = isl.dim_type
+
+    domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname]))
+    domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,))
+
+    domain_with_sweep_param = domain_with_sweep_param.project_out_except(
+            (sweep_iname, scan_iname), (dim_type.set, dim_type.param))
+
+    scan_iname_idx = domain_with_sweep_param.find_dim_by_name(
+            dim_type.set, scan_iname)
+
+    # Should be equal to k * sweep_iname, where k is the stride.
+
+    try:
+        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
+            scan_iname_range = (
+                    domain_with_sweep_param.dim_max(scan_iname_idx)
+                    - domain_with_sweep_param.dim_min(scan_iname_idx)
+                    ).gist(domain_with_sweep_param.params())
+    except isl.Error as e:
+        raise ValueError("isl error: '%s'" % e)
+
+    scan_iname_pieces = scan_iname_range.get_pieces()
+
+    if len(scan_iname_pieces) > 1:
+        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
+    elif len(scan_iname_pieces) == 0:
+        raise ValueError("empty range found for iname '%s'" % scan_iname)
+
+    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
+
+    if not scan_iname_constr.plain_is_universe():
+        raise ValueError("found constraints: %s" % scan_iname_constr)
+
+    if scan_iname_aff.dim(dim_type.div):
+        raise ValueError("aff has div: %s" % scan_iname_aff)
+
+    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
+
+    if len(coeffs) == 0:
+        try:
+            scan_iname_aff.get_constant_val()
+        except:
+            raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
+
+        # If this point is reached we're assuming the domain is of the form
+        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
+        # this function returns will be verified later by
+        # _check_reduction_is_triangular().
+        return 1
+
+    if sweep_iname not in coeffs:
+        raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname)
+
+    stride = coeffs[sweep_iname]
+
+    if not stride.is_int():
+        raise ValueError("stride not an integer: %s" % stride)
+
+    if not stride.is_pos():
+        raise ValueError("stride not positive: %s" % stride)
+
+    return stride.to_python()
+
+
+def _get_domain_with_iname_as_param(domain, iname):
+    dim_type = isl.dim_type
+
+    if domain.find_dim_by_name(dim_type.param, iname) >= 0:
+        return domain
+
+    iname_idx = domain.find_dim_by_name(dim_type.set, iname)
+
+    assert iname_idx >= 0, (iname, domain)
+
+    return domain.move_dims(
+        dim_type.param, domain.dim(dim_type.param),
+        dim_type.set, iname_idx, 1)
+
+
+def _create_domain_for_sweep_tracking(orig_domain,
+        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
+    dim_type = isl.dim_type
+
+    subd = isl.BasicSet.universe(orig_domain.params().space)
+
+    # Add tracking_iname and sweep iname.
+
+    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
+
+    # Here we realize the domain:
+    #
+    # [..., i] -> {
+    #  [j]: 0 <= j - l
+    #       and
+    #       j - l <= k * (i - m)
+    #       and
+    #       k * (i - m - 1) < j - l }
+    # where
+    #   * i is the sweep iname
+    #   * j is the tracking iname
+    #   * k is the stride for the scan
+    #   * l is the lower bound for the scan
+    #   * m is the lower bound for the sweep iname
+    #
+    affs = isl.affs_from_space(subd.space)
+
+    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
+
+    # Move tracking_iname into a set dim (NOT sweep iname).
+    subd = subd.move_dims(
+            dim_type.set, 0,
+            dim_type.param, subd.dim(dim_type.param) - 1, 1)
+
+    # Simplify (maybe).
+    orig_domain_with_sweep_param = (
+            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
+    subd = subd.gist_params(orig_domain_with_sweep_param.params())
+
+    subd, = subd.get_basic_sets()
+
+    return subd
+
 
 def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     """
@@ -462,10 +880,22 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     return kernel.copy(temporary_variables=new_temporary_variables,
                        instructions=new_instructions)
 
+
+def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
+    # Intersect with inames, because we could have captured some kernel params
+    # in here too...
+    dependent_inames = (
+            frozenset(subdomain.get_var_names(isl.dim_type.param))
+            & kernel.all_inames())
+    idx, = kernel.get_leaf_domain_indices(dependent_inames)
+    domains.insert(idx + 1, subdomain)
+
 # }}}
 
 
-def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
+def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
+                      automagic_scans_ok=False, force_scan=False,
+                      force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
     specified, operate only on the instruction with an instruction id matching
     *insn_id_filter*.
@@ -476,6 +906,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     If *insn_id_filter* is not given, all reductions in all instructions will
     be realized.
+
+    If *automagic_scans_ok*, this function will attempt to rewrite triangular
+    reductions as scans automatically.
+
+    If *force_scan* is *True*, this function will attempt to rewrite *all*
+    candidate reductions as scans and raise an error if this is not possible
+    (this is most useful combined with *insn_id_filter*).
+
+    If *force_outer_iname_for_scan* is not *None*, this function will attempt
+    to realize candidate reductions as scans using the specified iname as the
+    outer (sweep) iname.
     """
 
     logger.debug("%s: realize reduction" % kernel.name)
@@ -487,6 +928,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
+    inames_added_for_scan = set()
+    inames_to_remove = set()
 
     # {{{ helpers
 
@@ -496,8 +939,44 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         else:
             return val
 
+    def preprocess_scan_arguments(
+                insn, expr, nresults, scan_iname, track_iname,
+                newly_generated_insn_id_set):
+        """Does iname substitution within scan arguments and returns a set of values
+        suitable to be passed to the binary op. Returns a tuple."""
+
+        if nresults > 1:
+            inner_expr = expr
+
+            # In the case of a multi-argument scan, we need a name for each of
+            # the arguments in order to pass them to the binary op - so we expand
+            # items that are not "plain" tuples here.
+            if not isinstance(inner_expr, tuple):
+                get_args_insn_id = insn_id_gen(
+                        "%s_%s_get" % (insn.id, "_".join(expr.inames)))
+
+                inner_expr = expand_inner_reduction(
+                        id=get_args_insn_id,
+                        expr=inner_expr,
+                        nresults=nresults,
+                        depends_on=insn.depends_on,
+                        within_inames=insn.within_inames | expr.inames,
+                        within_inames_is_final=insn.within_inames_is_final)
+
+                newly_generated_insn_id_set.add(get_args_insn_id)
+
+            updated_inner_exprs = tuple(
+                    replace_var_within_expr(sub_expr, scan_iname, track_iname)
+                    for sub_expr in inner_expr)
+        else:
+            updated_inner_exprs = (
+                    replace_var_within_expr(expr, scan_iname, track_iname),)
+
+        return updated_inner_exprs
+
     def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
             within_inames_is_final):
+        # FIXME: use make_temporaries
         from pymbolic.primitives import Call
         from loopy.symbolic import Reduction
         assert isinstance(expr, (Call, Reduction))
@@ -537,20 +1016,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from pymbolic import var
-        acc_var_names = [
-                var_name_gen("acc_"+"_".join(expr.inames))
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+"_".join(expr.inames),
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
 
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
+        init_insn_depends_on = frozenset()
 
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
 
         init_id = insn_id_gen(
                 "%s_%s_init" % (insn.id, "_".join(expr.inames)))
@@ -560,7 +1042,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 assignees=acc_vars,
                 within_inames=outer_insn_inames - frozenset(expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(),
+                depends_on=init_insn_depends_on,
                 expression=expr.operation.neutral_element(*arg_dtypes))
 
         generated_insns.append(init_insn)
@@ -574,17 +1056,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         reduction_insn_depends_on = set([init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
         if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, "_".join(expr.inames)))
 
             reduction_expr = expand_inner_reduction(
-                id=get_args_insn_id,
-                expr=expr.expr,
-                nresults=nresults,
-                depends_on=insn.depends_on,
-                within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final)
+                    id=get_args_insn_id,
+                    expr=expr.expr,
+                    nresults=nresults,
+                    depends_on=insn.depends_on,
+                    within_inames=update_insn_iname_deps,
+                    within_inames_is_final=insn.within_inames_is_final)
 
             reduction_insn_depends_on.add(get_args_insn_id)
         else:
@@ -633,6 +1118,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 v[iname].lt_set(v[0] + size)).get_basic_sets()
         return bs
 
+    def _make_slab_set_from_range(iname, lbound, ubound):
+        v = isl.make_zero_and_vars([iname])
+        bs, = (
+                v[iname].ge_set(v[0] + lbound)
+                &
+                v[iname].lt_set(v[0] + ubound)).get_basic_sets()
+        return bs
+
     def map_reduction_local(expr, rec, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
@@ -657,6 +1150,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 _get_int_iname_size(oiname)
                 for oiname in outer_local_inames)
 
+        from loopy.kernel.data import temp_var_scope
+
+        neutral_var_names = make_temporaries(
+                name_based_on="neutral_"+red_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+red_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
         # {{{ add separate iname to carry out the reduction
 
         # Doing this sheds any odd conditionals that may be active
@@ -668,32 +1179,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         # }}}
 
-        neutral_var_names = [
-                var_name_gen("neutral_"+red_iname)
-                for i in range(nresults)]
-        acc_var_names = [
-                var_name_gen("acc_"+red_iname)
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=outer_local_iname_sizes + (size,),
-                    dtype=dtype,
-                    scope=temp_var_scope.LOCAL)
-        for name, dtype in zip(neutral_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
-
         base_iname_deps = outer_insn_inames - frozenset(expr.inames)
 
         neutral = expr.operation.neutral_element(*arg_dtypes)
-
         init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
         init_insn = make_assignment(
                 id=init_id,
@@ -718,6 +1206,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         transfer_depends_on = set([init_neutral_id, init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
         if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, red_iname))
@@ -752,9 +1243,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     (outer_insn_inames - frozenset(expr.inames))
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(transfer_depends_on) | insn.depends_on,
-                no_sync_with=frozenset(
-                    [(insn_id, "any") for insn_id in transfer_depends_on]))
+                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
+                no_sync_with=frozenset([(init_id, "any")]))
         generated_insns.append(transfer_insn)
 
         cur_size = 1
@@ -766,6 +1256,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         istage = 0
         while cur_size > 1:
+
             new_size = cur_size // 2
             assert new_size * 2 == cur_size
 
@@ -814,6 +1305,351 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
     # }}}
 
+    # {{{ utils (stateful)
+
+    from pytools import memoize
+
+    @memoize
+    def get_or_add_sweep_tracking_iname_and_domain(
+            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+            tracking_iname):
+        domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
+
+        inames_added_for_scan.add(tracking_iname)
+
+        new_domain = _create_domain_for_sweep_tracking(domain,
+                tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
+
+        _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain)
+
+        return tracking_iname
+
+    def replace_var_within_expr(expr, from_var, to_var):
+        from pymbolic.mapper.substitutor import make_subst_func
+
+        from loopy.symbolic import (
+            SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
+
+        rule_mapping_context = SubstitutionRuleMappingContext(
+            temp_kernel.substitutions, var_name_gen)
+
+        from pymbolic import var
+        mapper = RuleAwareSubstitutionMapper(
+            rule_mapping_context,
+            make_subst_func({from_var: var(to_var)}),
+            within=lambda *args: True)
+
+        return mapper(expr, temp_kernel, None)
+
+    def make_temporaries(name_based_on, nvars, shape, dtypes, scope):
+        var_names = [
+                var_name_gen(name_based_on.format(index=i))
+                for i in range(nvars)]
+
+        from loopy.kernel.data import TemporaryVariable
+
+        for name, dtype in zip(var_names, dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=shape,
+                    dtype=dtype,
+                    scope=scope)
+
+        return var_names
+
+    # }}}
+
+    # {{{ sequential scan
+
+    def map_scan_seq(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+            scan_min_value, stride):
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+        inames_to_remove.add(scan_iname)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__seq_scan"
+                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value,
+                stride, track_iname)
+
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_" + scan_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        init_id = insn_id_gen(
+                "%s_%s_init" % (insn.id, "_".join(expr.inames)))
+
+        init_insn_depends_on = frozenset()
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=acc_vars,
+                within_inames=outer_insn_inames - frozenset(
+                    (sweep_iname,) + expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on,
+                expression=expr.operation.neutral_element(*arg_dtypes))
+
+        generated_insns.append(init_insn)
+
+        update_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, update_insn_depends_on))
+
+        update_id = insn_id_gen(
+                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
+
+        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname])
+        if insn.within_inames_is_final:
+            update_insn_iname_deps = insn.within_inames | set([track_iname])
+
+        scan_insn = make_assignment(
+                id=update_id,
+                assignees=acc_vars,
+                expression=expr.operation(
+                    arg_dtypes,
+                    _strip_if_scalar(acc_vars, acc_vars),
+                    _strip_if_scalar(acc_vars, updated_inner_exprs)),
+                depends_on=frozenset(update_insn_depends_on),
+                within_inames=update_insn_iname_deps,
+                no_sync_with=insn.no_sync_with,
+                within_inames_is_final=insn.within_inames_is_final)
+
+        generated_insns.append(scan_insn)
+
+        new_insn_add_depends_on.add(scan_insn.id)
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0]
+        else:
+            return acc_vars
+
+    # }}}
+
+    # {{{ local-parallel scan
+
+    def map_scan_local(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname,
+            sweep_min_value, scan_min_value, stride):
+
+        scan_size = _get_int_iname_size(sweep_iname)
+
+        assert scan_size > 0
+
+        if scan_size == 1:
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+
+        from loopy.kernel.data import LocalIndexTagBase
+        outer_local_inames = tuple(
+                oiname
+                for oiname in outer_insn_inames
+                if isinstance(
+                    kernel.iname_to_tag.get(oiname),
+                    LocalIndexTagBase)
+                and oiname != sweep_iname)
+
+        from pymbolic import var
+        outer_local_iname_vars = tuple(
+                var(oiname) for oiname in outer_local_inames)
+
+        outer_local_iname_sizes = tuple(
+                _get_int_iname_size(oiname)
+                for oiname in outer_local_inames)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__pre_scan"
+                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+                track_iname)
+
+        # {{{ add separate iname to carry out the scan
+
+        # Doing this sheds any odd conditionals that may be active
+        # on our scan_iname.
+
+        base_exec_iname = var_name_gen(sweep_iname + "__scan")
+        domains.append(_make_slab_set(base_exec_iname, scan_size))
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+        # }}}
+
+        from loopy.kernel.data import temp_var_scope
+
+        read_var_names = make_temporaries(
+                name_based_on="read_"+scan_iname+"_arg_{index}",
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+scan_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (scan_size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+        read_vars = tuple(var(n) for n in read_var_names)
+
+        base_iname_deps = (outer_insn_inames
+                - frozenset(expr.inames) - frozenset([sweep_iname]))
+
+        neutral = expr.operation.neutral_element(*arg_dtypes)
+
+        init_insn_depends_on = insn.depends_on
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname))
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=neutral,
+                within_inames=base_iname_deps | frozenset([base_exec_iname]),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on)
+        generated_insns.append(init_insn)
+
+        transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, transfer_insn_depends_on))
+
+        from loopy.symbolic import Reduction
+
+        from loopy.symbolic import pw_aff_to_expr
+        sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
+
+        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname))
+        transfer_insn = make_assignment(
+                id=transfer_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars
+                            + (var(sweep_iname) - sweep_min_value_expr,)]
+                    for acc_var in acc_vars),
+                expression=Reduction(
+                    operation=expr.operation,
+                    inames=(track_iname,),
+                    expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
+                    allow_simultaneous=False,
+                    ),
+                within_inames=outer_insn_inames - frozenset(expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset(transfer_insn_depends_on),
+                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+
+        generated_insns.append(transfer_insn)
+
+        prev_id = transfer_id
+
+        istage = 0
+        cur_size = 1
+
+        while cur_size < scan_size:
+            stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
+            domains.append(
+                    _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+            for read_var, acc_var in zip(read_vars, acc_vars):
+                read_stage_id = insn_id_gen(
+                        "scan_%s_read_stage_%d" % (scan_iname, istage))
+
+                read_stage_insn = make_assignment(
+                        id=read_stage_id,
+                        assignees=(read_var,),
+                        expression=(
+                                acc_var[
+                                    outer_local_iname_vars
+                                    + (var(stage_exec_iname) - cur_size,)]),
+                        within_inames=(
+                            base_iname_deps | frozenset([stage_exec_iname])),
+                        within_inames_is_final=insn.within_inames_is_final,
+                        depends_on=frozenset([prev_id]))
+
+                if cur_size == 1:
+                    # Performance hack: don't add a barrier here with transfer_insn.
+                    # NOTE: This won't work if the way that local inames
+                    # are lowered changes.
+                    read_stage_insn = read_stage_insn.copy(
+                            no_sync_with=(
+                                read_stage_insn.no_sync_with
+                                | frozenset([(transfer_id, "any")])))
+
+                generated_insns.append(read_stage_insn)
+                prev_id = read_stage_id
+
+            write_stage_id = insn_id_gen(
+                    "scan_%s_write_stage_%d" % (scan_iname, istage))
+            write_stage_insn = make_assignment(
+                    id=write_stage_id,
+                    assignees=tuple(
+                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                        for acc_var in acc_vars),
+                    expression=expr.operation(
+                        arg_dtypes,
+                        _strip_if_scalar(acc_vars, read_vars),
+                        _strip_if_scalar(acc_vars, tuple(
+                            acc_var[
+                                outer_local_iname_vars + (var(stage_exec_iname),)]
+                            for acc_var in acc_vars))
+                        ),
+                    within_inames=(
+                        base_iname_deps | frozenset([stage_exec_iname])),
+                    within_inames_is_final=insn.within_inames_is_final,
+                    depends_on=frozenset([prev_id]),
+                    )
+
+            generated_insns.append(write_stage_insn)
+            prev_id = write_stage_id
+
+            cur_size *= 2
+            istage += 1
+
+        new_insn_add_depends_on.add(prev_id)
+        new_insn_add_within_inames.add(sweep_iname)
+
+        output_idx = var(sweep_iname) - sweep_min_value_expr
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0][outer_local_iname_vars + (output_idx,)]
+        else:
+            return [acc_var[outer_local_iname_vars + (output_idx,)]
+                    for acc_var in acc_vars]
+
+    # }}}
+
     # {{{ seq/par dispatch
 
     def map_reduction(expr, rec, nresults=1):
@@ -832,31 +1668,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
-        n_sequential = 0
-        n_local_par = 0
+        iname_classes = _classify_reduction_inames(temp_kernel, expr.inames)
 
-        from loopy.kernel.data import (
-                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-                ParallelTag)
-        for iname in expr.inames:
-            iname_tag = kernel.iname_to_tag.get(iname)
+        n_sequential = len(iname_classes.sequential)
+        n_local_par = len(iname_classes.local_parallel)
+        n_nonlocal_par = len(iname_classes.nonlocal_parallel)
+
+        really_force_scan = force_scan and (
+                len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan)
+
+        def _error_if_force_scan_on(cls, msg):
+            if really_force_scan:
+                raise cls(msg)
 
-            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
-                # These are nominally parallel, but we can live with
-                # them as sequential.
-                n_sequential += 1
+        may_be_implemented_as_scan = False
+        if force_scan or automagic_scans_ok:
+            from loopy.diagnostic import ReductionIsNotTriangularError
 
-            elif isinstance(iname_tag, LocalIndexTagBase):
-                n_local_par += 1
+            try:
+                # Try to determine scan candidate information (sweep iname, scan
+                # iname, etc).
+                scan_param = _try_infer_scan_candidate_from_expr(
+                        temp_kernel, expr, outer_insn_inames,
+                        sweep_iname=force_outer_iname_for_scan)
 
-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
-                        "tagged '%s'"
-                        % (iname, type(iname_tag).__name__))
+            except ValueError as v:
+                error = str(v)
 
             else:
-                n_sequential += 1
+                # Ensures the reduction is triangular (somewhat expensive).
+                may_be_implemented_as_scan, error = (
+                        _check_reduction_is_triangular(
+                            temp_kernel, expr, scan_param))
+
+            if not may_be_implemented_as_scan:
+                _error_if_force_scan_on(ReductionIsNotTriangularError, error)
+
+        # {{{ sanity checks
 
         if n_local_par and n_sequential:
             raise LoopyError("Reduction over '%s' contains both parallel and "
@@ -872,21 +1720,87 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))
 
-        if n_sequential:
-            assert n_local_par == 0
-            return map_reduction_seq(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        elif n_local_par:
-            return map_reduction_local(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        else:
+        if n_nonlocal_par:
+            bad_inames = iname_classes.nonlocal_parallel
+            raise LoopyError("the only form of parallelism supported "
+                    "by reductions is 'local'--found iname(s) '%s' "
+                    "respectively tagged '%s'"
+                    % (", ".join(bad_inames),
+                       ", ".join(kernel.iname_to_tag[iname]
+                                 for iname in bad_inames)))
+
+        if n_local_par == 0 and n_sequential == 0:
             from loopy.diagnostic import warn_with_kernel
             warn_with_kernel(kernel, "empty_reduction",
                     "Empty reduction found (no inames to reduce over). "
                     "Eliminating.")
 
+            # We're not supposed to reduce/sum at all. (Note how this is distinct
+            # from an empty reduction--there is an element here, just no inames
+            # to reduce over. It's rather similar to an array with () shape in
+            # numpy.)
+
             return expr.expr
 
+        # }}}
+
+        if may_be_implemented_as_scan:
+            assert force_scan or automagic_scans_ok
+
+            # We require the "scan" iname to be tagged sequential.
+            if n_sequential:
+                sweep_iname = scan_param.sweep_iname
+                sweep_class = _classify_reduction_inames(kernel, (sweep_iname,))
+
+                sequential = sweep_iname in sweep_class.sequential
+                parallel = sweep_iname in sweep_class.local_parallel
+                bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
+
+                if sweep_iname not in outer_insn_inames:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' was detected, but is not an iname "
+                            "for the instruction." % sweep_iname)
+                elif bad_parallel:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' has an unsupported parallel tag '%s' "
+                            "- the only parallelism allowed is 'local'." %
+                            (sweep_iname, temp_kernel.iname_to_tag[sweep_iname]))
+                elif parallel:
+                    return map_scan_local(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+                elif sequential:
+                    return map_scan_seq(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+
+                # fallthrough to reduction implementation
+
+            else:
+                assert n_local_par > 0
+                scan_iname, = expr.inames
+                _error_if_force_scan_on(LoopyError,
+                        "Scan iname '%s' is parallel tagged: this is not allowed "
+                        "(only the sweep iname should be tagged if parallelism "
+                        "is desired)." % scan_iname)
+
+                # fallthrough to reduction implementation
+
+        if n_sequential:
+            assert n_local_par == 0
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+        else:
+            assert n_local_par > 0
+            return map_reduction_local(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
     # }}}
 
     from loopy.symbolic import ReductionCallbackMapper
@@ -992,6 +1906,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     kernel = lp.tag_inames(kernel, new_iname_tags)
 
+    # TODO: remove unused inames...
+
     kernel = (
             _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                 kernel))
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index c078da2ec58dabbbf646bfcf593ea0138941cc85..57cf74b808ae1a7107e76a18a3876785ab8baabd 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1908,14 +1908,14 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 
                 if (gsize or lsize):
                     if not kernel.options.disable_global_barriers:
-                        logger.info("%s: barrier insertion: global" % kernel.name)
+                        logger.debug("%s: barrier insertion: global" % kernel.name)
                         gen_sched = insert_barriers(kernel, gen_sched,
                                 kind="global", verify_only=True)
 
-                    logger.info("%s: barrier insertion: local" % kernel.name)
+                    logger.debug("%s: barrier insertion: local" % kernel.name)
                     gen_sched = insert_barriers(kernel, gen_sched, kind="local",
                             verify_only=False)
-                    logger.info("%s: barrier insertion: done" % kernel.name)
+                    logger.debug("%s: barrier insertion: done" % kernel.name)
 
                 new_kernel = kernel.copy(
                         schedule=gen_sched,
diff --git a/loopy/statistics.py b/loopy/statistics.py
index cb15eb55498bcafe4ae537747e387e47ddbd8254..9b15ec471fb681698b85c1dd2f92376fbc731f00 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,6 +38,7 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
+.. autoclass:: GuardedPwQPolynomial
 .. autoclass:: ToCountMap
 .. autoclass:: Op
 .. autoclass:: MemAccess
@@ -52,6 +53,66 @@ __doc__ = """
 """
 
 
+# {{{ GuardedPwQPolynomial
+
+class GuardedPwQPolynomial(object):
+    def __init__(self, pwqpolynomial, valid_domain):
+        self.pwqpolynomial = pwqpolynomial
+        self.valid_domain = valid_domain
+
+    def __add__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other,
+                    self.valid_domain)
+
+    __radd__ = __add__
+
+    def __mul__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other,
+                    self.valid_domain)
+
+    __rmul__ = __mul__
+
+    def eval_with_dict(self, value_dict):
+        space = self.pwqpolynomial.space
+        pt = isl.Point.zero(space.params())
+
+        for i in range(space.dim(dim_type.param)):
+            par_name = space.get_dim_name(dim_type.param, i)
+            pt = pt.set_coordinate_val(
+                dim_type.param, i, value_dict[par_name])
+
+        if not (isl.Set.from_point(pt) <= self.valid_domain):
+            raise ValueError("evaluation point outside of domain of "
+                    "definition of piecewise quasipolynomial")
+
+        return self.pwqpolynomial.eval(pt).to_python()
+
+    @staticmethod
+    def zero():
+        p = isl.PwQPolynomial('{ 0 }')
+        return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space))
+
+    def __str__(self):
+        return str(self.pwqpolynomial)
+
+    def __repr__(self):
+        return repr(self.pwqpolynomial)
+
+# }}}
+
+
 # {{{ ToCountMap
 
 class ToCountMap(object):
@@ -66,7 +127,7 @@ class ToCountMap(object):
 
     """
 
-    def __init__(self, init_dict=None, val_type=isl.PwQPolynomial):
+    def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
@@ -87,7 +148,7 @@ class ToCountMap(object):
         return self
 
     def __mul__(self, other):
-        if isinstance(other, isl.PwQPolynomial):
+        if isinstance(other, GuardedPwQPolynomial):
             return ToCountMap(dict(
                 (index, self.count_map[index]*other)
                 for index in self.keys()))
@@ -103,8 +164,8 @@ class ToCountMap(object):
             return self.count_map[index]
         except KeyError:
             #TODO what is the best way to handle this?
-            if self.val_type is isl.PwQPolynomial:
-                return isl.PwQPolynomial('{ 0 }')
+            if self.val_type is GuardedPwQPolynomial:
+                return GuardedPwQPolynomial.zero()
             else:
                 return 0
 
@@ -132,12 +193,18 @@ class ToCountMap(object):
     def copy(self):
         return ToCountMap(dict(self.count_map), self.val_type)
 
+    def with_set_attributes(self, **kwargs):
+        return ToCountMap(dict(
+            (key.copy(**kwargs), val)
+            for key, val in six.iteritems(self.count_map)),
+            self.val_type)
+
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
 
-        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
-                             the :class:`ToCountMap`, each given a list of
-                             allowable values for that key field.
+        :arg kwargs: Keyword arguments matching fields in the keys of
+                 the :class:`ToCountMap`, each given a list of
+                 allowable values for that key field.
 
         :return: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` that match the field values
@@ -183,10 +250,10 @@ class ToCountMap(object):
     def filter_by_func(self, func):
         """Keep items that pass a test.
 
-        :parameter func: A function that takes a map key a parameter and
-                         returns a :class:`bool`.
+        :arg func: A function that takes a map key a parameter and
+             returns a :class:`bool`.
 
-        :return: A :class:`ToCountMap` containing the subset of the items in
+        :arg: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` for which func(key) is true.
 
         Example usage::
@@ -218,7 +285,7 @@ class ToCountMap(object):
         """Group map items together, distinguishing by only the key fields
            passed in args.
 
-        :parameter \*args: Zero or more :class:`str` fields of map keys.
+        :arg args: Zero or more :class:`str` fields of map keys.
 
         :return: A :class:`ToCountMap` containing the same total counts
                  grouped together by new keys that only contain the fields
@@ -336,8 +403,8 @@ class ToCountMap(object):
 
         """
 
-        if self.val_type is isl.PwQPolynomial:
-            total = isl.PwQPolynomial('{ 0 }')
+        if self.val_type is GuardedPwQPolynomial:
+            total = GuardedPwQPolynomial.zero()
         else:
             total = 0
 
@@ -385,8 +452,10 @@ def stringify_stats_mapping(m):
     return result
 
 
+# {{{ Op descriptor
+
 class Op(object):
-    """An arithmetic operation.
+    """A descriptor for a type of arithmetic operation.
 
     .. attribute:: dtype
 
@@ -400,6 +469,8 @@ class Op(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, dtype=None, name=None):
         self.name = name
         if dtype is None:
@@ -419,19 +490,15 @@ class Op(object):
         return hash(str(self))
 
     def __repr__(self):
-        if self.dtype is None:
-            dtype = 'None'
-        else:
-            dtype = str(self.dtype)
-        if self.name is None:
-            name = 'None'
-        else:
-            name = self.name
-        return "Op("+dtype+", "+name+")"
+        return "Op(%s, %s)" % (self.dtype, self.name)
 
+# }}}
+
+
+# {{{ MemAccess descriptor
 
 class MemAccess(object):
-    """A memory access.
+    """A descriptor for a type of memory access.
 
     .. attribute:: mtype
 
@@ -460,6 +527,8 @@ class MemAccess(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
                  variable=None):
         self.mtype = mtype
@@ -482,6 +551,16 @@ class MemAccess(object):
             raise NotImplementedError("MemAccess: variable must be None when "
                                       "mtype is 'local'")
 
+    def copy(self, mtype=None, dtype=None, stride=None, direction=None,
+            variable=None):
+        return MemAccess(
+                mtype=mtype if mtype is not None else self.mtype,
+                dtype=dtype if dtype is not None else self.dtype,
+                stride=stride if stride is not None else self.stride,
+                direction=direction if direction is not None else self.direction,
+                variable=variable if variable is not None else self.variable,
+                )
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 (self.mtype is None or other.mtype is None or
@@ -522,11 +601,70 @@ class MemAccess(object):
         return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \
                + direction + ", " + variable + ")"
 
+# }}}
 
-# {{{ ExpressionOpCounter
 
-class ExpressionOpCounter(CombineMapper):
+# {{{ counter base
 
+class CounterBase(CombineMapper):
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.type_inference import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    def map_if(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_common_subexpression(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+    map_substitution = map_common_subexpression
+    map_derivative = map_common_subexpression
+    map_slice = map_common_subexpression
+
+    # preprocessing should have removed these
+    def map_reduction(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+# }}}
+
+
+# {{{ ExpressionOpCounter
+
+class ExpressionOpCounter(CounterBase):
     def __init__(self, knl):
         self.knl = knl
         from loopy.type_inference import TypeInferenceMapper
@@ -641,106 +779,59 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
-# {{{ LocalSubscriptCounter
+class MemAccessCounter(CounterBase):
+    pass
 
-class LocalSubscriptCounter(CombineMapper):
 
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
+# {{{ LocalMemAccessCounter
 
-    def combine(self, values):
-        return sum(values)
-
-    def map_constant(self, expr):
-        return ToCountMap()
-
-    map_tagged_variable = map_constant
-    map_variable = map_constant
-
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
-
-    def map_subscript(self, expr):
+class LocalMemAccessCounter(MemAccessCounter):
+    def count_var_access(self, dtype, name, subscript):
         sub_map = ToCountMap()
-        name = expr.aggregate.name  # name of array
         if name in self.knl.temporary_variables:
             array = self.knl.temporary_variables[name]
             if array.is_local:
-                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
-        return sub_map + self.rec(expr.index)
-
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
+                sub_map[MemAccess(mtype='local', dtype=dtype)] = 1
+        return sub_map
 
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
+    def map_variable(self, expr):
+        return self.count_var_access(
+                    self.type_inf(expr), expr.name, None)
 
-    def map_derivative(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
+    map_tagged_variable = map_variable
 
-    def map_slice(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
+    def map_subscript(self, expr):
+        return (
+                self.count_var_access(
+                    self.type_inf(expr), expr.aggregate.name, expr.index)
+                + self.rec(expr.index))
 
 # }}}
 
 
-# {{{ GlobalSubscriptCounter
-
-class GlobalSubscriptCounter(CombineMapper):
+# {{{ GlobalMemAccessCounter
 
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-
-    def combine(self, values):
-        return sum(values)
+class GlobalMemAccessCounter(MemAccessCounter):
+    def map_variable(self, expr):
+        name = expr.name
 
-    def map_constant(self, expr):
-        return ToCountMap()
+        if name in self.knl.arg_dict:
+            array = self.knl.arg_dict[name]
+        else:
+            # this is a temporary variable
+            return ToCountMap()
 
-    map_tagged_variable = map_constant
-    map_variable = map_constant
+        if not isinstance(array, lp.GlobalArg):
+            # this array is not in global memory
+            return ToCountMap()
 
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
+        return ToCountMap({MemAccess(mtype='global',
+                                     dtype=self.type_inf(expr), stride=0,
+                                     variable=name): 1}
+                          ) + self.rec(expr.index)
 
     def map_subscript(self, expr):
-        name = expr.aggregate.name  # name of array
+        name = expr.aggregate.name
 
         if name in self.knl.arg_dict:
             array = self.knl.arg_dict[name]
@@ -827,47 +918,6 @@ class GlobalSubscriptCounter(CombineMapper):
                                      stride=total_stride, variable=name): 1}
                           ) + self.rec(expr.index)
 
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
-    def map_derivative(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
-
-    def map_slice(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
-
 # }}}
 
 
@@ -940,9 +990,13 @@ class AccessFootprintGatherer(CombineMapper):
 
 # {{{ count
 
-def count(kernel, set):
+def add_assumptions_guard(kernel, pwqpolynomial):
+    return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions)
+
+
+def count(kernel, set, space=None):
     try:
-        return set.card()
+        return add_assumptions_guard(kernel, set.card())
     except AttributeError:
         pass
 
@@ -969,7 +1023,11 @@ def count(kernel, set):
             if stride is None:
                 stride = 1
 
-            length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride)
+            length_pwaff = dmax - dmin + stride
+            if space is not None:
+                length_pwaff = length_pwaff.align_params(space)
+
+            length = isl.PwQPolynomial.from_pw_aff(length_pwaff)
             length = length.scale_down_val(stride)
 
             if bset_count is None:
@@ -1029,46 +1087,102 @@ def count(kernel, set):
                         "number of integer points in your loop "
                         "domain.")
 
-    return count
+    return add_assumptions_guard(kernel, count)
 
-# }}}
 
+def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
+    # FIXME: Multi-kernel support
+    gsize, lsize = knl.get_grid_size_upper_bounds()
 
-# {{{ get_op_poly
+    g_used = set()
+    l_used = set()
 
-def get_op_poly(knl, numpy_types=True):
+    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+    for iname in knl.insn_inames(insn):
+        tag = knl.iname_to_tag.get(iname)
 
-    """Count the number of operations in a loopy kernel.
+        if isinstance(tag, LocalIndexTag):
+            l_used.add(tag.axis)
+        elif isinstance(tag, GroupIndexTag):
+            g_used.add(tag.axis)
 
-    get_op_poly is deprecated. Use get_op_map instead.
+    def mult_grid_factor(used_axes, size):
+        result = 1
+        for iaxis, size in enumerate(size):
+            if iaxis not in used_axes:
+                if not isinstance(size, int):
+                    if space is not None:
+                        size = size.align_params(space)
+
+                    size = isl.PwQPolynomial.from_pw_aff(size)
+
+                result = result * size
+
+        return result
+
+    if disregard_local_axes:
+        result = mult_grid_factor(g_used, gsize)
+    else:
+        result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize)
+
+    return add_assumptions_guard(knl, result)
 
-    """
-    warn_with_kernel(knl, "depricated_get_op_poly",
-                     "get_op_poly is deprecated. Use get_op_map instead.")
-    return get_op_map(knl, numpy_types)
+
+def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False):
+    insn_inames = knl.insn_inames(insn)
+
+    if disregard_local_axes:
+        from loopy.kernel.data import LocalIndexTag
+        insn_inames = [iname for iname in insn_inames if not
+                       isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+
+    inames_domain = knl.get_inames_domain(insn_inames)
+    domain = (inames_domain.project_out_except(
+                            insn_inames, [dim_type.set]))
+
+    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
+            set=[], params=knl.outer_params())
+
+    c = count(knl, domain, space=space)
+
+    if count_redundant_work:
+        unused_fac = get_unused_hw_axes_factor(knl, insn,
+                        disregard_local_axes=disregard_local_axes,
+                        space=space)
+        return c * unused_fac
+    else:
+        return c
 
 # }}}
 
 
-def get_op_map(knl, numpy_types=True):
+# {{{ get_op_map
+
+def get_op_map(knl, numpy_types=True, count_redundant_work=False):
 
     """Count the number of operations in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg numpy_types: A :class:`bool` specifying whether the types
+         in the returned mapping should be numpy types
+         instead of :class:`loopy.LoopyType`.
+
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`Op` specifies the characteristics of the arithmetic
-               operation.
+        - The :class:`Op` specifies the characteristics of the arithmetic
+          operation.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of operations of
-               the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` parameter *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of operations of
+          the kind specified in the key (in terms of the
+          :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
@@ -1090,14 +1204,10 @@ def get_op_map(knl, numpy_types=True):
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
-        # how many times is this instruction executed?
-        # check domain size:
-        insn_inames = knl.insn_inames(insn)
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_map = op_map + ops*count(knl, domain)
+        op_map = op_map + ops*count_insn_runs(
+                knl, insn,
+                count_redundant_work=count_redundant_work)
 
     if numpy_types:
         op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
@@ -1106,73 +1216,36 @@ def get_op_map(knl, numpy_types=True):
 
     return op_map
 
-
-#TODO test deprecated functions?
-def get_lmem_access_poly(knl):
-    """Count the number of local memory accesses in a loopy kernel.
-
-    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['local'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
-                     "get_lmem_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['local'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-# {{{ get_gmem_access_poly
-
-def get_gmem_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
 # }}}
 
 
-def get_mem_access_map(knl, numpy_types=True):
+# {{{ get_mem_access_map
+
+def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False):
     """Count the number of memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
-                    counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+        counted.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg numpy_types: A :class:`bool` specifying whether the types
+        in the returned mapping should be numpy types
+        instead of :class:`loopy.LoopyType`.
+
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`MemAccess` specifies the characteristics of the
-               memory access.
+        - The :class:`MemAccess` specifies the characteristics of the
+          memory access.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of memory
-               accesses with the characteristics specified in the key (in terms
-               of the :class:`loopy.LoopKernel` *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of memory
+          accesses with the characteristics specified in the key (in terms
+          of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
@@ -1217,102 +1290,74 @@ def get_mem_access_map(knl, numpy_types=True):
     cache_holder = CacheHolder()
 
     @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_inames, uniform=False):
-        if uniform:
-            from loopy.kernel.data import LocalIndexTag
-            insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                insn_inames, [dim_type.set]))
-        return count(knl, domain)
+    def get_insn_count(knl, insn_id, uniform=False):
+        insn = knl.id_to_insn[insn_id]
+        return count_insn_runs(
+                knl, insn, disregard_local_axes=uniform,
+                count_redundant_work=count_redundant_work)
 
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_map = ToCountMap()
-    subs_counter_g = GlobalSubscriptCounter(knl)
-    subs_counter_l = LocalSubscriptCounter(knl)
+    access_map = ToCountMap()
+    access_counter_g = GlobalMemAccessCounter(knl)
+    access_counter_l = LocalMemAccessCounter(knl)
 
     for insn in knl.instructions:
-        # count subscripts
-        subs_expr = subs_counter_g(insn.expression) \
-                    + subs_counter_l(insn.expression)
-
-        # distinguish loads and stores
-        for key in subs_expr.count_map:
-            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                stride=key.stride, direction='load',
-                                variable=key.variable)
-                      ] = subs_expr.pop(key)
-
-        subs_assignee_g = subs_counter_g(insn.assignee)
-        for key in subs_assignee_g.count_map:
-            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                      stride=key.stride,
-                                      direction='store',
-                                      variable=key.variable)
-                            ] = subs_assignee_g.pop(key)
-        # for now, don't count writes to local mem
-
-        insn_inames = knl.insn_inames(insn)
+        access_expr = (
+                access_counter_g(insn.expression)
+                + access_counter_l(insn.expression)
+                ).with_set_attributes(direction="load")
+
+        access_assignee_g = access_counter_g(insn.assignee).with_set_attributes(
+                direction="store")
+
+        # FIXME: (!!!!) for now, don't count writes to local mem
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.count_map:
-            map = ToCountMap({key: subs_expr[key]})
-            if (key.mtype == 'global' and
+        for key, val in six.iteritems(access_expr.count_map):
+            is_uniform = (key.mtype == 'global' and
                     isinstance(key.stride, int) and
-                    key.stride == 0):
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
-                #currently not counting stride of local mem access
-
-        for key in subs_assignee_g.count_map:
-            map = ToCountMap({key: subs_assignee_g[key]})
-            if isinstance(key.stride, int) and key.stride == 0:
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
+            #currently not counting stride of local mem access
+
+        for key, val in six.iteritems(access_assignee_g.count_map):
+            is_uniform = (key.mtype == 'global' and
+                    isinstance(key.stride, int) and
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
             # for now, don't count writes to local mem
 
     if numpy_types:
-        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+        # FIXME: Don't modify in-place
+        access_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
                                              dtype=mem_access.dtype.numpy_dtype,
                                              stride=mem_access.stride,
                                              direction=mem_access.direction,
                                              variable=mem_access.variable),
                                   count)
-                      for mem_access, count in six.iteritems(subs_map.count_map))
-
-    return subs_map
-
+                      for mem_access, count in six.iteritems(access_map.count_map))
 
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
-    """Count the number of synchronization events each thread encounters in a
-    loopy kernel.
-
-    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
-
-    """
-    warn_with_kernel(knl, "depricated_get_synchronization_poly",
-                     "get_synchronization_poly is deprecated. Use "
-                     "get_synchronization_map instead.")
-    return get_synchronization_map(knl)
+    return access_map
 
 # }}}
 
 
+# {{{ get_synchronization_map
+
 def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :return: A dictionary mapping each type of synchronization event to a
             :class:`islpy.PwQPolynomial` holding the number of events per
@@ -1379,9 +1424,10 @@ def get_synchronization_map(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    #return result.count_map #TODO is this change okay?
     return result
 
+# }}}
+
 
 # {{{ gather_access_footprints
 
@@ -1477,4 +1523,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
 
 # }}}
 
+
+# {{{ compat goop
+
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.
+
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_gmem_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
+
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
+
+
+def get_op_poly(knl, numpy_types=True):
+    """Count the number of operations in a loopy kernel.
+
+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 8f371085e0f1655651397c16873f10a95a799f79..f24b115fd5a35af94e4a6d437550bccf86b5bee0 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -335,6 +335,8 @@ class PyOpenCLTarget(OpenCLTarget):
                         % dev_id)
 
     def preprocess(self, kernel):
+        if self.device is not None:
+            kernel = adjust_local_temp_var_storage(kernel, self.device)
         return kernel
 
     def pre_codegen_check(self, kernel):
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index a8f47adb991e331f8a473c4eb14b1ea634c7a3b1..2da25ba39ceef38a4af105913973226bd3773729 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import six
 from six.moves import range, zip
 
+import numpy as np
 from pytools import ImmutableRecord, memoize_method
 from loopy.diagnostic import ParameterFinderWarning
 from pytools.py_codegen import (
@@ -686,8 +687,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
     # {{{ debugging aids
 
     def get_code(self, arg_to_dtype=None):
+        def process_dtype(dtype):
+            if isinstance(dtype, type) and issubclass(dtype, np.generic):
+                dtype = np.dtype(dtype)
+            if isinstance(dtype, np.dtype):
+                dtype = NumpyType(dtype, self.kernel.target)
+
+            return dtype
+
         if arg_to_dtype is not None:
-            arg_to_dtype = frozenset(six.iteritems(arg_to_dtype))
+            arg_to_dtype = frozenset(
+                    (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype))
 
         kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype)
 
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 99ec42f44b49f546cda324dfdb3c6a5b001d2222..11951abcf17e94c0fdba51042e3060735215b423 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -133,15 +133,17 @@ class ExpressionToPythonMapper(StringifyMapper):
     def map_if(self, expr, enclosing_prec):
         # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the
         # operator precedence hierarchy (right above "or").
-        from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE
+        from pymbolic.mapper.stringifier import PREC_LOGICAL_OR
         PREC_IFTHENELSE = PREC_LOGICAL_OR - 1  # noqa
 
         return self.parenthesize_if_needed(
             "{then} if {cond} else {else_}".format(
-                then=self.rec(expr.then, PREC_IFTHENELSE),
-                cond=self.rec(expr.condition, PREC_IFTHENELSE),
-                else_=self.rec(expr.else_, PREC_IFTHENELSE)),
-            enclosing_prec, PREC_NONE)
+                # "1 if 0 if 1 else 2 else 3" is not valid Python.
+                # So force parens by using an artificially higher precedence.
+                then=self.rec(expr.then, PREC_LOGICAL_OR),
+                cond=self.rec(expr.condition, PREC_LOGICAL_OR),
+                else_=self.rec(expr.else_, PREC_LOGICAL_OR)),
+            enclosing_prec, PREC_IFTHENELSE)
 
 # }}}
 
@@ -257,7 +259,7 @@ class PythonASTBuilderBase(ASTBuilderBase):
             lbound, ubound, inner):
         ecm = codegen_state.expression_to_code_mapper
 
-        from pymbolic.mapper.stringifier import PREC_NONE
+        from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM
         from genpy import For
 
         return For(
@@ -265,7 +267,7 @@ class PythonASTBuilderBase(ASTBuilderBase):
                 "range(%s, %s + 1)"
                 % (
                     ecm(lbound, PREC_NONE, "i"),
-                    ecm(ubound, PREC_NONE, "i"),
+                    ecm(ubound, PREC_SUM, "i"),
                     ),
                 inner)
 
diff --git a/loopy/version.py b/loopy/version.py
index 8516ce006bde8b8616172a72a766ec86dfcd44f1..02244f55d0dbf207a4641c3ebf6cc33b536f0421 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v63-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version
diff --git a/test/test_loopy.py b/test/test_loopy.py
index ad5fd72b65b9946156d1067aabdb4ff510d6ec63..3ac857478bf4ac1d4cd6868f22896ca63de34f04 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2184,6 +2184,41 @@ def test_barrier_insertion_near_bottom_of_loop():
     assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
+def test_multi_argument_reduction_type_inference():
+    from loopy.type_inference import TypeInferenceMapper
+    from loopy.library.reduction import SegmentedSumReductionOperation
+    from loopy.types import to_loopy_type
+    op = SegmentedSumReductionOperation()
+
+    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
+
+    int32 = to_loopy_type(np.int32)
+
+    expr = lp.symbolic.Reduction(
+            operation=op,
+            inames=("i",),
+            expr=lp.symbolic.Reduction(
+                operation=op,
+                inames="j",
+                expr=(1, 2),
+                allow_simultaneous=True),
+            allow_simultaneous=True)
+
+    t_inf_mapper = TypeInferenceMapper(knl)
+
+    assert (
+            t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
+            == [(int32, int32)])
+
+
+def test_multi_argument_reduction_parsing():
+    from loopy.symbolic import parse, Reduction
+
+    assert isinstance(
+            parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr,
+            Reduction)
+
+
 def test_global_barrier_order_finding():
     knl = lp.make_kernel(
             "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
@@ -2228,41 +2263,6 @@ def test_global_barrier_error_if_unordered():
         lp.get_global_barrier_order(knl)
 
 
-def test_multi_argument_reduction_type_inference():
-    from loopy.type_inference import TypeInferenceMapper
-    from loopy.library.reduction import SegmentedSumReductionOperation
-    from loopy.types import to_loopy_type
-    op = SegmentedSumReductionOperation()
-
-    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
-
-    int32 = to_loopy_type(np.int32)
-
-    expr = lp.symbolic.Reduction(
-            operation=op,
-            inames=("i",),
-            expr=lp.symbolic.Reduction(
-                operation=op,
-                inames="j",
-                expr=(1, 2),
-                allow_simultaneous=True),
-            allow_simultaneous=True)
-
-    t_inf_mapper = TypeInferenceMapper(knl)
-
-    assert (
-            t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
-            == [(int32, int32)])
-
-
-def test_multi_argument_reduction_parsing():
-    from loopy.symbolic import parse, Reduction
-
-    assert isinstance(
-            parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr,
-            Reduction)
-
-
 def test_struct_assignment(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
diff --git a/test/test_scan.py b/test/test_scan.py
new file mode 100644
index 0000000000000000000000000000000000000000..08754819c9a156403aba689cb3e9c238144e7905
--- /dev/null
+++ b/test/test_scan.py
@@ -0,0 +1,432 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2016, 2017 Matt Wala
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+# More things to test.
+# - scan(a) + scan(b)
+# - test for badly tagged inames
+
+@pytest.mark.parametrize("n", [1, 2, 3, 16])
+@pytest.mark.parametrize("stride", [1, 2])
+def test_sequential_scan(ctx_factory, n, stride):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    evt, (a,) = knl(queue)
+
+    assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all()
+
+
+@pytest.mark.parametrize("sweep_lbound, scan_lbound", [
+    (4, 0),
+    (3, 1),
+    (2, 2),
+    (1, 3),
+    (0, 4),
+    (5, -1),
+    ])
+def test_scan_with_different_lower_bound_from_sweep(
+        ctx_factory, sweep_lbound, scan_lbound):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n, sweep_lbound, scan_lbound] -> "
+        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
+        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
+        """
+        out[i-sweep_lbound] = sum(j, j**2)
+        """
+        )
+
+    n = 10
+
+    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (out,) = knl(queue, n=n)
+
+    assert (out.get()
+            == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()
+
+
+def test_automatic_scan_detection():
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0<=i<n}",
+            "{[j]: 0<=j<=2*i}"
+        ],
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    cgr = lp.generate_code_v2(knl)
+    assert "scan" not in cgr.device_code()
+
+
+def test_selective_scan_realization():
+    pass
+
+
+def test_force_outer_iname_for_scan():
+    knl = lp.make_kernel(
+        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
+        "out[i] = product(j, a[j]) {inames=i:k}")
+
+    knl = lp.add_dtypes(knl, dict(a=np.float32))
+
+    # TODO: Maybe this deserves to work?
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        lp.realize_reduction(knl, force_scan=True)
+
+    knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i")
+
+
+def test_dependent_domain_scan(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0<=i<n}",
+            "{[j]: 0<=j<=2*i}"
+        ],
+        """
+        a[i] = sum(j, j**2) {id=scan}
+        """
+        )
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (a,) = knl(queue, n=100)
+
+    assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()
+
+
+@pytest.mark.parametrize("i_tag, j_tag", [
+    ("for", "for")
+    ])
+def test_nested_scan(ctx_factory, i_tag, j_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "[n] -> {[i]: 0 <= i < n}",
+            "[i] -> {[j]: 0 <= j <= i}",
+            "[i] -> {[k]: 0 <= k <= i}"
+        ],
+        """
+        <>tmp[i] = sum(k, 1)
+        out[i] = sum(j, tmp[j])
+        """)
+
+    knl = lp.fix_parameters(knl, n=10)
+    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))
+
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    print(knl)
+
+    evt, (out,) = knl(queue)
+
+    print(out)
+
+
+def test_scan_not_triangular():
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<100 and 1<=j<=2*i}",
+        """
+        a[i] = sum(j, j**2)
+        """
+        )
+
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        knl = lp.realize_reduction(knl, force_scan=True)
+
+
+@pytest.mark.parametrize("n", [1, 2, 3, 16, 17])
+def test_local_parallel_scan(ctx_factory, n):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
+        """
+        out[i] = sum(j, a[j]**2)
+        """,
+        "..."
+        )
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    knl = lp.realize_reduction(knl)
+
+    knl = lp.add_dtypes(knl, dict(a=int))
+
+    print(knl)
+
+    evt, (a,) = knl(queue, a=np.arange(n))
+    assert (a == np.cumsum(np.arange(n)**2)).all()
+
+
+def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
+        """
+        out[i-1] = sum(j, a[j]**2)
+        """,
+        "..."
+        )
+
+    knl = lp.fix_parameters(knl, n=16)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    knl = lp.realize_reduction(knl)
+
+    knl = lp.add_dtypes(knl, dict(a=int))
+    evt, (out,) = knl(queue, a=np.arange(1, 17))
+
+    assert (out == np.cumsum(np.arange(1, 17)**2)).all()
+
+
+def test_scan_extra_constraints_on_domain():
+    knl = lp.make_kernel(
+        "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}",
+        "out[i] = sum(j, a[j])")
+
+    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
+        knl = lp.realize_reduction(
+                knl, force_scan=True, force_outer_iname_for_scan="i")
+
+
+@pytest.mark.parametrize("sweep_iname_tag", ["for", "l.1"])
+def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        [
+            "{[k]: 0<=k<=1}",
+            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
+        ],
+        "out[k,i] = k + sum(j, j**2)"
+        )
+
+    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
+    n = 10
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    evt, (out,) = knl(queue)
+
+    inner = np.cumsum(np.arange(n)**2)
+
+    assert (out.get() == np.array([inner, 1 + inner])).all()
+
+
+@pytest.mark.parametrize("dtype", [
+    np.int32, np.int64, np.float32, np.float64])
+def test_scan_data_types(ctx_factory, dtype):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<=i }",
+            "res[i] = reduce(sum, j, a[j])",
+            assumptions="n>=1")
+
+    a = np.random.randn(20).astype(dtype)
+    knl = lp.add_dtypes(knl, dict(a=dtype))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (res,) = knl(queue, a=a)
+
+    assert np.allclose(res, np.cumsum(a))
+
+
+@pytest.mark.parametrize(("op_name", "np_op"), [
+    ("sum", np.sum),
+    ("product", np.prod),
+    ("min", np.min),
+    ("max", np.max),
+    ])
+def test_scan_library(ctx_factory, op_name, np_op):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<=i }",
+            "res[i] = reduce(%s, j, a[j])" % op_name,
+            assumptions="n>=1")
+
+    a = np.random.randn(20)
+    knl = lp.add_dtypes(knl, dict(a=np.float))
+    knl = lp.realize_reduction(knl, force_scan=True)
+    evt, (res,) = knl(queue, a=a)
+
+    assert np.allclose(res, np.array(
+            [np_op(a[:i+1]) for i in range(len(a))]))
+
+
+def test_scan_unsupported_tags():
+    pass
+
+
+@pytest.mark.parametrize("i_tag", ["for", "l.0"])
+def test_argmax(ctx_factory, i_tag):
+    logging.basicConfig(level=logging.INFO)
+
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 128
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<%d and 0<=j<=i}" % n,
+            """
+            max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j)
+            """)
+
+    knl = lp.tag_inames(knl, dict(i=i_tag))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    a = np.random.randn(n).astype(dtype)
+    evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)
+
+    assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all()
+    assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all()
+
+
+def check_segmented_scan_output(arr, segment_boundaries_indices, out):
+    class SegmentGrouper(object):
+
+        def __init__(self):
+            self.seg_idx = 0
+            self.idx = 0
+
+        def __call__(self, key):
+            if self.idx in segment_boundaries_indices:
+                self.seg_idx += 1
+            self.idx += 1
+            return self.seg_idx
+
+    from itertools import groupby
+
+    expected = [np.cumsum(list(group))
+            for _, group in groupby(arr, SegmentGrouper())]
+    actual = [np.array(list(group))
+            for _, group in groupby(out, SegmentGrouper())]
+
+    assert len(expected) == len(actual) == len(segment_boundaries_indices)
+    assert [(e == a).all() for e, a in zip(expected, actual)]
+
+
+@pytest.mark.parametrize("n, segment_boundaries_indices", [
+    (1, (0,)),
+    (2, (0,)),
+    (2, (0, 1)),
+    (3, (0,)),
+    (3, (0, 1)),
+    (3, (0, 2)),
+    (3, (0, 1, 2)),
+    (16, (0, 4, 8, 12))])
+@pytest.mark.parametrize("iname_tag", ("for", "l.0"))
+def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    arr = np.ones(n, dtype=np.float32)
+    segment_boundaries = np.zeros(n, dtype=np.int32)
+    segment_boundaries[(segment_boundaries_indices,)] = 1
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<=i}",
+        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
+        [
+            lp.GlobalArg("arr", np.float32, shape=("n",)),
+            lp.GlobalArg("segflag", np.int32, shape=("n",)),
+            "..."
+        ])
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.tag_inames(knl, dict(i=iname_tag))
+    knl = lp.realize_reduction(knl, force_scan=True)
+
+    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)
+
+    check_segmented_scan_output(arr, segment_boundaries_indices, out)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5e363f13594ee8e4cf170faa232b0783cca9d018..a72b62af90050008f837e144f1f28d4a4de1c730 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -49,7 +49,7 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -74,7 +74,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -100,7 +100,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -130,7 +130,7 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -166,7 +166,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -205,7 +205,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
+    op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -229,7 +229,7 @@ def test_mem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -269,7 +269,7 @@ def test_mem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -307,7 +307,7 @@ def test_mem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -343,7 +343,7 @@ def test_mem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -437,11 +437,11 @@ def test_mem_access_counter_mixed():
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                 x=np.float32))
-    threads = 16
-    knl = lp.split_iname(knl, "j", threads)
+    bsize = 16
+    knl = lp.split_iname(knl, "j", bsize)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -463,8 +463,8 @@ def test_mem_access_counter_mixed():
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                             ].eval_with_dict(params)
-    assert f64uniform == 2*n*m
-    assert f32uniform == n*m*l/threads
+    assert f64uniform == 2*n*m*l/bsize
+    assert f32uniform == n*m*l/bsize
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
@@ -474,7 +474,7 @@ def test_mem_access_counter_mixed():
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                            ].eval_with_dict(params)
-    assert f64uniform == n*m
+    assert f64uniform == n*m*l/bsize
     assert f32nonconsec == n*m*l
 
 
@@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -545,7 +545,7 @@ def test_mem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -563,7 +563,7 @@ def test_mem_access_counter_consec():
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                          ].eval_with_dict(params)
-    assert f64consec == 2*n*m
+    assert f64consec == 2*n*m*l
     assert f32consec == 3*n*m*l
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
@@ -572,7 +572,7 @@ def test_mem_access_counter_consec():
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                         ].eval_with_dict(params)
-    assert f64consec == n*m
+    assert f64consec == n*m*l
     assert f32consec == n*m*l
 
 
@@ -628,6 +628,7 @@ def test_barrier_counter_barriers():
 
 def test_all_counters_parallel_matmul():
 
+    bsize = 16
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
@@ -635,9 +636,9 @@ def test_all_counters_parallel_matmul():
             ],
             name="matmul", assumptions="n,m,l >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
-    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", bsize)
     knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
@@ -649,9 +650,9 @@ def test_all_counters_parallel_matmul():
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
-    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     f32mul = op_map[
                         lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
@@ -667,16 +668,17 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    op_map = lp.get_mem_access_map(knl)
+    op_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
-    f32coal = op_map[lp.MemAccess('global', np.float32,
+    f32s1lb = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
-    f32coal += op_map[lp.MemAccess('global', np.float32,
-                      stride=1, direction='load', variable='a')
-                      ].eval_with_dict(params)
+    f32s1la = op_map[lp.MemAccess('global', np.float32,
+                     stride=1, direction='load', variable='a')
+                     ].eval_with_dict(params)
 
-    assert f32coal == n*m+m*l
+    assert f32s1lb == n*m*l/bsize
+    assert f32s1la == n*m*l/bsize
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
@@ -684,7 +686,8 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
+    local_mem_map = lp.get_mem_access_map(knl,
+                        count_redundant_work=True).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load')
                                 ].eval_with_dict(params)
@@ -742,7 +745,7 @@ def test_summations_and_filters():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a']
                                 ).eval_and_sum(params)
@@ -768,7 +771,7 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)