diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2..92ec799f7045cf63dc75d1386d8a51fd7d42954c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution.
 
 By default, :mod:`loopy` inserts local barriers between two instructions when it
 detects that a dependency involving local memory may occur across work items. To
-see this in action, take a look at the section on :ref:`local_temporaries`. 
+see this in action, take a look at the section on :ref:`local_temporaries`.
 
 In contrast, :mod:`loopy` will *not* insert global barriers automatically.
 Global barriers require manual intervention along with some special
@@ -1308,7 +1308,7 @@ tagged, as in the following example::
             assumptions="n>0")
 
 .. [#global-barrier-note] In particular, this is *not* the same as a call to
- ``barrier(CLK_GLOBAL_MEM_FENCE)``. 
+ ``barrier(CLK_GLOBAL_MEM_FENCE)``.
 
 .. }}}
 
@@ -1533,12 +1533,12 @@ information provided. Now we will count the operations:
 
     >>> op_map = lp.get_op_map(knl)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1596,9 +1596,9 @@ together into keys containing only the specified fields:
 
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(lp.stringify_stats_mapping(op_map_dtype))
-    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                           ].eval_with_dict(param_dict)
@@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
-    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 36fbb49f4bb77c959877fb0bd21e1de6fb49c74b..5f0884fd44ed5064f3f195d103b164f2163d1d19 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -598,37 +598,63 @@ def get_simple_strides(bset, key_by="name"):
     assert len(comp_div_set_pieces) == 1
     bset, = comp_div_set_pieces
 
-    lspace = bset.get_local_space()
-    for idiv in range(lspace.dim(dim_type.div)):
-        div = lspace.get_div(idiv)
+    def _get_indices_and_coeffs(obj, dts):
+        result = []
+        for dt in dts:
+            for dim_idx in range(obj.dim(dt)):
+                coeff_val = obj.get_coefficient_val(dt, dim_idx)
+                if not coeff_val.is_zero():
+                    result.append((dt, dim_idx, coeff_val))
 
-        # check for sub-divs
-        supported = True
-        for dim_idx in range(div.dim(dim_type.div)):
-            coeff_val = div.get_coefficient_val(dim_type.div, dim_idx)
-            if not coeff_val.is_zero():
-                # sub-divs not supported
-                supported = False
-                break
+        return result
+
+    for cns in bset.get_constraints():
+        if not cns.is_equality():
+            continue
+        aff = cns.get_aff()
 
-        if not supported:
+        # recognizes constraints of the form
+        #  -i0 + 2*floor((i0)/2) == 0
+
+        if aff.dim(dim_type.div) != 1:
+            continue
+
+        idiv = 0
+        div = aff.get_div(idiv)
+
+        # check for sub-divs
+        if _get_indices_and_coeffs(div, [dim_type.div]):
+            # found one -> not supported
             continue
 
         denom = div.get_denominator_val().to_python()
 
-        inames_and_coeffs = []
-        for dt in [dim_type.param, dim_type.in_]:
-            for dim_idx in range(div.dim(dt)):
-                coeff_val = div.get_coefficient_val(dt, dim_idx) * denom
-                if not coeff_val.is_zero():
-                    inames_and_coeffs.append((dt, dim_idx, coeff_val))
+        # if the coefficient in front of the div is not the same as the denominator
+        if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                div, [dim_type.param, dim_type.in_])
 
         if len(inames_and_coeffs) != 1:
             continue
 
         (dt, dim_idx, coeff), = inames_and_coeffs
 
-        if coeff != 1:
+        if not (coeff * denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                aff, [dim_type.param, dim_type.in_])
+
+        if len(inames_and_coeffs) != 1:
+            continue
+
+        (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs
+        if (not outer_coeff.neg().is_one()
+                or (outer_dt, outer_dim_idx) != (dt, dim_idx)):
             # not supported
             continue
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index dccaca2ec104a4749289f7cd89c491292f618e3d..622f5e49be1e40b4156113d92907fe8b1d9fb859 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1111,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return embedding
 
-    def stringify(self, what=None, with_dependencies=False):
+    def stringify(self, what=None, with_dependencies=False, use_separators=True,
+            show_labels=True):
         all_what = set([
             "name",
             "arguments",
@@ -1150,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         kernel = self
 
-        sep = 75*"-"
+        if use_separators:
+            sep = [75*"-"]
+        else:
+            sep = []
 
         def natorder(key):
             # Return natural ordering for strings, as opposed to dictionary order.
@@ -1167,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             return sorted(seq, key=lambda y: natorder(key(y)))
 
         if "name" in what:
-            lines.append(sep)
+            lines.extend(sep)
             lines.append("KERNEL: " + kernel.name)
 
         if "arguments" in what:
-            lines.append(sep)
-            lines.append("ARGUMENTS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("ARGUMENTS:")
             for arg_name in natsorted(kernel.arg_dict):
                 lines.append(str(kernel.arg_dict[arg_name]))
 
         if "domains" in what:
-            lines.append(sep)
-            lines.append("DOMAINS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DOMAINS:")
             for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()):
                 lines.append(len(parents)*"  " + str(dom))
 
         if "tags" in what:
-            lines.append(sep)
-            lines.append("INAME IMPLEMENTATION TAGS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INAME IMPLEMENTATION TAGS:")
             for iname in natsorted(kernel.all_inames()):
                 line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
-            lines.append(sep)
-            lines.append("TEMPORARIES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("TEMPORARIES:")
             for tv in natsorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
         if "rules" in what and kernel.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SUBSTIUTION RULES:")
             for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
                 lines.append(str(kernel.substitutions[rule_name]))
 
         if "instructions" in what:
-            lines.append(sep)
-            lines.append("INSTRUCTIONS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INSTRUCTIONS:")
             loop_list_width = 35
 
             # {{{ topological sort
@@ -1319,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on)))
 
         if "Dependencies" in what and dep_lines:
-            lines.append(sep)
-            lines.append("DEPENDENCIES: "
-                    "(use loopy.show_dependency_graph to visualize)")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DEPENDENCIES: "
+                        "(use loopy.show_dependency_graph to visualize)")
             lines.extend(dep_lines)
 
         if "schedule" in what and kernel.schedule is not None:
-            lines.append(sep)
-            lines.append("SCHEDULE:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SCHEDULE:")
             from loopy.schedule import dump_schedule
             lines.append(dump_schedule(kernel, kernel.schedule))
 
-        lines.append(sep)
+        lines.extend(sep)
 
         return "\n".join(lines)
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0d22dbb88ed99c7c92480d1d39b924cc2198cc3f..08268ca9f27623a6d17a195d3c04acb55e5ec68a 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase):
 
                 z[i] = z[i+1-1] + a {atomic}
 
-        :mod:`loopy` may to evaluate the right-hand side *multiple times*
+        :mod:`loopy` may choose to evaluate the right-hand side *multiple times*
         as part of a single assignment. It is up to the user to ensure that
         this retains correct semantics.
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index cb15eb55498bcafe4ae537747e387e47ddbd8254..9b15ec471fb681698b85c1dd2f92376fbc731f00 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,6 +38,7 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
+.. autoclass:: GuardedPwQPolynomial
 .. autoclass:: ToCountMap
 .. autoclass:: Op
 .. autoclass:: MemAccess
@@ -52,6 +53,66 @@ __doc__ = """
 """
 
 
+# {{{ GuardedPwQPolynomial
+
+class GuardedPwQPolynomial(object):
+    def __init__(self, pwqpolynomial, valid_domain):
+        self.pwqpolynomial = pwqpolynomial
+        self.valid_domain = valid_domain
+
+    def __add__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial + other,
+                    self.valid_domain)
+
+    __radd__ = __add__
+
+    def __mul__(self, other):
+        if isinstance(other, GuardedPwQPolynomial):
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other.pwqpolynomial,
+                    self.valid_domain & other.valid_domain)
+        else:
+            return GuardedPwQPolynomial(
+                    self.pwqpolynomial * other,
+                    self.valid_domain)
+
+    __rmul__ = __mul__
+
+    def eval_with_dict(self, value_dict):
+        space = self.pwqpolynomial.space
+        pt = isl.Point.zero(space.params())
+
+        for i in range(space.dim(dim_type.param)):
+            par_name = space.get_dim_name(dim_type.param, i)
+            pt = pt.set_coordinate_val(
+                dim_type.param, i, value_dict[par_name])
+
+        if not (isl.Set.from_point(pt) <= self.valid_domain):
+            raise ValueError("evaluation point outside of domain of "
+                    "definition of piecewise quasipolynomial")
+
+        return self.pwqpolynomial.eval(pt).to_python()
+
+    @staticmethod
+    def zero():
+        p = isl.PwQPolynomial('{ 0 }')
+        return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space))
+
+    def __str__(self):
+        return str(self.pwqpolynomial)
+
+    def __repr__(self):
+        return repr(self.pwqpolynomial)
+
+# }}}
+
+
 # {{{ ToCountMap
 
 class ToCountMap(object):
@@ -66,7 +127,7 @@ class ToCountMap(object):
 
     """
 
-    def __init__(self, init_dict=None, val_type=isl.PwQPolynomial):
+    def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
@@ -87,7 +148,7 @@ class ToCountMap(object):
         return self
 
     def __mul__(self, other):
-        if isinstance(other, isl.PwQPolynomial):
+        if isinstance(other, GuardedPwQPolynomial):
             return ToCountMap(dict(
                 (index, self.count_map[index]*other)
                 for index in self.keys()))
@@ -103,8 +164,8 @@ class ToCountMap(object):
             return self.count_map[index]
         except KeyError:
             #TODO what is the best way to handle this?
-            if self.val_type is isl.PwQPolynomial:
-                return isl.PwQPolynomial('{ 0 }')
+            if self.val_type is GuardedPwQPolynomial:
+                return GuardedPwQPolynomial.zero()
             else:
                 return 0
 
@@ -132,12 +193,18 @@ class ToCountMap(object):
     def copy(self):
         return ToCountMap(dict(self.count_map), self.val_type)
 
+    def with_set_attributes(self, **kwargs):
+        return ToCountMap(dict(
+            (key.copy(**kwargs), val)
+            for key, val in six.iteritems(self.count_map)),
+            self.val_type)
+
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
 
-        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
-                             the :class:`ToCountMap`, each given a list of
-                             allowable values for that key field.
+        :arg kwargs: Keyword arguments matching fields in the keys of
+                 the :class:`ToCountMap`, each given a list of
+                 allowable values for that key field.
 
         :return: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` that match the field values
@@ -183,10 +250,10 @@ class ToCountMap(object):
     def filter_by_func(self, func):
         """Keep items that pass a test.
 
-        :parameter func: A function that takes a map key a parameter and
-                         returns a :class:`bool`.
+        :arg func: A function that takes a map key a parameter and
+             returns a :class:`bool`.
 
-        :return: A :class:`ToCountMap` containing the subset of the items in
+        :arg: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` for which func(key) is true.
 
         Example usage::
@@ -218,7 +285,7 @@ class ToCountMap(object):
         """Group map items together, distinguishing by only the key fields
            passed in args.
 
-        :parameter \*args: Zero or more :class:`str` fields of map keys.
+        :arg args: Zero or more :class:`str` fields of map keys.
 
         :return: A :class:`ToCountMap` containing the same total counts
                  grouped together by new keys that only contain the fields
@@ -336,8 +403,8 @@ class ToCountMap(object):
 
         """
 
-        if self.val_type is isl.PwQPolynomial:
-            total = isl.PwQPolynomial('{ 0 }')
+        if self.val_type is GuardedPwQPolynomial:
+            total = GuardedPwQPolynomial.zero()
         else:
             total = 0
 
@@ -385,8 +452,10 @@ def stringify_stats_mapping(m):
     return result
 
 
+# {{{ Op descriptor
+
 class Op(object):
-    """An arithmetic operation.
+    """A descriptor for a type of arithmetic operation.
 
     .. attribute:: dtype
 
@@ -400,6 +469,8 @@ class Op(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, dtype=None, name=None):
         self.name = name
         if dtype is None:
@@ -419,19 +490,15 @@ class Op(object):
         return hash(str(self))
 
     def __repr__(self):
-        if self.dtype is None:
-            dtype = 'None'
-        else:
-            dtype = str(self.dtype)
-        if self.name is None:
-            name = 'None'
-        else:
-            name = self.name
-        return "Op("+dtype+", "+name+")"
+        return "Op(%s, %s)" % (self.dtype, self.name)
 
+# }}}
+
+
+# {{{ MemAccess descriptor
 
 class MemAccess(object):
-    """A memory access.
+    """A descriptor for a type of memory access.
 
     .. attribute:: mtype
 
@@ -460,6 +527,8 @@ class MemAccess(object):
 
     """
 
+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
                  variable=None):
         self.mtype = mtype
@@ -482,6 +551,16 @@ class MemAccess(object):
             raise NotImplementedError("MemAccess: variable must be None when "
                                       "mtype is 'local'")
 
+    def copy(self, mtype=None, dtype=None, stride=None, direction=None,
+            variable=None):
+        return MemAccess(
+                mtype=mtype if mtype is not None else self.mtype,
+                dtype=dtype if dtype is not None else self.dtype,
+                stride=stride if stride is not None else self.stride,
+                direction=direction if direction is not None else self.direction,
+                variable=variable if variable is not None else self.variable,
+                )
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 (self.mtype is None or other.mtype is None or
@@ -522,11 +601,70 @@ class MemAccess(object):
         return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \
                + direction + ", " + variable + ")"
 
+# }}}
 
-# {{{ ExpressionOpCounter
 
-class ExpressionOpCounter(CombineMapper):
+# {{{ counter base
 
+class CounterBase(CombineMapper):
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.type_inference import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    def map_if(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warn_with_kernel(self.knl, "summing_if_branches",
+                         "%s counting sum of if-expression branches."
+                         % type(self).__name__)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
+
+    def map_common_subexpression(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+    map_substitution = map_common_subexpression
+    map_derivative = map_common_subexpression
+    map_slice = map_common_subexpression
+
+    # preprocessing should have removed these
+    def map_reduction(self, expr):
+        raise RuntimeError("%s encountered %s--not supposed to happen"
+                % (type(self).__name__, type(expr).__name__))
+
+# }}}
+
+
+# {{{ ExpressionOpCounter
+
+class ExpressionOpCounter(CounterBase):
     def __init__(self, knl):
         self.knl = knl
         from loopy.type_inference import TypeInferenceMapper
@@ -641,106 +779,59 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
-# {{{ LocalSubscriptCounter
+class MemAccessCounter(CounterBase):
+    pass
 
-class LocalSubscriptCounter(CombineMapper):
 
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
+# {{{ LocalMemAccessCounter
 
-    def combine(self, values):
-        return sum(values)
-
-    def map_constant(self, expr):
-        return ToCountMap()
-
-    map_tagged_variable = map_constant
-    map_variable = map_constant
-
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
-
-    def map_subscript(self, expr):
+class LocalMemAccessCounter(MemAccessCounter):
+    def count_var_access(self, dtype, name, subscript):
         sub_map = ToCountMap()
-        name = expr.aggregate.name  # name of array
         if name in self.knl.temporary_variables:
             array = self.knl.temporary_variables[name]
             if array.is_local:
-                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
-        return sub_map + self.rec(expr.index)
-
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
+                sub_map[MemAccess(mtype='local', dtype=dtype)] = 1
+        return sub_map
 
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs",
-                         "LocalSubscriptCounter counting LMEM accesses as sum "
-                         "of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
+    def map_variable(self, expr):
+        return self.count_var_access(
+                    self.type_inf(expr), expr.name, None)
 
-    def map_derivative(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
+    map_tagged_variable = map_variable
 
-    def map_slice(self, expr):
-        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
+    def map_subscript(self, expr):
+        return (
+                self.count_var_access(
+                    self.type_inf(expr), expr.aggregate.name, expr.index)
+                + self.rec(expr.index))
 
 # }}}
 
 
-# {{{ GlobalSubscriptCounter
-
-class GlobalSubscriptCounter(CombineMapper):
+# {{{ GlobalMemAccessCounter
 
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-
-    def combine(self, values):
-        return sum(values)
+class GlobalMemAccessCounter(MemAccessCounter):
+    def map_variable(self, expr):
+        name = expr.name
 
-    def map_constant(self, expr):
-        return ToCountMap()
+        if name in self.knl.arg_dict:
+            array = self.knl.arg_dict[name]
+        else:
+            # this is a temporary variable
+            return ToCountMap()
 
-    map_tagged_variable = map_constant
-    map_variable = map_constant
+        if not isinstance(array, lp.GlobalArg):
+            # this array is not in global memory
+            return ToCountMap()
 
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
+        return ToCountMap({MemAccess(mtype='global',
+                                     dtype=self.type_inf(expr), stride=0,
+                                     variable=name): 1}
+                          ) + self.rec(expr.index)
 
     def map_subscript(self, expr):
-        name = expr.aggregate.name  # name of array
+        name = expr.aggregate.name
 
         if name in self.knl.arg_dict:
             array = self.knl.arg_dict[name]
@@ -827,47 +918,6 @@ class GlobalSubscriptCounter(CombineMapper):
                                      stride=total_stride, variable=name): 1}
                           ) + self.rec(expr.index)
 
-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
-    def map_derivative(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
-
-    def map_slice(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
-
 # }}}
 
 
@@ -940,9 +990,13 @@ class AccessFootprintGatherer(CombineMapper):
 
 # {{{ count
 
-def count(kernel, set):
+def add_assumptions_guard(kernel, pwqpolynomial):
+    return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions)
+
+
+def count(kernel, set, space=None):
     try:
-        return set.card()
+        return add_assumptions_guard(kernel, set.card())
     except AttributeError:
         pass
 
@@ -969,7 +1023,11 @@ def count(kernel, set):
             if stride is None:
                 stride = 1
 
-            length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride)
+            length_pwaff = dmax - dmin + stride
+            if space is not None:
+                length_pwaff = length_pwaff.align_params(space)
+
+            length = isl.PwQPolynomial.from_pw_aff(length_pwaff)
             length = length.scale_down_val(stride)
 
             if bset_count is None:
@@ -1029,46 +1087,102 @@ def count(kernel, set):
                         "number of integer points in your loop "
                         "domain.")
 
-    return count
+    return add_assumptions_guard(kernel, count)
 
-# }}}
 
+def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
+    # FIXME: Multi-kernel support
+    gsize, lsize = knl.get_grid_size_upper_bounds()
 
-# {{{ get_op_poly
+    g_used = set()
+    l_used = set()
 
-def get_op_poly(knl, numpy_types=True):
+    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+    for iname in knl.insn_inames(insn):
+        tag = knl.iname_to_tag.get(iname)
 
-    """Count the number of operations in a loopy kernel.
+        if isinstance(tag, LocalIndexTag):
+            l_used.add(tag.axis)
+        elif isinstance(tag, GroupIndexTag):
+            g_used.add(tag.axis)
 
-    get_op_poly is deprecated. Use get_op_map instead.
+    def mult_grid_factor(used_axes, size):
+        result = 1
+        for iaxis, size in enumerate(size):
+            if iaxis not in used_axes:
+                if not isinstance(size, int):
+                    if space is not None:
+                        size = size.align_params(space)
+
+                    size = isl.PwQPolynomial.from_pw_aff(size)
+
+                result = result * size
+
+        return result
+
+    if disregard_local_axes:
+        result = mult_grid_factor(g_used, gsize)
+    else:
+        result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize)
+
+    return add_assumptions_guard(knl, result)
 
-    """
-    warn_with_kernel(knl, "depricated_get_op_poly",
-                     "get_op_poly is deprecated. Use get_op_map instead.")
-    return get_op_map(knl, numpy_types)
+
+def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False):
+    insn_inames = knl.insn_inames(insn)
+
+    if disregard_local_axes:
+        from loopy.kernel.data import LocalIndexTag
+        insn_inames = [iname for iname in insn_inames if not
+                       isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+
+    inames_domain = knl.get_inames_domain(insn_inames)
+    domain = (inames_domain.project_out_except(
+                            insn_inames, [dim_type.set]))
+
+    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT,
+            set=[], params=knl.outer_params())
+
+    c = count(knl, domain, space=space)
+
+    if count_redundant_work:
+        unused_fac = get_unused_hw_axes_factor(knl, insn,
+                        disregard_local_axes=disregard_local_axes,
+                        space=space)
+        return c * unused_fac
+    else:
+        return c
 
 # }}}
 
 
-def get_op_map(knl, numpy_types=True):
+# {{{ get_op_map
+
+def get_op_map(knl, numpy_types=True, count_redundant_work=False):
 
     """Count the number of operations in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg numpy_types: A :class:`bool` specifying whether the types
+         in the returned mapping should be numpy types
+         instead of :class:`loopy.LoopyType`.
+
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`Op` specifies the characteristics of the arithmetic
-               operation.
+        - The :class:`Op` specifies the characteristics of the arithmetic
+          operation.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of operations of
-               the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` parameter *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of operations of
+          the kind specified in the key (in terms of the
+          :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
@@ -1090,14 +1204,10 @@ def get_op_map(knl, numpy_types=True):
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
-        # how many times is this instruction executed?
-        # check domain size:
-        insn_inames = knl.insn_inames(insn)
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_map = op_map + ops*count(knl, domain)
+        op_map = op_map + ops*count_insn_runs(
+                knl, insn,
+                count_redundant_work=count_redundant_work)
 
     if numpy_types:
         op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
@@ -1106,73 +1216,36 @@ def get_op_map(knl, numpy_types=True):
 
     return op_map
 
-
-#TODO test deprecated functions?
-def get_lmem_access_poly(knl):
-    """Count the number of local memory accesses in a loopy kernel.
-
-    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['local'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
-                     "get_lmem_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['local'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['local'])
-
-
-def get_DRAM_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
-
-# {{{ get_gmem_access_poly
-
-def get_gmem_access_poly(knl):
-    """Count the number of global memory accesses in a loopy kernel.
-
-    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
-    result with the mtype=['global'] option.
-
-    """
-    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
-                     "get_DRAM_access_poly is deprecated. Use "
-                     "get_mem_access_map and filter the result with the "
-                     "mtype=['global'] option.")
-    return get_mem_access_map(knl).filter_by(mtype=['global'])
-
 # }}}
 
 
-def get_mem_access_map(knl, numpy_types=True):
+# {{{ get_mem_access_map
+
+def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False):
     """Count the number of memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
-                    counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
+        counted.
 
-    :parameter numpy_types: A :class:`bool` specifying whether the types
-                            in the returned mapping should be numpy types
-                            instead of :class:`loopy.LoopyType`.
+    :arg numpy_types: A :class:`bool` specifying whether the types
+        in the returned mapping should be numpy types
+        instead of :class:`loopy.LoopyType`.
+
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
 
     :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
-             :class:`islpy.PwQPolynomial` **}**.
+        :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`MemAccess` specifies the characteristics of the
-               memory access.
+        - The :class:`MemAccess` specifies the characteristics of the
+          memory access.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of memory
-               accesses with the characteristics specified in the key (in terms
-               of the :class:`loopy.LoopKernel` *inames*).
+        - The :class:`islpy.PwQPolynomial` holds the number of memory
+          accesses with the characteristics specified in the key (in terms
+          of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
@@ -1217,102 +1290,74 @@ def get_mem_access_map(knl, numpy_types=True):
     cache_holder = CacheHolder()
 
     @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_inames, uniform=False):
-        if uniform:
-            from loopy.kernel.data import LocalIndexTag
-            insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                insn_inames, [dim_type.set]))
-        return count(knl, domain)
+    def get_insn_count(knl, insn_id, uniform=False):
+        insn = knl.id_to_insn[insn_id]
+        return count_insn_runs(
+                knl, insn, disregard_local_axes=uniform,
+                count_redundant_work=count_redundant_work)
 
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_map = ToCountMap()
-    subs_counter_g = GlobalSubscriptCounter(knl)
-    subs_counter_l = LocalSubscriptCounter(knl)
+    access_map = ToCountMap()
+    access_counter_g = GlobalMemAccessCounter(knl)
+    access_counter_l = LocalMemAccessCounter(knl)
 
     for insn in knl.instructions:
-        # count subscripts
-        subs_expr = subs_counter_g(insn.expression) \
-                    + subs_counter_l(insn.expression)
-
-        # distinguish loads and stores
-        for key in subs_expr.count_map:
-            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                stride=key.stride, direction='load',
-                                variable=key.variable)
-                      ] = subs_expr.pop(key)
-
-        subs_assignee_g = subs_counter_g(insn.assignee)
-        for key in subs_assignee_g.count_map:
-            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                      stride=key.stride,
-                                      direction='store',
-                                      variable=key.variable)
-                            ] = subs_assignee_g.pop(key)
-        # for now, don't count writes to local mem
-
-        insn_inames = knl.insn_inames(insn)
+        access_expr = (
+                access_counter_g(insn.expression)
+                + access_counter_l(insn.expression)
+                ).with_set_attributes(direction="load")
+
+        access_assignee_g = access_counter_g(insn.assignee).with_set_attributes(
+                direction="store")
+
+        # FIXME: (!!!!) for now, don't count writes to local mem
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.count_map:
-            map = ToCountMap({key: subs_expr[key]})
-            if (key.mtype == 'global' and
+        for key, val in six.iteritems(access_expr.count_map):
+            is_uniform = (key.mtype == 'global' and
                     isinstance(key.stride, int) and
-                    key.stride == 0):
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
-                #currently not counting stride of local mem access
-
-        for key in subs_assignee_g.count_map:
-            map = ToCountMap({key: subs_assignee_g[key]})
-            if isinstance(key.stride, int) and key.stride == 0:
-                subs_map = subs_map \
-                            + map*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
+            #currently not counting stride of local mem access
+
+        for key, val in six.iteritems(access_assignee_g.count_map):
+            is_uniform = (key.mtype == 'global' and
+                    isinstance(key.stride, int) and
+                    key.stride == 0)
+            access_map = (
+                    access_map
+                    + ToCountMap({key: val})
+                    * get_insn_count(knl, insn.id, is_uniform))
             # for now, don't count writes to local mem
 
     if numpy_types:
-        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+        # FIXME: Don't modify in-place
+        access_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
                                              dtype=mem_access.dtype.numpy_dtype,
                                              stride=mem_access.stride,
                                              direction=mem_access.direction,
                                              variable=mem_access.variable),
                                   count)
-                      for mem_access, count in six.iteritems(subs_map.count_map))
-
-    return subs_map
-
+                      for mem_access, count in six.iteritems(access_map.count_map))
 
-# {{{ get_synchronization_poly
-
-def get_synchronization_poly(knl):
-    """Count the number of synchronization events each thread encounters in a
-    loopy kernel.
-
-    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
-
-    """
-    warn_with_kernel(knl, "depricated_get_synchronization_poly",
-                     "get_synchronization_poly is deprecated. Use "
-                     "get_synchronization_map instead.")
-    return get_synchronization_map(knl)
+    return access_map
 
 # }}}
 
 
+# {{{ get_synchronization_map
+
 def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
+    :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :return: A dictionary mapping each type of synchronization event to a
             :class:`islpy.PwQPolynomial` holding the number of events per
@@ -1379,9 +1424,10 @@ def get_synchronization_map(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    #return result.count_map #TODO is this change okay?
     return result
 
+# }}}
+
 
 # {{{ gather_access_footprints
 
@@ -1477,4 +1523,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
 
 # }}}
 
+
+# {{{ compat goop
+
+def get_lmem_access_poly(knl):
+    """Count the number of local memory accesses in a loopy kernel.
+
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
+
+
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_gmem_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
+
+
+def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
+
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
+    return get_synchronization_map(knl)
+
+
+def get_op_poly(knl, numpy_types=True):
+    """Count the number of operations in a loopy kernel.
+
+    get_op_poly is deprecated. Use get_op_map instead.
+
+    """
+    warn_with_kernel(knl, "deprecated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5e363f13594ee8e4cf170faa232b0783cca9d018..a72b62af90050008f837e144f1f28d4a4de1c730 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -49,7 +49,7 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -74,7 +74,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -100,7 +100,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -130,7 +130,7 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -166,7 +166,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -205,7 +205,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
+    op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -229,7 +229,7 @@ def test_mem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -269,7 +269,7 @@ def test_mem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -307,7 +307,7 @@ def test_mem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -343,7 +343,7 @@ def test_mem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -437,11 +437,11 @@ def test_mem_access_counter_mixed():
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                 x=np.float32))
-    threads = 16
-    knl = lp.split_iname(knl, "j", threads)
+    bsize = 16
+    knl = lp.split_iname(knl, "j", bsize)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -463,8 +463,8 @@ def test_mem_access_counter_mixed():
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                             ].eval_with_dict(params)
-    assert f64uniform == 2*n*m
-    assert f32uniform == n*m*l/threads
+    assert f64uniform == 2*n*m*l/bsize
+    assert f32uniform == n*m*l/bsize
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
@@ -474,7 +474,7 @@ def test_mem_access_counter_mixed():
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                            ].eval_with_dict(params)
-    assert f64uniform == n*m
+    assert f64uniform == n*m*l/bsize
     assert f32nonconsec == n*m*l
 
 
@@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_map(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
     l = 128
@@ -545,7 +545,7 @@ def test_mem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
     l = 128
@@ -563,7 +563,7 @@ def test_mem_access_counter_consec():
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                          ].eval_with_dict(params)
-    assert f64consec == 2*n*m
+    assert f64consec == 2*n*m*l
     assert f32consec == 3*n*m*l
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
@@ -572,7 +572,7 @@ def test_mem_access_counter_consec():
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                         ].eval_with_dict(params)
-    assert f64consec == n*m
+    assert f64consec == n*m*l
     assert f32consec == n*m*l
 
 
@@ -628,6 +628,7 @@ def test_barrier_counter_barriers():
 
 def test_all_counters_parallel_matmul():
 
+    bsize = 16
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
             [
@@ -635,9 +636,9 @@ def test_all_counters_parallel_matmul():
             ],
             name="matmul", assumptions="n,m,l >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
-    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", bsize)
     knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
     knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
@@ -649,9 +650,9 @@ def test_all_counters_parallel_matmul():
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
-    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     f32mul = op_map[
                         lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
@@ -667,16 +668,17 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    op_map = lp.get_mem_access_map(knl)
+    op_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
-    f32coal = op_map[lp.MemAccess('global', np.float32,
+    f32s1lb = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
-    f32coal += op_map[lp.MemAccess('global', np.float32,
-                      stride=1, direction='load', variable='a')
-                      ].eval_with_dict(params)
+    f32s1la = op_map[lp.MemAccess('global', np.float32,
+                     stride=1, direction='load', variable='a')
+                     ].eval_with_dict(params)
 
-    assert f32coal == n*m+m*l
+    assert f32s1lb == n*m*l/bsize
+    assert f32s1la == n*m*l/bsize
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
@@ -684,7 +686,8 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
+    local_mem_map = lp.get_mem_access_map(knl,
+                        count_redundant_work=True).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load')
                                 ].eval_with_dict(params)
@@ -742,7 +745,7 @@ def test_summations_and_filters():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    mem_map = lp.get_mem_access_map(knl)
+    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a']
                                 ).eval_and_sum(params)
@@ -768,7 +771,7 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-    op_map = lp.get_op_map(knl)
+    op_map = lp.get_op_map(knl, count_redundant_work=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)