From 8f5bb4b21061e54b73013786769e51aaefaa107a Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Mon, 26 Oct 2015 02:23:17 -0500
Subject: [PATCH] updated tutorial, cleaning up code for merge with upstream

---
 doc/reference.rst       |   2 +
 doc/tutorial.rst        |  40 ++--
 loopy/__init__.py       |   7 +-
 loopy/statistics.py     | 404 ++++++++++------------------------------
 test/test_statistics.py | 244 ++++++------------------
 5 files changed, 186 insertions(+), 511 deletions(-)
diff --git a/doc/reference.rst b/doc/reference.rst
index 59ab3c986..9dad1182c 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -548,4 +548,6 @@ Obtaining Kernel Statistics
 
 .. autofunction:: get_barrier_poly
 
+.. autofunction:: estimate_regs_per_thread
+
 .. vim: tw=75:spell
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 24cb03463..53938cba6 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1229,17 +1229,21 @@ information provided. Now we will count the operations:
     >>> from loopy.statistics import get_op_poly
     >>> op_map = get_op_poly(knl)
 
-:func:`loopy.get_op_poly` returns a mapping of **{** :class:`numpy.dtype` **:**
-:class:`islpy.PwQPolynomial` **}**. The :class:`islpy.PwQPolynomial` holds the
-number of operations for the :class:`numpy.dtype` specified in the key (in terms of
-the :class:`loopy.LoopKernel` *inames*). We'll print this map now:
+:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** 
+:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The 
+:class:`islpy.PwQPolynomial` holds the number of operations for the type specified 
+in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this 
+map now:
 
 .. doctest::
 
     >>> print(lp.stringify_stats_mapping(op_map))
-    float32 : [n, m, l] -> { 3 * n * m * l : n >= 1 and m >= 1 and l >= 1 }
-    float64 : [n, m, l] -> { 2 * n * m : n >= 1 and m >= 1 and l >= 1 }
-    int32 : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
+    (dtype('int32'), 'add') : [n, m, l] -> { n * m : n >= 1 and m >= 1 and l >= 1 }
     <BLANKLINE>
 
 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
@@ -1247,14 +1251,20 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 .. doctest::
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> i32ops = op_map[np.dtype(np.int32)].eval_with_dict(param_dict)
-    >>> f32ops = op_map[np.dtype(np.float32)].eval_with_dict(param_dict)
-    >>> f64ops = op_map[np.dtype(np.float64)].eval_with_dict(param_dict)
-    >>> print("integer ops: %i\nfloat32 ops: %i\nfloat64 ops: %i" %
-    ...     (i32ops, f32ops, f64ops))
-    integer ops: 65536
-    float32 ops: 1572864
-    float64 ops: 131072
+    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
+    >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
+    ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
+    524288
+    524288
+    524288
+    65536
+    65536
+    65536
 
 Counting array accesses
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/loopy/__init__.py b/loopy/__init__.py
index ff3a004d9..391504b8c 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -64,8 +64,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, get_gmem_access_poly,
-        get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
-        sum_mem_access_to_bytes)
+        get_DRAM_access_poly, get_barrier_poly, estimate_regs_per_thread,
+        stringify_stats_mapping, sum_mem_access_to_bytes)
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -106,7 +106,8 @@ __all__ = [
         "generate_code", "generate_body",
 
         "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly",
-        "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
+        "get_barrier_poly", "estimate_regs_per_thread", "stringify_stats_mapping",
+        "sum_mem_access_to_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index d81050c65..9ddd8bced 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,15 +38,12 @@ class ToCountMap:
     def __init__(self, init_dict=None):
         if init_dict is None:
             init_dict = {}
-
         self.dict = init_dict
 
     def __add__(self, other):
         result = self.dict.copy()
-
         for k, v in six.iteritems(other.dict):
             result[k] = self.dict.get(k, 0) + v
-
         return ToCountMap(result)
 
     def __radd__(self, other):
@@ -55,7 +52,6 @@ class ToCountMap:
                                 "to {} {}. ToCountMap may only be added to "
                                 "0 and other ToCountMap objects."
                                 .format(type(other), other))
-
         return self
 
     def __mul__(self, other):
@@ -87,7 +83,7 @@ def stringify_stats_mapping(m):
     return result
 
 
-class ExpressionOpCounter(CombineMapper):
+class ExpressionOpCounter_Old(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
@@ -109,7 +105,8 @@ class ExpressionOpCounter(CombineMapper):
     #def map_function_symbol(self, expr):
     #    return 0,0
 
-    map_call = map_constant
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
 
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
 
@@ -131,11 +128,6 @@ class ExpressionOpCounter(CombineMapper):
         if expr.children:
             return sum(ToCountMap({self.type_inf(expr): 1}) + self.rec(child)
                        for child in expr.children
-                       # Do not count '(-1)* ' (as produced by
-                       # subtraction in pymbolic): Assume this
-                       # gets implemented as a sign flip or
-                       # as subtraction. (Confirmed to be true on
-                       # at least Nvidia 352.30.)
                        if not is_zero(child + 1)) + \
                        ToCountMap({self.type_inf(expr): -1})
         else:
@@ -199,9 +191,7 @@ class ExpressionOpCounter(CombineMapper):
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
     map_min = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;  # TODO test
-
-    map_max = map_min  # implemented in CombineMapper, maps to map_sum;  # TODO test
+    map_max = map_min
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("ExpressionOpCounter encountered "
@@ -220,7 +210,8 @@ class ExpressionOpCounter(CombineMapper):
         raise NotImplementedError("ExpressionOpCounter encountered slice, "
                                   "map_slice not implemented.")
 
-class ExpressionOpCounter2(CombineMapper):
+
+class ExpressionOpCounter(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
@@ -242,7 +233,8 @@ class ExpressionOpCounter2(CombineMapper):
     #def map_function_symbol(self, expr):
     #    return 0,0
 
-    map_call = map_constant
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
 
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
 
@@ -252,28 +244,19 @@ class ExpressionOpCounter2(CombineMapper):
     # def map_lookup(self, expr):  # implemented in CombineMapper
 
     def map_sum(self, expr):
-        if expr.children:
-            return ToCountMap(
-                        {(self.type_inf(expr), 'add'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap() #TODO when does this happen?
+        assert expr.children
+        return ToCountMap(
+                    {(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
-        if expr.children:
-            # Do not count '(-1)* ' (as produced by
-            # subtraction in pymbolic): Assume this
-            # gets implemented as a sign flip or
-            # as subtraction. (Confirmed to be true on
-            # at least Nvidia 352.30.)
-            return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
-                       + self.rec(child)
-                       for child in expr.children
-                       if not is_zero(child + 1)) + \
-                       ToCountMap({(self.type_inf(expr), 'mul'): -1})
-        else:
-            return ToCountMap() #TODO when does this happen?
+        assert expr.children
+        return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
+                   + self.rec(child)
+                   for child in expr.children
+                   if not is_zero(child + 1)) + \
+                   ToCountMap({(self.type_inf(expr), 'mul'): -1})
 
     def map_quotient(self, expr, *args):
         return ToCountMap({(self.type_inf(expr), 'div'): 1}) \
@@ -281,37 +264,33 @@ class ExpressionOpCounter2(CombineMapper):
                                 + self.rec(expr.denominator)
 
     map_floor_div = map_quotient
-    map_remainder = map_quotient  # implemented in CombineMapper
+    map_remainder = map_quotient
 
     def map_power(self, expr):
         return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
-    def map_left_shift(self, expr):  # implemented in CombineMapper
+    def map_left_shift(self, expr):
         return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
-    def map_bitwise_not(self, expr):  # implemented in CombineMapper
+    def map_bitwise_not(self, expr):
         return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
-        # implemented in CombineMapper, maps to map_sum;
         return ToCountMap(
                         {(self.type_inf(expr), 'bw'): len(expr.children)-1}
                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;
-
     map_bitwise_and = map_bitwise_or
-    # implemented in CombineMapper, maps to map_sum;
 
-    def map_comparison(self, expr):  # implemented in CombineMapper
+    def map_comparison(self, expr):
         return self.rec(expr.left)+self.rec(expr.right)
 
     def map_logical_not(self, expr):
@@ -322,24 +301,22 @@ class ExpressionOpCounter2(CombineMapper):
 
     map_logical_and = map_logical_or
 
-    def map_if(self, expr):  # implemented in CombineMapper, recurses
-        warnings.warn("ExpressionOpCounter counting DRAM accesses as "
+    def map_if(self, expr):
+        warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
 
-    def map_if_positive(self, expr):  # implemented in FlopCounter
-        warnings.warn("ExpressionOpCounter counting DRAM accesses as "
+    def map_if_positive(self, expr):
+        warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
     def map_min(self, expr):
-        # implemented in CombineMapper, maps to map_sum;
         return ToCountMap(
                         {(self.type_inf(expr), 'maxmin'): len(expr.children)-1}
                         ) + sum(self.rec(child) for child in expr.children)
-    # implemented in CombineMapper, maps to map_sum;  # TODO test
 
-    map_max = map_min  # implemented in CombineMapper, maps to map_sum;  # TODO test
+    map_max = map_min
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("ExpressionOpCounter encountered "
@@ -374,7 +351,9 @@ class GlobalSubscriptCounter(CombineMapper):
 
     map_tagged_variable = map_constant
     map_variable = map_constant
-    map_call = map_constant
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
 
     def map_subscript(self, expr):
         name = expr.aggregate.name  # name of array
@@ -497,12 +476,12 @@ class GlobalSubscriptCounter(CombineMapper):
     map_logical_and = map_logical_or
 
     def map_if(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting DRAM accesses as "
+        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
@@ -528,6 +507,7 @@ class GlobalSubscriptCounter(CombineMapper):
         raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
                                   "map_slice not implemented.")
 
+
 class RegisterUsageEstimator(CombineMapper):
 
     def __init__(self, knl):
@@ -548,7 +528,7 @@ class RegisterUsageEstimator(CombineMapper):
 
     def map_constant(self, expr):
         return 0
-    #'''
+
     def map_variable(self, expr):
         name = expr.name
         if expr in self.vars_found:
@@ -557,15 +537,15 @@ class RegisterUsageEstimator(CombineMapper):
         self.vars_found.append(expr)
         if name in self.knl.temporary_variables:
             if self.knl.temporary_variables[name].is_local:
-                print("found temp var with local tag, not counting: ", expr) #TODO remove after debug
                 return 0
             else:
                 return 1
         elif name in self.knl.all_inames():
-            from loopy.kernel.data import AxisTag
-            if (self.knl.iname_to_tag.get(name) is None or
-                    not isinstance(self.knl.iname_to_tag.get(name), AxisTag)):
-                #TODO use more specific positive instead of negative
+            from loopy.kernel.data import AxisTag, VectorizeTag, UnrollTag
+            tag = self.knl.iname_to_tag.get(name)
+            if (tag is None or not(isinstance(tag, AxisTag)
+                                   or isinstance(tag, VectorizeTag)
+                                   or isinstance(tag, UnrollTag))):
                 return 1
             else:
                 return 0
@@ -574,8 +554,8 @@ class RegisterUsageEstimator(CombineMapper):
 
     map_tagged_variable = map_variable
 
-    #map_variable = map_tagged_variable
-    map_call = map_constant  # TODO what is this?
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
 
     def map_subscript(self, expr):
         name = expr.aggregate.name  # name of array
@@ -597,10 +577,10 @@ class RegisterUsageEstimator(CombineMapper):
         # expr is not a temporary variable
 
         if not isinstance(array, lp.GlobalArg):
-            print("debug... When does this happen? ", expr, array)
-            1/0
-            # this array is not in global memory
-            return 1 + self.rec(expr.index)  # TODO
+            # This array is not in global memory, and is not a temporary variable
+            # TODO how should we count arrays in const/texture mem? ImageArg?
+            # Ignore for now
+            return self.rec(expr.index)
 
         # this is a global mem access
         if (expr.index, expr.aggregate) in self.subs_found:
@@ -610,10 +590,8 @@ class RegisterUsageEstimator(CombineMapper):
             return 1 + self.rec(expr.index)
 
     def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return 0  # TODO when does this happen?
+        assert expr.children
+        return sum(self.rec(child) for child in expr.children)
 
     map_product = map_sum
 
@@ -679,6 +657,7 @@ class RegisterUsageEstimator(CombineMapper):
         raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
                                   "map_slice not implemented.")
 
+
 def count(kernel, bset):
     try:
         return bset.card()
@@ -709,7 +688,7 @@ def count(kernel, bset):
     return result
 
 
-def get_op_poly(knl):
+def get_op_poly_old(knl):
 
     """Count the number of operations in a loopy kernel.
 
@@ -752,13 +731,42 @@ def get_op_poly(knl):
     return op_poly.dict
 
 
-def get_op_poly2(knl):
+def get_op_poly(knl):
+    """Count the number of operations in a loopy kernel.
+
+    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
+
+    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **)**
+             **:** :class:`islpy.PwQPolynomial` **}**.
+
+             - The :class:`numpy.dtype` specifies the type of the data being
+               operated on.
+
+             - The string specifies the operation type as
+               *add*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), *maxmin*, etc.
+
+             - The :class:`islpy.PwQPolynomial` holds the number of operations of
+               the kind specified in the key (in terms of the
+               :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        poly = get_op_poly(knl)
+        params = {'n': 512, 'm': 256, 'l': 128}
+        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
+        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+
+        # (now use these counts to predict performance)
+
+    """
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
     op_poly = ToCountMap()
-    op_counter = ExpressionOpCounter2(knl)
+    op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
         # check domain size:
@@ -771,6 +779,7 @@ def get_op_poly2(knl):
 
 
 def get_gmem_access_poly(knl):  # for now just counting subscripts
+
     """Count the number of global memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -917,271 +926,50 @@ def get_barrier_poly(knl):
     return barrier_poly
 
 
-def get_regs_per_thread(knl):
-    return get_regs_per_thread3_2(knl)
-
-
-def get_regs_per_thread3_2(knl):
-
-    """Estimate registers per thread usage by a loopy kernel.
-
-    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
-
-    """
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction
-    from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    max_regs = 0
-    block_reg_totals = [0]
-    reg_counters = [RegisterUsageEstimator(knl)]
-    # multiple counters to track nested sets of previously used iname+index combinations
-
-    for sched_item in knl.schedule:
-        if isinstance(sched_item, EnterLoop):
-            if sched_item.iname:  # (if not empty)
-                block_reg_totals.append(0)
-                # start a new estimator
-                reg_counters.append(RegisterUsageEstimator(knl))
-            else:
-                print("Error, how does this happen?") #TODO
-                1/0
-
-        elif isinstance(sched_item, LeaveLoop):
-            if sched_item.iname:  # (if not empty)
-                if block_reg_totals[-1] > max_regs:
-                    max_regs = block_reg_totals[-1]
-                # pop to resume previous total
-                block_reg_totals.pop()
-                reg_counters.pop()
-            else:
-                print("Error, how does this happen?") #TODO
-                1/0
-        elif isinstance(sched_item, RunInstruction):
-            insn = knl.id_to_insn[sched_item.insn_id]
-            block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
-                                    reg_counters[-1](insn.expression)
-
-    # finished looping, check outer block
-    if block_reg_totals[-1] > max_regs:
-        max_regs = block_reg_totals[-1]
-
-    return max_regs
-
-'''
-# map_var and map_tagged_var returned 1, no checking for any duplication
-def get_regs_per_thread1(knl):
-
-    """Estimate registers per thread usage by a loopy kernel.
-
-    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
-
-    """
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction
-    from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-
-    max_regs = 0
-    current_loop_indices = 0
-    reg_counter = RegisterUsageEstimator(knl)
-
-    #TODO test blocks vs lines
-    for sched_item in knl.schedule:
-        if isinstance(sched_item, EnterLoop):
-            # need to add indices to index count
-            # if counting by blocks, check current blk total vs max, save if bigger
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices += 1  # TODO assumes all loops add 1 new index
-                #print("enter loop: ", sched_item)
-        elif isinstance(sched_item, LeaveLoop):
-            # need to subtract indices from index count
-            # if counting by blocks, check current blk total vs max, save if bigger
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices -= 1  # TODO assumes all loops add 1 new index
-                #print("leave loop: ", sched_item)
-        elif isinstance(sched_item, RunInstruction):
-            # count regs for this instruction
-            # if counting by blocks, add to current block total
-            # if counting by lines, check current line total vs max, save if bigger
-            insn = knl.id_to_insn[sched_item.insn_id]
-            regs = current_loop_indices + \
-                   reg_counter(insn.assignee) + \
-                   reg_counter(insn.expression)
-            if regs > max_regs:
-                max_regs = regs
-            #print("RunInstruction, regs, max_regs ", sched_item, regs, max_regs)
-            # TODO check for iname reuse
-            # TODO don't count variables if they are loop indices?
-
-    return max_regs
-
-# no duplicate vars, subs
-def get_regs_per_thread2(knl):
+def estimate_regs_per_thread(knl):
 
     """Estimate registers per thread usage by a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
 
-    """
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction
-    from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    #print(knl)
-    max_regs = 0
-    current_loop_indices = 0
-    reg_counter = RegisterUsageEstimator(knl)
-
-    #TODO test blocks vs lines
-    for sched_item in knl.schedule:
-        reg_counter.forget_prev_vars()
-        reg_counter.forget_prev_subs()
-        if isinstance(sched_item, EnterLoop):
-            # need to add indices to index count
-            # if counting by blocks, check current blk total vs max, save if bigger
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices += 1  # TODO assumes all loops add 1 new index
-                #print("enter loop: ", sched_item)
-        elif isinstance(sched_item, LeaveLoop):
-            # need to subtract indices from index count
-            # if counting by blocks, check current blk total vs max, save if bigger
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices -= 1  # TODO assumes all loops add 1 new index
-                #print("leave loop: ", sched_item)
-        elif isinstance(sched_item, RunInstruction):
-            # count regs for this instruction
-            # if counting by blocks, add to current block total
-            # if counting by lines, check current line total vs max, save if bigger
-            insn = knl.id_to_insn[sched_item.insn_id]
-            regs = current_loop_indices + \
-                   reg_counter(insn.assignee) + \
-                   reg_counter(insn.expression)
-            if regs > max_regs:
-                max_regs = regs
-            #print("RunInstruction, regs, max_regs ", sched_item, regs, max_regs)
-            # TODO check for iname reuse
-            # TODO don't count variables if they are loop indices?
-
-    return max_regs
-
-def get_regs_per_thread3(knl):
-
-    """Estimate registers per thread usage by a loopy kernel.
-
-    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
+    :return: An :class:`integer` holding an estimate for the number of registers
+             used per thread. This number will most likely be too low, but will
+             hopefully be consistently too low by the same constant factor.
 
     """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction
-    from operator import mul
+    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
     knl = lp.get_one_scheduled_kernel(knl)
-    #print(knl)
     max_regs = 0
-    current_loop_indices = 0
     block_reg_totals = [0]
+    # counters to track nested sets of previously used iname+index combinations
     reg_counters = [RegisterUsageEstimator(knl)]
-    # multiple counters to track nested sets of previously used iname+index combinations
 
     for sched_item in knl.schedule:
         if isinstance(sched_item, EnterLoop):
-            if sched_item.iname:  # (if not empty)
-                #print("entering loop, totals: \n", block_reg_totals, max_regs) 
-                current_loop_indices += 1  # TODO assumes all loops add 1 new index
-                # start a new block total
-                block_reg_totals.append(current_loop_indices)
-                # start a new estimator
-                reg_counters.append(RegisterUsageEstimator(knl))
-                #print("entered loop, totals: \n", block_reg_totals, max_regs) 
-            else:
-                print("Error, how does this happen?")
-                1/0
+            block_reg_totals.append(0)
+            # start a new estimator
+            reg_counters.append(RegisterUsageEstimator(knl))
 
         elif isinstance(sched_item, LeaveLoop):
-            if sched_item.iname:  # (if not empty)
-                #print("leaving loop, totals: \n", block_reg_totals, max_regs) 
-                current_loop_indices -= 1  # TODO assumes all loops add 1 new index
-                if block_reg_totals[-1] > max_regs:
-                    max_regs = block_reg_totals[-1]
-                # pop to resume previous total
-                #block_reg_totals[-2] += block_reg_totals[-1]
-                block_reg_totals.pop()
-                reg_counters.pop()
-                #print("left loop, totals: \n", block_reg_totals, max_regs) 
-            else:
-                print("Error, how does this happen?")
-                1/0
+            if block_reg_totals[-1] > max_regs:
+                max_regs = block_reg_totals[-1]
+            # pop to resume previous total
+            block_reg_totals.pop()
+            reg_counters.pop()
+
         elif isinstance(sched_item, RunInstruction):
             insn = knl.id_to_insn[sched_item.insn_id]
-            #print("instruction found: ", insn) 
-            #print("pre insn totals: \n", block_reg_totals, max_regs) 
             block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
                                     reg_counters[-1](insn.expression)
-            #print("post insn totals: \n", block_reg_totals, max_regs) 
-            # TODO don't count variables if they are loop indices? (also try this with ctr2)
 
-    #print("finished schedule, totals: \n", block_reg_totals, max_regs)
     # finished looping, check outer block
     if block_reg_totals[-1] > max_regs:
         max_regs = block_reg_totals[-1]
-    #print("final, totals: \n", block_reg_totals, max_regs)
 
     return max_regs
-'''
-
-'''
-#add all sub blocks to containing block
-#aka add everything together
-def get_regs_per_thread4(knl):
-
-    """Estimate registers per thread usage by a loopy kernel.
-
-    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
-
-    """
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction
-    from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    #print(knl)
-
-    regs = 0
-    max_loop_indices = 0
-    current_loop_indices = 0
-    reg_counter = RegisterUsageEstimator(knl)
-
-    for sched_item in knl.schedule:
-        if isinstance(sched_item, EnterLoop):
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices += 1  # TODO assumes all loops add 1 new index
-                if current_loop_indices > max_loop_indices:
-                    max_loop_indices = current_loop_indices
-                #print("enter loop: ", sched_item)
-        elif isinstance(sched_item, LeaveLoop):
-            # need to subtract indices from index count
-            if sched_item.iname:  # (if not empty)
-                current_loop_indices -= 1  # TODO assumes all loops add 1 new index
-                #print("leave loop: ", sched_item)
-        elif isinstance(sched_item, RunInstruction):
-            # count regs for this instruction
-            insn = knl.id_to_insn[sched_item.insn_id]
-            regs += reg_counter(insn.assignee) + \
-                   reg_counter(insn.expression)
 
-    return regs+max_loop_indices
-'''
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3ae1139e6..5cd6a7781 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,7 +28,7 @@ from pyopencl.tools import (  # noqa
         as pytest_generate_tests)
 import loopy as lp
 from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
-from loopy.statistics import get_op_poly2, get_regs_per_thread
+from loopy.statistics import estimate_regs_per_thread
 import numpy as np
 
 
@@ -51,164 +51,6 @@ def test_op_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[np.dtype(np.float32)].eval_with_dict(params)
-    f64 = poly[np.dtype(np.float64)].eval_with_dict(params)
-    i32 = poly[np.dtype(np.int32)].eval_with_dict(params)
-    assert f32 == 3*n*m*l
-    assert f64 == n*m
-    assert i32 == n*m*2
-
-
-def test_op_counter_reduction():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                "c[i, j] = sum(k, a[i, k]*b[k, j])"
-            ],
-            name="matmul_serial", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_op_poly(knl)
-    n = 512
-    m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[np.dtype(np.float32)].eval_with_dict(params)
-    assert f32 == 2*n*m*l
-
-
-def test_op_counter_logic():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
-                """
-            ],
-            name="logic", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_op_poly(knl)
-    n = 512
-    m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[np.dtype(np.float32)].eval_with_dict(params)
-    f64 = poly[np.dtype(np.float64)].eval_with_dict(params)
-    i32 = poly[np.dtype(np.int32)].eval_with_dict(params)
-    assert f32 == n*m
-    assert f64 == 3*n*m
-    assert i32 == n*m
-
-
-def test_op_counter_specialops():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
-                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
-                """
-            ],
-            name="specialops", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly(knl)
-    n = 512
-    m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[np.dtype(np.float32)].eval_with_dict(params)
-    f64 = poly[np.dtype(np.float64)].eval_with_dict(params)
-    i32 = poly[np.dtype(np.int32)].eval_with_dict(params)
-    assert f32 == 4*n*m*l
-    assert f64 == 3*n*m
-    assert i32 == n*m
-
-
-def test_op_counter_bitwise():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
-                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
-                """
-            ],
-            name="bitwise", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(
-            knl, dict(
-                a=np.int32, b=np.int32,
-                g=np.int64, h=np.int64))
-
-    poly = get_op_poly(knl)
-    n = 512
-    m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[np.dtype(np.int32)].eval_with_dict(params)
-    i64 = poly[np.dtype(np.int64)].eval_with_dict(params)  # noqa
-    assert np.dtype(np.float64) not in poly
-    assert i32 == n*m+3*n*m*l
-    assert i64 == 6*n*m
-
-
-def test_op_counter_triangular_domain():
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
-            """
-            a[i, j] = b[i,j] * 2
-            """,
-            name="bitwise", assumptions="n,m >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl,
-            dict(b=np.float64))
-
-    expect_fallback = False
-    import islpy as isl
-    try:
-        isl.BasicSet.card
-    except AttributeError:
-        expect_fallback = True
-    else:
-        expect_fallback = False
-
-    poly = get_op_poly(knl)[np.dtype(np.float64)]
-    value_dict = dict(m=13, n=200)
-    flops = poly.eval_with_dict(value_dict)
-
-    if expect_fallback:
-        assert flops == 144
-    else:
-        assert flops == 78
-
-
-def test_op_counter2_basic():
-
-    knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
-                e[i, k+1] = -g[i,k]*h[i,k+1]
-                """
-            ],
-            name="basic", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly2(knl)
-    n = 512
-    m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
     f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
     f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
@@ -219,7 +61,7 @@ def test_op_counter2_basic():
     assert i32add == n*m*2
 
 
-def test_op_counter2_reduction():
+def test_op_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -229,7 +71,7 @@ def test_op_counter2_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = get_op_poly2(knl)
+    poly = get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -239,7 +81,7 @@ def test_op_counter2_reduction():
     assert f32add == f32mul == n*m*l
 
 
-def test_op_counter2_logic():
+def test_op_counter_logic():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -251,7 +93,7 @@ def test_op_counter2_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = get_op_poly2(knl)
+    poly = get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -260,17 +102,13 @@ def test_op_counter2_logic():
     f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
     f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
     i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    #f32 = poly[np.dtype(np.float32)].eval_with_dict(params)
-    #f64 = poly[np.dtype(np.float64)].eval_with_dict(params)
-    #i32 = poly[np.dtype(np.int32)].eval_with_dict(params)
     assert f32mul == n*m
-    #assert f64 == 3*n*m
-    assert f64div == 2*n*m  #TODO why?
+    assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
     assert i32add == n*m
 
 
-def test_op_counter2_specialops():
+def test_op_counter_specialops():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -284,7 +122,7 @@ def test_op_counter2_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = get_op_poly2(knl)
+    poly = get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -301,7 +139,7 @@ def test_op_counter2_specialops():
     assert f64pow == i32add == n*m
 
 
-def test_op_counter2_bitwise():
+def test_op_counter_bitwise():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -318,7 +156,7 @@ def test_op_counter2_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = get_op_poly2(knl)
+    poly = get_op_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -336,6 +174,37 @@ def test_op_counter2_bitwise():
     assert i64shift == 2*n*m
 
 
+def test_op_counter_triangular_domain():
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
+            """
+            a[i, j] = b[i,j] * 2
+            """,
+            name="bitwise", assumptions="n,m >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+            dict(b=np.float64))
+
+    expect_fallback = False
+    import islpy as isl
+    try:
+        isl.BasicSet.card
+    except AttributeError:
+        expect_fallback = True
+    else:
+        expect_fallback = False
+
+    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    value_dict = dict(m=13, n=200)
+    flops = poly.eval_with_dict(value_dict)
+
+    if expect_fallback:
+        assert flops == 144
+    else:
+        assert flops == 78
+
+
 def test_gmem_access_counter_basic():
 
     knl = lp.make_kernel(
@@ -686,7 +555,7 @@ def test_reg_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    regs = get_regs_per_thread(knl)
+    regs = estimate_regs_per_thread(knl)
     assert regs == 6
 
 
@@ -700,7 +569,7 @@ def test_reg_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    regs = get_regs_per_thread(knl)
+    regs = estimate_regs_per_thread(knl)
     assert regs == 6
 
 
@@ -716,7 +585,7 @@ def test_reg_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    regs = get_regs_per_thread(knl)
+    regs = estimate_regs_per_thread(knl)
     assert regs == 6
 
 
@@ -734,7 +603,7 @@ def test_reg_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    regs = get_regs_per_thread(knl)
+    regs = estimate_regs_per_thread(knl)
     assert regs == 6
 
 
@@ -754,7 +623,7 @@ def test_reg_counter_bitwise():
             knl, dict(
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
-    regs = get_regs_per_thread(knl)
+    regs = estimate_regs_per_thread(knl)
     assert regs == 6
 
 
@@ -779,14 +648,20 @@ def test_all_counters_parallel_matmul():
     assert barrier_count == 0
 
     op_map = get_op_poly(knl)
-    f32ops = op_map[
-                        np.dtype(np.float32)
+    f32mul = op_map[
+                        (np.dtype(np.float32), 'mul')
+                        ].eval_with_dict(params)
+    f32add = op_map[
+                        (np.dtype(np.float32), 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        np.dtype(np.int32)
+                        (np.dtype(np.int32), 'add')
+                        ].eval_with_dict(params)
+    i32ops += op_map[
+                        (np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
-    assert f32ops == n*m*l*2
+    assert f32mul+f32add == n*m*l*2
     assert i32ops == n*m*l*4 + l*n*4
 
     subscript_map = get_gmem_access_poly(knl)
@@ -805,10 +680,9 @@ def test_all_counters_parallel_matmul():
                         ].eval_with_dict(params)
 
     assert f32coal == n*l
-    '''
-    regs = get_regs_per_thread(knl)
-    assert regs == 8
-    '''
+
+    regs = estimate_regs_per_thread(knl)
+    assert regs == 4
 
 
 if __name__ == "__main__":
-- 
GitLab