From 6622a32d5b402973703f1d74f7e56158ec3b7e66 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Mon, 26 Oct 2015 15:30:25 -0500
Subject: [PATCH] Remove register usage estimator from loopy.statistics (moved to perf model)

---
 doc/reference.rst       |   2 -
 loopy/__init__.py       |   7 +-
 loopy/statistics.py     | 198 ----------------------------------------
 test/test_statistics.py |  90 ------------------
 4 files changed, 3 insertions(+), 294 deletions(-)

diff --git a/doc/reference.rst b/doc/reference.rst
index 9dad1182c..59ab3c986 100644
--- a/doc/reference.rst
+++ b/doc/reference.rst
@@ -548,6 +548,4 @@ Obtaining Kernel Statistics
 
 .. autofunction:: get_barrier_poly
 
-.. autofunction:: estimate_regs_per_thread
-
 .. vim: tw=75:spell
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 2df476e1f..7073108b2 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -64,8 +64,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, get_gmem_access_poly,
-        get_DRAM_access_poly, get_barrier_poly, estimate_regs_per_thread,
-        stringify_stats_mapping, sum_mem_access_to_bytes)
+        get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
+        sum_mem_access_to_bytes)
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -106,8 +106,7 @@ __all__ = [
         "generate_code", "generate_body",
 
         "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly",
-        "get_barrier_poly", "estimate_regs_per_thread", "stringify_stats_mapping",
-        "sum_mem_access_to_bytes",
+        "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index ae441255b..834f48207 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -380,156 +380,6 @@ class GlobalSubscriptCounter(CombineMapper):
                                   "map_slice not implemented.")
 
 
-class RegisterUsageEstimator(CombineMapper):
-
-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.expression import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-        self.vars_found = []
-        self.subs_found = []
-
-    def combine(self, values):
-        return sum(values)
-
-    def forget_prev_vars(self):
-        del self.vars_found[:]
-
-    def forget_prev_subs(self):
-        del self.subs_found[:]
-
-    def map_constant(self, expr):
-        return 0
-
-    def map_variable(self, expr):
-        name = expr.name
-        if expr in self.vars_found:
-            return 0
-
-        self.vars_found.append(expr)
-        if name in self.knl.temporary_variables:
-            if self.knl.temporary_variables[name].is_local:
-                return 0
-            else:
-                return 1
-        elif name in self.knl.all_inames():
-            from loopy.kernel.data import AxisTag, VectorizeTag, UnrollTag
-            tag = self.knl.iname_to_tag.get(name)
-            if (tag is None or not(isinstance(tag, AxisTag)
-                                   or isinstance(tag, VectorizeTag)
-                                   or isinstance(tag, UnrollTag))):
-                return 1
-            else:
-                return 0
-        else:
-            return 1
-
-    map_tagged_variable = map_variable
-
-    def map_call(self, expr):
-        return self.rec(expr.parameters)
-
-    def map_subscript(self, expr):
-        name = expr.aggregate.name  # name of array
-
-        if name in self.knl.arg_dict:
-            # not a temporary variable
-            array = self.knl.arg_dict[name]
-        elif self.knl.temporary_variables[name].is_local:
-            # temp var is in shared mem
-            return 0 + self.rec(expr.index)
-        elif (expr.index, expr.aggregate) in self.subs_found:
-            # temp var is NOT shared, but already counted
-            return 0 + self.rec(expr.index)
-        else:
-            # temp var is NOT shared and NOT already counted
-            self.subs_found.append((expr.index, expr.aggregate))
-            return 1 + self.rec(expr.index)
-
-        # expr is not a temporary variable
-
-        if not isinstance(array, lp.GlobalArg):
-            # This array is not in global memory, and is not a temporary variable
-            # TODO how should we count arrays in const/texture mem? ImageArg?
-            # Ignore for now
-            return self.rec(expr.index)
-
-        # this is a global mem access
-        if (expr.index, expr.aggregate) in self.subs_found:
-            return 0 + self.rec(expr.index)
-        else:
-            self.subs_found.append((expr.index, expr.aggregate))
-            return 1 + self.rec(expr.index)
-
-    def map_sum(self, expr):
-        assert expr.children
-        return sum(self.rec(child) for child in expr.children)
-
-    map_product = map_sum
-
-    def map_quotient(self, expr, *args):
-        return self.rec(expr.numerator) + self.rec(expr.denominator)
-
-    map_floor_div = map_quotient
-    map_remainder = map_quotient
-
-    def map_power(self, expr):
-        return self.rec(expr.base) + self.rec(expr.exponent)
-
-    def map_left_shift(self, expr):
-        return self.rec(expr.shiftee)+self.rec(expr.shift)
-
-    map_right_shift = map_left_shift
-
-    def map_bitwise_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_bitwise_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_bitwise_xor = map_bitwise_or
-    map_bitwise_and = map_bitwise_or
-
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    map_logical_not = map_bitwise_not
-    map_logical_or = map_bitwise_or
-    map_logical_and = map_logical_or
-
-    def map_if(self, expr):
-        warnings.warn("RegisterUsageEstimator counting register usage as "
-                      "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warnings.warn("RegisterUsageEstimator counting register usage as "
-                      "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
-
-    map_min = map_bitwise_or
-    map_max = map_min
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
-    def map_derivative(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
-
-    def map_slice(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
-
-
 def count(kernel, bset):
     try:
         return bset.card()
@@ -756,51 +606,3 @@ def get_barrier_poly(knl):
 
     return barrier_poly
 
-
-def estimate_regs_per_thread(knl):
-
-    """Estimate registers per thread usage by a loopy kernel.
-
-    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.
-
-    :return: An :class:`integer` holding an estimate for the number of registers
-             used per thread. This number will most likely be too low, but will
-             hopefully be consistantly too low by the same constant factor.
-
-    """
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    max_regs = 0
-    block_reg_totals = [0]
-    # counters to track nested sets of previously used iname+index combinations
-    reg_counters = [RegisterUsageEstimator(knl)]
-
-    for sched_item in knl.schedule:
-        if isinstance(sched_item, EnterLoop):
-            block_reg_totals.append(0)
-            # start a new estimator
-            reg_counters.append(RegisterUsageEstimator(knl))
-
-        elif isinstance(sched_item, LeaveLoop):
-            if block_reg_totals[-1] > max_regs:
-                max_regs = block_reg_totals[-1]
-            # pop to resume previous total
-            block_reg_totals.pop()
-            reg_counters.pop()
-
-        elif isinstance(sched_item, RunInstruction):
-            insn = knl.id_to_insn[sched_item.insn_id]
-            block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
-                                    reg_counters[-1](insn.expression)
-
-    # finished looping, check outer block
-    if block_reg_totals[-1] > max_regs:
-        max_regs = block_reg_totals[-1]
-
-    return max_regs
-
-
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5cd6a7781..0dffe5c35 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,7 +28,6 @@ from pyopencl.tools import (  # noqa
         as pytest_generate_tests)
 import loopy as lp
 from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
-from loopy.statistics import estimate_regs_per_thread
 import numpy as np
 
 
@@ -541,92 +540,6 @@ def test_barrier_counter_barriers():
     assert barrier_count == 50*10*2
 
 
-def test_reg_counter_basic():
-
-    knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
-                e[i, k+1] = g[i,k]*h[i,k+1]
-                """
-            ],
-            name="basic", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 6
-
-
-def test_reg_counter_reduction():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                "c[i, j] = sum(k, a[i, k]*b[k, j])"
-            ],
-            name="matmul_serial", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 6
-
-
-def test_reg_counter_logic():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
-                """
-            ],
-            name="logic", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 6
-
-
-def test_reg_counter_specialops():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
-                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
-                """
-            ],
-            name="specialops", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 6
-
-
-def test_reg_counter_bitwise():
-
-    knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-            [
-                """
-                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
-                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
-                """
-            ],
-            name="bitwise", assumptions="n,m,l >= 1")
-
-    knl = lp.add_and_infer_dtypes(
-            knl, dict(
-                a=np.int32, b=np.int32,
-                g=np.int64, h=np.int64))
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 6
-
-
 def test_all_counters_parallel_matmul():
 
     knl = lp.make_kernel(
@@ -681,9 +594,6 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    regs = estimate_regs_per_thread(knl)
-    assert regs == 4
-
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-- 
GitLab