From 6622a32d5b402973703f1d74f7e56158ec3b7e66 Mon Sep 17 00:00:00 2001 From: James Stevens <jdsteve2@illinois.edu> Date: Mon, 26 Oct 2015 15:30:25 -0500 Subject: [PATCH] moved reg counter to perf model --- doc/reference.rst | 2 - loopy/__init__.py | 7 +- loopy/statistics.py | 198 ---------------------------------------- test/test_statistics.py | 90 ------------------ 4 files changed, 3 insertions(+), 294 deletions(-) diff --git a/doc/reference.rst b/doc/reference.rst index 9dad1182c..59ab3c986 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -548,6 +548,4 @@ Obtaining Kernel Statistics .. autofunction:: get_barrier_poly -.. autofunction:: estimate_regs_per_thread - .. vim: tw=75:spell diff --git a/loopy/__init__.py b/loopy/__init__.py index 2df476e1f..7073108b2 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,8 +64,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (get_op_poly, get_gmem_access_poly, - get_DRAM_access_poly, get_barrier_poly, estimate_regs_per_thread, - stringify_stats_mapping, sum_mem_access_to_bytes) + get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping, + sum_mem_access_to_bytes) from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options @@ -106,8 +106,7 @@ __all__ = [ "generate_code", "generate_body", "get_op_poly", "get_gmem_access_poly", "get_DRAM_access_poly", - "get_barrier_poly", "estimate_regs_per_thread", "stringify_stats_mapping", - "sum_mem_access_to_bytes", + "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index ae441255b..834f48207 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -380,156 +380,6 @@ class GlobalSubscriptCounter(CombineMapper): "map_slice not implemented.") -class RegisterUsageEstimator(CombineMapper): - - def __init__(self, knl): - self.knl = knl - from loopy.expression import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) - self.vars_found = [] - self.subs_found = [] - - def combine(self, values): - return sum(values) - - def forget_prev_vars(self): - del self.vars_found[:] - - def forget_prev_subs(self): - del self.subs_found[:] - - def map_constant(self, expr): - return 0 - - def map_variable(self, expr): - name = expr.name - if expr in self.vars_found: - return 0 - - self.vars_found.append(expr) - if name in self.knl.temporary_variables: - if self.knl.temporary_variables[name].is_local: - return 0 - else: - return 1 - elif name in self.knl.all_inames(): - from loopy.kernel.data import AxisTag, VectorizeTag, UnrollTag - tag = self.knl.iname_to_tag.get(name) - if (tag is None or not(isinstance(tag, AxisTag) - or isinstance(tag, VectorizeTag) - or isinstance(tag, UnrollTag))): - return 1 - else: - return 0 - else: - return 1 - - map_tagged_variable = map_variable - - def map_call(self, expr): - return self.rec(expr.parameters) - - def map_subscript(self, expr): - name = expr.aggregate.name # name of array - - if name in self.knl.arg_dict: - # not a temporary variable - array = self.knl.arg_dict[name] - elif self.knl.temporary_variables[name].is_local: - # temp var is in shared mem - return 0 + self.rec(expr.index) - elif (expr.index, expr.aggregate) in self.subs_found: - # temp var is NOT shared, but already counted - return 0 + self.rec(expr.index) - else: - # temp var is NOT shared and NOT already counted - self.subs_found.append((expr.index, expr.aggregate)) - return 1 + self.rec(expr.index) - - # expr is not a temporary variable - - if not isinstance(array, lp.GlobalArg): - # This array is not in global memory, and is not a temporary variable - # TODO how should we count arrays in const/texture mem? ImageArg? - # Ignore for now - return self.rec(expr.index) - - # this is a global mem access - if (expr.index, expr.aggregate) in self.subs_found: - return 0 + self.rec(expr.index) - else: - self.subs_found.append((expr.index, expr.aggregate)) - return 1 + self.rec(expr.index) - - def map_sum(self, expr): - assert expr.children - return sum(self.rec(child) for child in expr.children) - - map_product = map_sum - - def map_quotient(self, expr, *args): - return self.rec(expr.numerator) + self.rec(expr.denominator) - - map_floor_div = map_quotient - map_remainder = map_quotient - - def map_power(self, expr): - return self.rec(expr.base) + self.rec(expr.exponent) - - def map_left_shift(self, expr): - return self.rec(expr.shiftee)+self.rec(expr.shift) - - map_right_shift = map_left_shift - - def map_bitwise_not(self, expr): - return self.rec(expr.child) - - def map_bitwise_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_bitwise_xor = map_bitwise_or - map_bitwise_and = map_bitwise_or - - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - map_logical_not = map_bitwise_not - map_logical_or = map_bitwise_or - map_logical_and = map_logical_or - - def map_if(self, expr): - warnings.warn("RegisterUsageEstimator counting register usage as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) - - def map_if_positive(self, expr): - warnings.warn("RegisterUsageEstimator counting register usage as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) - - map_min = map_bitwise_or - map_max = map_min - - def map_common_subexpression(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") - - def map_substitution(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") - - def map_derivative(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "derivative, " - "map_derivative not implemented.") - - def map_slice(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered slice, " - "map_slice not implemented.") - - def count(kernel, bset): try: return bset.card() @@ -756,51 +606,3 @@ def get_barrier_poly(knl): return barrier_poly - -def estimate_regs_per_thread(knl): - - """Estimate registers per thread usage by a loopy kernel. - - :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated. - - :return: An :class:`integer` holding an estimate for the number of registers - used per thread. This number will most likely be too low, but will - hopefully be consistantly too low by the same constant factor. - - """ - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction # noqa - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - max_regs = 0 - block_reg_totals = [0] - # counters to track nested sets of previously used iname+index combinations - reg_counters = [RegisterUsageEstimator(knl)] - - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - block_reg_totals.append(0) - # start a new estimator - reg_counters.append(RegisterUsageEstimator(knl)) - - elif isinstance(sched_item, LeaveLoop): - if block_reg_totals[-1] > max_regs: - max_regs = block_reg_totals[-1] - # pop to resume previous total - block_reg_totals.pop() - reg_counters.pop() - - elif isinstance(sched_item, RunInstruction): - insn = knl.id_to_insn[sched_item.insn_id] - block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \ - reg_counters[-1](insn.expression) - - # finished looping, check outer block - if block_reg_totals[-1] > max_regs: - max_regs = block_reg_totals[-1] - - return max_regs - - diff --git a/test/test_statistics.py b/test/test_statistics.py index 5cd6a7781..0dffe5c35 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,7 +28,6 @@ from pyopencl.tools import ( # noqa as pytest_generate_tests) import loopy as lp from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly -from loopy.statistics import estimate_regs_per_thread import numpy as np @@ -541,92 +540,6 @@ def test_barrier_counter_barriers(): assert barrier_count == 50*10*2 -def test_reg_counter_basic(): - - knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", - [ - """ - c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] - e[i, k+1] = g[i,k]*h[i,k+1] - """ - ], - name="basic", assumptions="n,m,l >= 1") - - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - regs = estimate_regs_per_thread(knl) - assert regs == 6 - - -def test_reg_counter_reduction(): - - knl = lp.make_kernel( - "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", - [ - "c[i, j] = sum(k, a[i, k]*b[k, j])" - ], - name="matmul_serial", assumptions="n,m,l >= 1") - - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - regs = estimate_regs_per_thread(knl) - assert regs == 6 - - -def test_reg_counter_logic(): - - knl = lp.make_kernel( - "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", - [ - """ - e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) - """ - ], - name="logic", assumptions="n,m,l >= 1") - - knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - regs = estimate_regs_per_thread(knl) - assert regs == 6 - - -def test_reg_counter_specialops(): - - knl = lp.make_kernel( - "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", - [ - """ - c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0) - e[i, k] = (1+g[i,k])**(1+h[i,k+1]) - """ - ], - name="specialops", assumptions="n,m,l >= 1") - - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - regs = estimate_regs_per_thread(knl) - assert regs == 6 - - -def test_reg_counter_bitwise(): - - knl = lp.make_kernel( - "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", - [ - """ - c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1) - e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k)) - """ - ], - name="bitwise", assumptions="n,m,l >= 1") - - knl = lp.add_and_infer_dtypes( - knl, dict( - a=np.int32, b=np.int32, - g=np.int64, h=np.int64)) - regs = estimate_regs_per_thread(knl) - assert regs == 6 - - def test_all_counters_parallel_matmul(): knl = lp.make_kernel( @@ -681,9 +594,6 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - regs = estimate_regs_per_thread(knl) - assert regs == 4 - if __name__ == "__main__": if len(sys.argv) > 1: -- GitLab