diff --git a/loopy/statistics.py b/loopy/statistics.py index f363523b860347320dd04e2a8b71325e5558c1c0..fde8643bf92b7ad56bb47975fa7ede1bda9b399c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -25,8 +25,6 @@ THE SOFTWARE. import six import loopy as lp -import numpy as np -import warnings from islpy import dim_type import islpy as isl from pytools import memoize_in @@ -319,7 +317,6 @@ class ToCountMap(object): return result - def sum(self): """Add all counts in ToCountMap. @@ -335,7 +332,6 @@ class ToCountMap(object): total += v return total - def eval_and_sum(self, params): """Add all counts in :class:`ToCountMap` and evaluate with provided parameter dict. @@ -443,7 +439,8 @@ class MemAccess(object): """ - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, + variable=None): self.mtype = mtype self.stride = stride self.direction = direction @@ -501,8 +498,8 @@ class MemAccess(object): variable = 'None' else: variable = self.variable - return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \ - +variable+")" + return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ + + direction + ", " + variable + ")" # {{{ ExpressionOpCounter @@ -574,14 +571,14 @@ class ExpressionOpCounter(CombineMapper): def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): - len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + len(expr.children)-1}) \ + + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or map_bitwise_and = map_bitwise_or def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_ops", + warn_with_kernel(self.knl, "summing_if_branches_ops", "ExpressionOpCounter counting ops as sum of " "if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ @@ -596,8 +593,8 @@ class ExpressionOpCounter(CombineMapper): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): - len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + len(expr.children)-1}) \ + + sum(self.rec(child) for child in expr.children) map_max = map_min @@ -652,7 +649,7 @@ class LocalSubscriptCounter(CombineMapper): if array.is_local: sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 return sub_map + self.rec(expr.index) - + def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) @@ -665,14 +662,14 @@ class LocalSubscriptCounter(CombineMapper): return self.rec(expr.left)+self.rec(expr.right) def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_lsubs", + warn_with_kernel(self.knl, "summing_if_branches_lsubs", "LocalSubscriptCounter counting LMEM accesses as sum " "of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ + self.rec(expr.else_) def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", + warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", "LocalSubscriptCounter counting LMEM accesses as sum " "of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) \ @@ -739,7 +736,7 @@ class GlobalSubscriptCounter(CombineMapper): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag, GroupIndexTag + from loopy.kernel.data import LocalIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() # find min tag axis @@ -758,7 +755,7 @@ class GlobalSubscriptCounter(CombineMapper): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name): 1} - ) + self.rec(expr.index) + ) + self.rec(expr.index) if min_tag_axis != 0: warn_with_kernel(self.knl, "unknown_gmem_stride", @@ -768,7 +765,7 @@ class GlobalSubscriptCounter(CombineMapper): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=sys.maxsize, variable=name): 1} - ) + self.rec(expr.index) + ) + self.rec(expr.index) # get local_id associated with minimum tag axis min_lid = None @@ -807,7 +804,7 @@ class GlobalSubscriptCounter(CombineMapper): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=total_stride, variable=name): 1} - ) + self.rec(expr.index) + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -818,14 +815,14 @@ class GlobalSubscriptCounter(CombineMapper): map_product = map_sum def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_gsubs", + warn_with_kernel(self.knl, "summing_if_branches_gsubs", "GlobalSubscriptCounter counting GMEM accesses as " "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ + self.rec(expr.else_) def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", + warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", "GlobalSubscriptCounter counting GMEM accesses as " "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) \ @@ -1203,8 +1200,7 @@ def get_mem_access_map(knl, numpy_types=True): if uniform: from loopy.kernel.data import LocalIndexTag insn_inames = [iname for iname in insn_inames if not - isinstance( - knl.iname_to_tag.get(iname), LocalIndexTag)] + isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) @@ -1227,7 +1223,7 @@ def get_mem_access_map(knl, numpy_types=True): subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, stride=key.stride, direction='load', variable=key.variable) - ] = subs_expr.pop(key) + ] = subs_expr.pop(key) subs_assignee_g = subs_counter_g(insn.assignee) for key in subs_assignee_g.count_map: @@ -1235,7 +1231,7 @@ def get_mem_access_map(knl, numpy_types=True): stride=key.stride, direction='store', variable=key.variable) - ] = subs_assignee_g.pop(key) + ] = subs_assignee_g.pop(key) # for now, don't count writes to local mem insn_inames = knl.insn_inames(insn) @@ -1243,7 +1239,9 @@ def get_mem_access_map(knl, numpy_types=True): # use count excluding local index tags for uniform accesses for key in subs_expr.count_map: map = ToCountMap({key: subs_expr[key]}) - if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: + if (key.mtype == 'global' and + isinstance(key.stride, int) and + key.stride == 0): subs_map = subs_map \ + map*get_insn_count(knl, insn_inames, True) else: @@ -1264,8 +1262,8 @@ def get_mem_access_map(knl, numpy_types=True): dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, - variable=mem_access.variable) - , count) + variable=mem_access.variable), + count) for mem_access, count in six.iteritems(subs_map.count_map)) return subs_map diff --git a/setup.cfg b/setup.cfg index 56341fa9898c17d540e2c83ae4c7270f93049937..b939ce0cf8b680bb1eb3501ed6d7f563e9c1c7b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,4 @@ ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, - loopy/target/c/compyte/array.py, - loopy/statistics.py, - test/test_statistics.py + loopy/target/c/compyte/array.py diff --git a/test/test_statistics.py b/test/test_statistics.py index fb502045c7b6b2c7e02d11ad3ebda3b5d13c8bda..5e363f13594ee8e4cf170faa232b0783cca9d018 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -33,6 +33,7 @@ import numpy as np from pymbolic.primitives import Variable + def test_op_counter_basic(): knl = lp.make_kernel( @@ -61,7 +62,7 @@ def test_op_counter_basic(): assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 - + def test_op_counter_reduction(): @@ -235,25 +236,25 @@ def test_mem_access_counter_basic(): params = {'n': n, 'm': m, 'l': l} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') - ].eval_with_dict(params) + ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') - ].eval_with_dict(params) + ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32l == 3*n*m*l assert f64l == 2*n*m f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32s == n*m*l assert f64s == n*m @@ -275,21 +276,21 @@ def test_mem_access_counter_reduction(): params = {'n': n, 'm': m, 'l': l} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') - ].eval_with_dict(params) + ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32l == 2*n*m*l f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32s == n*l ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] - ).to_bytes().eval_and_sum(params) + ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] - ).to_bytes().eval_and_sum(params) + ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*f32l assert st_bytes == 4*f32s @@ -316,13 +317,13 @@ def test_mem_access_counter_logic(): f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), direction='load') - ].eval_with_dict(params) + ].eval_with_dict(params) f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='load') - ].eval_with_dict(params) + ].eval_with_dict(params) f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32_g_l == 2*n*m assert f64_g_l == n*m assert f64_g_s == n*m @@ -349,33 +350,34 @@ def test_mem_access_counter_specialops(): params = {'n': n, 'm': m, 'l': l} f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') - ].eval_with_dict(params) + ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g') - ].eval_with_dict(params) + ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m - filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g']) + filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) assert tot == n*m*l + n*m + def test_mem_access_counter_bitwise(): knl = lp.make_kernel( @@ -398,26 +400,26 @@ def test_mem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') - ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b') - ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g') - ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') - ].eval_with_dict(params) + ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = mem_map[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') - ].eval_with_dict(params) - i32 += mem_map[lp.MemAccess('global', np.int32, + ].eval_with_dict(params) + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') - ].eval_with_dict(params) + ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -444,34 +446,34 @@ def test_mem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') - ].eval_with_dict(params) - f64uniform += mem_map[lp.MemAccess('global', np.float64, + ].eval_with_dict(params) + f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') - ].eval_with_dict(params) - f32uniform = mem_map[lp.MemAccess('global', np.float32, + ].eval_with_dict(params) + f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x') - ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a') - ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), + ].eval_with_dict(params) + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = mem_map[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') - ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -497,33 +499,33 @@ def test_mem_access_counter_nonconsec(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, + f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g') - ].eval_with_dict(params) - f64nonconsec += mem_map[lp.MemAccess('global', np.float64, + ].eval_with_dict(params) + f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h') - ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', variable='a') - ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), + ].eval_with_dict(params) + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = mem_map[lp.MemAccess('global', np.float64, + f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e') - ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, + ].eval_with_dict(params) + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('l'), direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -549,30 +551,27 @@ def test_mem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - #for k in mem_map: - # print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k]) - - f64consec = mem_map[lp.MemAccess('global', np.float64, + f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g') - ].eval_with_dict(params) - f64consec += mem_map[lp.MemAccess('global', np.float64, + ].eval_with_dict(params) + f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h') - ].eval_with_dict(params) - f32consec = mem_map[lp.MemAccess('global', np.float32, + ].eval_with_dict(params) + f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a') - ].eval_with_dict(params) - f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), + ].eval_with_dict(params) + f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = mem_map[lp.MemAccess('global', np.float64, + f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e') - ].eval_with_dict(params) - f32consec = mem_map[lp.MemAccess('global', np.float32, + ].eval_with_dict(params) + f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') - ].eval_with_dict(params) + ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -670,27 +669,28 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_mem_access_map(knl) - f32coal = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='b') - ].eval_with_dict(params) - f32coal += op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') - ].eval_with_dict(params) + f32coal = op_map[lp.MemAccess('global', np.float32, + stride=1, direction='load', variable='b') + ].eval_with_dict(params) + f32coal += op_map[lp.MemAccess('global', np.float32, + stride=1, direction='load', variable='a') + ].eval_with_dict(params) assert f32coal == n*m+m*l - f32coal = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c') - ].eval_with_dict(params) + f32coal = op_map[lp.MemAccess('global', np.float32, + stride=1, direction='store', variable='c') + ].eval_with_dict(params) assert f32coal == n*l local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load') - ].eval_with_dict(params) + direction='load') + ].eval_with_dict(params) assert local_mem_l == n*m*l*2 + def test_gather_access_footprint(): knl = lp.make_kernel( "{[i,k,j]: 0<=i,j,k 1: exec(sys.argv[1])