diff --git a/loopy/statistics.py b/loopy/statistics.py index c273edd54a364096ad74685d6c9a1549a3d4ef57..5faeb12e3bb8f6f600c01cb2d50f43895b7f2bb9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -105,6 +105,18 @@ def stringify_stats_mapping(m): return result +class DataAccess: + + def __init__(self, stride=0): + self.stride = stride + + def __eq__(self, other): + return isinstance(other, DataAccess) and other.stride == self.stride #TODO is this okay? + + def __hash__(self): + return hash(self.stride) + + # {{{ ExpressionOpCounter class ExpressionOpCounter(CombineMapper): @@ -277,67 +289,63 @@ class GlobalSubscriptCounter(CombineMapper): from loopy.symbolic import get_dependencies from loopy.kernel.data import LocalIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - local_id0 = None + + # find min tag axis + import sys + min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: - # find local id0 tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - if tag.axis == 0: - local_id0 = iname - break # there will be only one local_id0 + if tag.axis < min_tag_axis: + min_tag_axis = tag.axis if not local_id_found: # count as uniform access return ToCountMap( - {(self.type_inf(expr), 'uniform'): 1} + {(self.type_inf(expr), DataAccess(stride=0)): 1} ) + self.rec(expr.index) - if local_id0 is None: - # only non-zero local id(s) found, assume non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + # get local_id associated with minimum tag axis + min_local_id = None + for iname in my_inames: + tag = self.knl.iname_to_tag.get(iname) + if isinstance(tag, LocalIndexTag): + if tag.axis == min_tag_axis: + min_local_id = iname + break # there will be only one min local_id + + # found local_id associated with minimum tag axis - # check coefficient of local_id0 for each axis + total_stride = None + # check coefficient of min_local_id 
for each axis from loopy.symbolic import CoefficientCollector from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): coeffs = CoefficientCollector()(idx) - # check if he contains the lid 0 guy + # check whether this index expression contains min_local_id try: - coeff_id0 = coeffs[Variable(local_id0)] + coeff_min_lid = coeffs[Variable(min_local_id)] except KeyError: - # does not contain local_id0 + # does not contain min_local_id continue - if coeff_id0 != 1: - # non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # coefficient is 1, now determine if stride is 1 + # found coefficient of min_local_id + # now determine stride from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: continue - if stride != 1: - # non-consecutive - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # else, stride == 1, continue since another idx could contain id0 + total_stride = stride*coeff_min_lid + # TODO: handle two cases: the assignment above is skipped when every axis hits a `continue` (total_stride stays None), and it is overwritten, not accumulated, when more than one axis contains min_local_id 
- # loop finished without returning, stride==1 for every instance of local_id0 - return ToCountMap( - {(self.type_inf(expr), 'consecutive'): 1} - ) + self.rec(expr.index) + return ToCountMap({(self.type_inf(expr), + DataAccess(stride=total_stride)): 1} + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -717,13 +725,13 @@ def get_gmem_access_poly(knl): # for now just counting subscripts # use count excluding local index tags for uniform accesses for key in subs_expr.dict: poly = ToCountMap({key: subs_expr.dict[key]}) - if key[1] == "uniform": + if key[1].stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) for key in subs_assignee.dict: poly = ToCountMap({key: subs_assignee.dict[key]}) - if key[1] == "uniform": + if key[1].stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) diff --git a/test/test_statistics.py b/test/test_statistics.py index 2cf537f5ed9c039d09cb1d10066ec9294898d9b9..6e5b6270be571ae7c66e2219bcafdad0d4b63efd 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,9 +28,15 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp -from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly +from loopy.statistics import ( + get_op_poly, + get_gmem_access_poly, + get_barrier_poly, + DataAccess) + import numpy as np +from pymbolic.primitives import Variable def test_op_counter_basic(): @@ -227,19 +233,19 @@ def test_gmem_access_counter_basic(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') 
].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -261,12 +267,12 @@ def test_gmem_access_counter_reduction(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m*l f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == n*l @@ -289,16 +295,16 @@ def test_gmem_access_counter_logic(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f64 == n*m @@ -323,19 +329,19 @@ def test_gmem_access_counter_specialops(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + 
(np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -364,12 +370,12 @@ def test_gmem_access_counter_bitwise(): l = 128 params = {'n': n, 'm': m, 'l': l} i32 = poly[ - (np.dtype(np.int32), 'uniform', 'load') + (np.dtype(np.int32), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l i32 = poly[ - (np.dtype(np.int32), 'uniform', 'store') + (np.dtype(np.int32), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -398,23 +404,23 @@ def test_gmem_access_counter_mixed(): l = 128 params = {'n': n, 'm': m, 'l': l} f64uniform = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) f32uniform = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l f64uniform = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store') ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -442,19 +448,19 @@ def test_gmem_access_counter_nonconsec(): l = 128 params = {'n': n, 'm': m, 'l': l} f64nonconsec = poly[ - (np.dtype(np.float64), 'nonconsecutive', 'load') + (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load') ].eval_with_dict(params) 
assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l f64nonconsec = poly[ - (np.dtype(np.float64), 'nonconsecutive', 'store') + (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'store') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -482,19 +488,19 @@ def test_gmem_access_counter_consec(): params = {'n': n, 'm': m, 'l': l} f64consec = poly[ - (np.dtype(np.float64), 'consecutive', 'load') + (np.dtype(np.float64), DataAccess(stride=1), 'load') ].eval_with_dict(params) f32consec = poly[ - (np.dtype(np.float32), 'consecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=1), 'load') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l f64consec = poly[ - (np.dtype(np.float64), 'consecutive', 'store') + (np.dtype(np.float64), DataAccess(stride=1), 'store') ].eval_with_dict(params) f32consec = poly[ - (np.dtype(np.float32), 'consecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=1), 'store') ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -588,17 +594,17 @@ def test_all_counters_parallel_matmul(): subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) f32coal = subscript_map[ - (np.dtype(np.float32), 'consecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=1), 'load') ].eval_with_dict(params) assert f32uncoal == n*m*l assert f32coal == n*m*l f32coal = subscript_map[ - (np.dtype(np.float32), 'consecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=1), 'store') ].eval_with_dict(params) assert f32coal == n*l