From 713a8244768b140202dac2389d71c01725e5c545 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 3 Mar 2018 23:16:11 -0600 Subject: [PATCH 1/8] now collecting all gmem strides for all local ids in the index --- loopy/statistics.py | 139 ++++++++++++++++++++-------------------- test/test_statistics.py | 101 +++++++++++++++-------------- 2 files changed, 119 insertions(+), 121 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index e27a0f482..d04b84e3c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -583,12 +583,12 @@ class MemAccess(Record): """ - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, + def __init__(self, mtype=None, dtype=None, lid_strides=None, direction=None, variable=None, count_granularity=None): - #TODO currently giving all lmem access stride=None - if (mtype == 'local') and (stride is not None): - raise NotImplementedError("MemAccess: stride must be None when " + #TODO currently giving all lmem access lid_strides=None + if mtype == 'local' and lid_strides is not None: + raise NotImplementedError("MemAccess: lid_strides must be None when " "mtype is 'local'") #TODO currently giving all lmem access variable=None @@ -602,14 +602,14 @@ class MemAccess(Record): % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, + Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, direction=direction, variable=variable, count_granularity=count_granularity) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - stride=stride, direction=direction, variable=variable, - count_granularity=count_granularity) + lid_strides=lid_strides, direction=direction, + variable=variable, count_granularity=count_granularity) def __hash__(self): return hash(str(self)) @@ -619,7 +619,7 @@ class MemAccess(Record): return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, - self.stride, + self.lid_strides, self.direction, self.variable, self.count_granularity) @@ -870,7 +870,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap() return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), stride=0, + dtype=self.type_inf(expr), lid_strides=[], variable=name, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) @@ -896,86 +896,87 @@ class GlobalMemAccessCounter(MemAccessCounter): from loopy.kernel.data import LocalIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - # find min tag axis - import sys - min_tag_axis = sys.maxsize - local_id_found = False + # find all local index tags and corresponding inames + lid_to_iname = {} for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): - local_id_found = True - if tag.axis < min_tag_axis: - min_tag_axis = tag.axis + lid_to_iname[tag.axis] = iname + + if not lid_to_iname: + + # no local id found, count as uniform access + # Note: + # lid_strides=[] when no local ids were found, + # lid_strides=[0, ...] if any local id is found and the lid0 stride is 0, + # either because no lid0 is found or because the stride of lid0 is 0 + warn_with_kernel(self.knl, "no_lid_found", + "GlobalSubscriptCounter: No local id found, " + "setting lid_strides to []. Expression: %s" + % (expr)) - if not local_id_found: - # count as uniform access return ToCountMap({MemAccess( mtype='global', - dtype=self.type_inf(expr), stride=0, + dtype=self.type_inf(expr), lid_strides=[], variable=name, count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) - if min_tag_axis != 0: - warn_with_kernel(self.knl, "unknown_gmem_stride", - "GlobalSubscriptCounter: Memory access minimum " - "tag axis %d != 0, stride unknown, using " - "sys.maxsize." % (min_tag_axis)) - return ToCountMap({MemAccess( - mtype='global', - dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) - - # get local_id associated with minimum tag axis - min_lid = None - for iname in my_inames: - tag = self.knl.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): - if tag.axis == min_tag_axis: - min_lid = iname - break # there will be only one min local_id - # found local_id associated with minimum tag axis + # create lid_strides dict (strides are coefficents in flattened index) + # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C + # come from flattened index [... + C*lid2 + B*lid1 + A*lid0] - total_stride = 0 - # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable - for idx, axis_tag in zip(index, array.dim_tags): - from loopy.symbolic import simplify_using_aff - from loopy.diagnostic import ExpressionNotAffineError - try: - coeffs = CoefficientCollector()( - simplify_using_aff(self.knl, idx)) - except ExpressionNotAffineError: - total_stride = None - break - # check if he contains the lid 0 guy - try: - coeff_min_lid = coeffs[Variable(min_lid)] - except KeyError: - # does not contain min_lid - continue - # found coefficient of min_lid - # now determine stride - if isinstance(axis_tag, FixedStrideArrayDimTag): - stride = axis_tag.stride - else: - continue + lid_strides = {} + + for ltag, iname in six.iteritems(lid_to_iname): + ltag_stride = 0 + # check coefficient of this lid for each axis + for idx, axis_tag in zip(index, array.dim_tags): + + from loopy.symbolic import simplify_using_aff + from loopy.diagnostic import ExpressionNotAffineError + try: + coeffs = CoefficientCollector()( + simplify_using_aff(self.knl, idx)) + except ExpressionNotAffineError: + ltag_stride = None + break + + # check if idx contains this lid + try: + coeff_lid = coeffs[Variable(lid_to_iname[ltag])] + except KeyError: + # idx does not contain this lid + continue + + # found coefficient of this lid + # now determine stride + if isinstance(axis_tag, FixedStrideArrayDimTag): + stride = axis_tag.stride + else: + continue - total_stride += stride*coeff_min_lid + ltag_stride += stride*coeff_lid + lid_strides[ltag] = ltag_stride - count_granularity = CountGranularity.WORKITEM if total_stride is not 0 \ + # insert 0s for coeffs of missing *lesser* lids + for i in range(max(lid_strides.keys())+1): + if i not in lid_strides.keys(): + lid_strides[i] = 0 + + count_granularity = CountGranularity.WORKITEM if lid_strides[0] != 0 \ else CountGranularity.SUBGROUP return ToCountMap({MemAccess( mtype='global', dtype=self.type_inf(expr), - stride=total_stride, + lid_strides=[lid_strides[i] + for i in sorted(lid_strides)], variable=name, count_granularity=count_granularity ): 1} @@ -1532,14 +1533,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, + access_counter_l(insn.assignee) ).with_set_attributes(direction="store") - # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): access_map = ( access_map + ToCountMap({key: val}) * get_insn_count(knl, insn.id, key.count_granularity)) - #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee.count_map): @@ -1547,7 +1546,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map + ToCountMap({key: val}) * get_insn_count(knl, insn.id, key.count_granularity)) - # for now, don't count writes to local mem + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass else: @@ -1560,7 +1559,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (MemAccess( mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, + lid_strides=mem_access.lid_strides, direction=mem_access.direction, variable=mem_access.variable, count_granularity=mem_access.count_granularity), diff --git a/test/test_statistics.py b/test/test_statistics.py index ea0bdb62b..732b9afe2 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -269,19 +269,19 @@ def test_mem_access_counter_basic(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a', + lid_strides=[], direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides=[], direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g', + lid_strides=[], direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h', + lid_strides=[], direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -290,11 +290,11 @@ def test_mem_access_counter_basic(): assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c', + lid_strides=[], direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='store', variable='e', + lid_strides=[], direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -328,11 +328,11 @@ def test_mem_access_counter_reduction(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a', + lid_strides=[], direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides=[], direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -340,7 +340,7 @@ def test_mem_access_counter_reduction(): assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c', + lid_strides=[], direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -430,19 +430,19 @@ def test_mem_access_counter_specialops(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a', + lid_strides=[], direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides=[], direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='g', + lid_strides=[], direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='h', + lid_strides=[], direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -451,11 +451,11 @@ def test_mem_access_counter_specialops(): assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='store', variable='c', + lid_strides=[], direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e', + lid_strides=[], direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -502,19 +502,19 @@ def test_mem_access_counter_bitwise(): subgroups_per_group = div_ceil(group_size, subgroup_size) i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='a', + lid_strides=[], direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='b', + lid_strides=[], direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='g', + lid_strides=[], direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - stride=0, direction='load', variable='h', + lid_strides=[], direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -522,11 +522,11 @@ def test_mem_access_counter_bitwise(): assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='c', + lid_strides=[], direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='e', + lid_strides=[], direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -567,24 +567,24 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g', + lid_strides=[], direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h', + lid_strides=[], direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='x', + lid_strides=[], direction='load', variable='x', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', + lid_strides=[Variable('m')], direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', + lid_strides=[Variable('m')], direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -611,11 +611,11 @@ def test_mem_access_counter_mixed(): assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e', + lid_strides=[], direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m'), direction='store', + lid_strides=[Variable('m')], direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -655,22 +655,22 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', + lid_strides=[Variable('m')], direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', + lid_strides=[Variable('m')], direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides=[Variable('m')*Variable('ell')], direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides=[Variable('m')*Variable('ell')], direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -678,12 +678,12 @@ def test_mem_access_counter_nonconsec(): assert f32nonconsec == 3*n*m*ell f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='store', + lid_strides=[Variable('m')], direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m')*Variable('ell'), + lid_strides=[Variable('m')*Variable('ell')], direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -694,20 +694,20 @@ def test_mem_access_counter_nonconsec(): subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', - np.float64, stride=Variable('m'), + np.float64, lid_strides=[Variable('m')], direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', - np.float64, stride=Variable('m'), + np.float64, lid_strides=[Variable('m')], direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides=[Variable('m')*Variable('ell')], direction='load', variable='a', count_granularity=CG.WORKITEM) @@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec(): f32nonconsec += mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides=[Variable('m')*Variable('ell')], direction='load', variable='b', count_granularity=CG.WORKITEM) @@ -747,30 +747,30 @@ def test_mem_access_counter_consec(): params = {'n': n, 'm': m, 'ell': ell} f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='g', + lid_strides=[1], direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='h', + lid_strides=[1], direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a', + lid_strides=[1], direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=1, direction='load', variable='b', + lid_strides=[1], direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='store', variable='e', + lid_strides=[1], direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c', + lid_strides=[1], direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell @@ -853,7 +853,6 @@ def test_barrier_counter_barriers(): def test_all_counters_parallel_matmul(): - bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i Date: Sun, 4 Mar 2018 01:09:53 -0600 Subject: [PATCH 2/8] fixing flake8 issues --- test/test_statistics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 732b9afe2..cc81e1592 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -897,20 +897,20 @@ def test_all_counters_parallel_matmul(): subgroup_size=32) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('ell')], direction='load', variable='b', - count_granularity=CG.WORKITEM) + lid_strides=[1, Variable('ell')], direction='load', + variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('m')], direction='load', variable='a', - count_granularity=CG.WORKITEM) + lid_strides=[1, Variable('m')], direction='load', + variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('ell')], direction='store', variable='c', - count_granularity=CG.WORKITEM) + lid_strides=[1, Variable('ell')], direction='store', + variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32coal == n*ell -- GitLab From fa12cdb412ec5ba533fb0bef596c0e404468db4b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 02:10:03 -0600 Subject: [PATCH 3/8] switched lid_strides from list to dict in order to distinguish between two cases: (1) where an lid was found with stride 0 and (2) where no lid was found --- loopy/statistics.py | 36 +++++++------- test/test_statistics.py | 102 ++++++++++++++++++++-------------------- 2 files changed, 71 insertions(+), 67 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d04b84e3c..91d15e7e0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -529,7 +529,7 @@ class Op(Record): count_granularity=count_granularity) def __hash__(self): - return hash(str(self)) + return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness @@ -612,14 +612,16 @@ class MemAccess(Record): variable=variable, count_granularity=count_granularity) def __hash__(self): - return hash(str(self)) + # Note that this means lid_strides must be sorted in self.__repr__() + return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, - self.lid_strides, + None if self.lid_strides is None else dict( + sorted(six.iteritems(self.lid_strides))), self.direction, self.variable, self.count_granularity) @@ -870,7 +872,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap() return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), lid_strides=[], + dtype=self.type_inf(expr), lid_strides={}, variable=name, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) @@ -906,18 +908,18 @@ class GlobalMemAccessCounter(MemAccessCounter): if not lid_to_iname: # no local id found, count as uniform access - # Note: - # lid_strides=[] when no local ids were found, - # lid_strides=[0, ...] if any local id is found and the lid0 stride is 0, - # either because no lid0 is found or because the stride of lid0 is 0 + # Note, a few different cases may be considered uniform: + # lid_strides={} if no local ids were found, + # lid_strides={1:1, 2:32} if no local id 0 was found, + # lid_strides={0:0, ...} if a local id 0 is found and its stride is 0 warn_with_kernel(self.knl, "no_lid_found", "GlobalSubscriptCounter: No local id found, " - "setting lid_strides to []. Expression: %s" + "setting lid_strides to {}. Expression: %s" % (expr)) return ToCountMap({MemAccess( mtype='global', - dtype=self.type_inf(expr), lid_strides=[], + dtype=self.type_inf(expr), lid_strides={}, variable=name, count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) @@ -965,18 +967,18 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides[ltag] = ltag_stride # insert 0s for coeffs of missing *lesser* lids - for i in range(max(lid_strides.keys())+1): - if i not in lid_strides.keys(): - lid_strides[i] = 0 + #for i in range(max(lid_strides.keys())+1): + # if i not in lid_strides.keys(): + # lid_strides[i] = 0 - count_granularity = CountGranularity.WORKITEM if lid_strides[0] != 0 \ - else CountGranularity.SUBGROUP + count_granularity = CountGranularity.WORKITEM if ( + 0 in lid_strides and lid_strides[0] != 0 + ) else CountGranularity.SUBGROUP return ToCountMap({MemAccess( mtype='global', dtype=self.type_inf(expr), - lid_strides=[lid_strides[i] - for i in sorted(lid_strides)], + lid_strides=dict(sorted(six.iteritems(lid_strides))), variable=name, count_granularity=count_granularity ): 1} diff --git a/test/test_statistics.py b/test/test_statistics.py index cc81e1592..e42c43f60 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -269,19 +269,19 @@ def test_mem_access_counter_basic(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -290,11 +290,11 @@ def test_mem_access_counter_basic(): assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[], direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides=[], direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -328,11 +328,11 @@ def test_mem_access_counter_reduction(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -340,7 +340,7 @@ def test_mem_access_counter_reduction(): assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[], direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -430,19 +430,19 @@ def test_mem_access_counter_specialops(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32 = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides=[], direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides=[], direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -451,11 +451,11 @@ def test_mem_access_counter_specialops(): assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -502,19 +502,19 @@ def test_mem_access_counter_bitwise(): subgroups_per_group = div_ceil(group_size, subgroup_size) i32 = mem_map[lp.MemAccess('global', np.int32, - lid_strides=[], direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides=[], direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides=[], direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - lid_strides=[], direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -522,11 +522,11 @@ def test_mem_access_counter_bitwise(): assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, - lid_strides=[], direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides=[], direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -567,24 +567,24 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[], direction='load', variable='x', + lid_strides={}, direction='load', variable='x', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[Variable('m')], direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[Variable('m')], direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -611,11 +611,11 @@ def test_mem_access_counter_mixed(): assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[], direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[Variable('m')], direction='store', + lid_strides={0: Variable('m')}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -655,22 +655,22 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[Variable('m')], direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - lid_strides=[Variable('m')], direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[Variable('m')*Variable('ell')], + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[Variable('m')*Variable('ell')], + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -678,12 +678,12 @@ def test_mem_access_counter_nonconsec(): assert f32nonconsec == 3*n*m*ell f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[Variable('m')], direction='store', + lid_strides={0: Variable('m')}, direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[Variable('m')*Variable('ell')], + lid_strides={0: Variable('m')*Variable('ell')}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -694,20 +694,20 @@ def test_mem_access_counter_nonconsec(): subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', - np.float64, lid_strides=[Variable('m')], + np.float64, lid_strides={0: Variable('m')}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', - np.float64, lid_strides=[Variable('m')], + np.float64, lid_strides={0: Variable('m')}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - lid_strides=[Variable('m')*Variable('ell')], + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='a', count_granularity=CG.WORKITEM) @@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec(): f32nonconsec += mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - lid_strides=[Variable('m')*Variable('ell')], + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='b', count_granularity=CG.WORKITEM) @@ -747,30 +747,30 @@ def test_mem_access_counter_consec(): params = {'n': n, 'm': m, 'ell': ell} f64consec = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[1], direction='load', variable='g', + lid_strides={0: 1}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, - lid_strides=[1], direction='load', variable='h', + lid_strides={0: 1}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[1], direction='load', variable='a', + lid_strides={0: 1}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides=[1], direction='load', variable='b', + lid_strides={0: 1}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, - lid_strides=[1], direction='store', variable='e', + lid_strides={0: 1}, direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - lid_strides=[1], direction='store', variable='c', + lid_strides={0: 1}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell @@ -897,11 +897,12 @@ def test_all_counters_parallel_matmul(): subgroup_size=32) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('ell')], direction='load', - variable='b', count_granularity=CG.WORKITEM) + lid_strides={0: 1, 1: Variable('ell')}, + direction='load', variable='b', + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('m')], direction='load', + lid_strides={0: 1, 1: Variable('m')}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -909,8 +910,9 @@ def test_all_counters_parallel_matmul(): assert f32s1la == n*m*ell/bsize f32coal = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides=[1, Variable('ell')], direction='store', - variable='c', count_granularity=CG.WORKITEM) + lid_strides={0: 1, 1: Variable('ell')}, + direction='store', variable='c', + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32coal == n*ell @@ -1056,7 +1058,7 @@ def test_summations_and_filters(): assert f64ops_all == n*m def func_filter(key): - return key.lid_strides == [] and key.dtype == to_loopy_type(np.float64) and \ + return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) -- GitLab From 19525df93ff6e82093548f0e1f3aa60b7137a8cb Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 02:37:15 -0600 Subject: [PATCH 4/8] updated docstrings --- loopy/statistics.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 91d15e7e0..07f29d8ab 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -269,7 +269,7 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_map(knl) def filter_func(key): - return key.stride > 1 and key.stride <= 4: + return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4: filtered_map = mem_map.filter_by_func(filter_func) tot = filtered_map.eval_and_sum(params) @@ -374,16 +374,16 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} s1_g_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], + mtype=['global'], lid_strides={0: 1}, direction=['load']).eval_and_sum(params) s2_g_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], + mtype=['global'], lid_strides={0: 2}, direction=['load']).eval_and_sum(params) s1_g_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], + mtype=['global'], lid_strides={0: 1}, direction=['store']).eval_and_sum(params) s2_g_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], + mtype=['global'], lid_strides={0: 2}, direction=['store']).eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -440,7 +440,7 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_map(knl) filtered_map = mem_map.filter_by(direction=['load'], - variable=['a','g']) + variable=['a', 'g']) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -553,10 +553,16 @@ class MemAccess(Record): A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. - .. attribute:: stride + .. attribute:: lid_strides - An :class:`int` that specifies stride of the memory access. A stride of - 0 indicates a uniform access (i.e. all work-items access the same item). + An :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Variable` or :class:`int` **}** that + specifies local strides for each local id in the memory access index. + Local ids not found will not be present in ``lid_strides.keys()``. + Uniform access (i.e. work-items within a sub-group access the same + item) is indicated by setting ``lid_strides[0]=0``, but may also occur + when no local id 0 is found, in which case the 0 key will not be + present in lid_strides. .. attribute:: direction @@ -966,11 +972,6 @@ class GlobalMemAccessCounter(MemAccessCounter): ltag_stride += stride*coeff_lid lid_strides[ltag] = ltag_stride - # insert 0s for coeffs of missing *lesser* lids - #for i in range(max(lid_strides.keys())+1): - # if i not in lid_strides.keys(): - # lid_strides[i] = 0 - count_granularity = CountGranularity.WORKITEM if ( 0 in lid_strides and lid_strides[0] != 0 ) else CountGranularity.SUBGROUP @@ -1389,7 +1390,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_g_ld_a = mem_map[MemAccess( mtype='global', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='load', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1397,7 +1398,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_g_st_a = mem_map[MemAccess( mtype='global', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='store', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1405,7 +1406,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_l_ld_x = mem_map[MemAccess( mtype='local', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='load', variable='x', count_granularity=CountGranularity.WORKITEM) @@ -1413,7 +1414,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_l_st_x = mem_map[MemAccess( mtype='local', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='store', variable='x', count_granularity=CountGranularity.WORKITEM) -- GitLab From a42e7ffbc230f5eca7ce591eb50c95202e8102e8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 02:48:37 -0600 Subject: [PATCH 5/8] fixing grammar in doc string --- loopy/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 07f29d8ab..b9d147fe4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -555,7 +555,7 @@ class MemAccess(Record): .. attribute:: lid_strides - An :class:`dict` of **{** :class:`int` **:** + A :class:`dict` of **{** :class:`int` **:** :class:`pymbolic.primitives.Variable` or :class:`int` **}** that specifies local strides for each local id in the memory access index. Local ids not found will not be present in ``lid_strides.keys()``. -- GitLab From 24abd998b3cf60ae4183c17dd12aefdd6061e32e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 02:58:36 -0600 Subject: [PATCH 6/8] updated tutorial for lid_strides change --- doc/tutorial.rst | 87 +++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4efc13de4..3c8ca28fd 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1661,8 +1661,13 @@ Each line of output will look roughly like:: - dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. -- stride: An :class:`int` that specifies stride of the memory access. A stride - of 0 indicates a uniform access (i.e. all work-items access the same item). +- stride: A :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Variable` or :class:`int` **}** that specifies + local strides for each local id in the memory access index. Local ids not + found will not be present in ``lid_strides.keys()``. Uniform access (i.e. + work-items within a sub-group access the same item) is indicated by setting + ``lid_strides[0]=0``, but may also occur when no local id 0 is found, in + which case the 0 key will not be present in lid_strides. - direction: A :class:`str` that specifies the direction of memory access as **load** or **store**. @@ -1674,13 +1679,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, 'load', 'g', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, 'store', 'e', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, 'load', 'a', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, 'store', 'c', CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1698,7 +1703,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') @@ -1716,12 +1721,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: The lines of output above might look like:: - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1730,7 +1735,7 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec. Since we have not tagged any of the inames or parallelized the kernel across work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map` -considers the memory accesses *uniform*, so the *stride* of each access is 0. +finds no local id strides, leaving ``lid_strides`` empty for each memory access. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time. @@ -1740,12 +1745,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), 1, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, a, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, store, e, workitem) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1755,13 +1760,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,8 +1777,8 @@ array accesses has not changed: ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses with stride greater than 1, we'll -switch the inner and outer tags in our parallelization of the kernel: +To produce *nonconsecutive* array accesses with local id 0 stride greater than 1, +we'll switch the inner and outer tags in our parallelization of the kernel: .. doctest:: @@ -1781,12 +1786,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), 128, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, a, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, store, e, workitem) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1795,13 +1800,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1819,7 +1824,7 @@ criteria are more complicated than a simple list of allowable values: >>> def f(key): ... from loopy.types import to_loopy_type ... return key.dtype == to_loopy_type(np.float32) and \ - ... key.stride > 1 + ... key.lid_strides[0] > 1 >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict) >>> print(count) 2097152 -- GitLab From 358f2a8b1097b80f3965cadc7d97e3aeda1e18b5 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 03:00:28 -0600 Subject: [PATCH 7/8] removed extra blank line --- loopy/statistics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b9d147fe4..b27888b1d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -930,7 +930,6 @@ class GlobalMemAccessCounter(MemAccessCounter): count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) - # create lid_strides dict (strides are coefficents in flattened index) # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C # come from flattened index [... + C*lid2 + B*lid1 + A*lid0] -- GitLab From 5f433a50fef21f16c7693f6dbf85d52469927621 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 4 Mar 2018 03:52:25 -0600 Subject: [PATCH 8/8] corrected lid_strides doc --- doc/tutorial.rst | 2 +- loopy/statistics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 3c8ca28fd..af8c8281c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1662,7 +1662,7 @@ Each line of output will look roughly like:: data type accessed. - stride: A :class:`dict` of **{** :class:`int` **:** - :class:`pymbolic.primitives.Variable` or :class:`int` **}** that specifies + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies local strides for each local id in the memory access index. Local ids not found will not be present in ``lid_strides.keys()``. Uniform access (i.e. work-items within a sub-group access the same item) is indicated by setting diff --git a/loopy/statistics.py b/loopy/statistics.py index b27888b1d..5e929b618 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -556,7 +556,7 @@ class MemAccess(Record): .. attribute:: lid_strides A :class:`dict` of **{** :class:`int` **:** - :class:`pymbolic.primitives.Variable` or :class:`int` **}** that + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies local strides for each local id in the memory access index. Local ids not found will not be present in ``lid_strides.keys()``. Uniform access (i.e. work-items within a sub-group access the same -- GitLab