diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4efc13de4bc93b1024ad4ccd40d4d1ef5395643b..af8c8281cf7f5b7bbf0f00a0841a15c5f05f14dd 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1661,8 +1661,13 @@ Each line of output will look roughly like:: - dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. -- stride: An :class:`int` that specifies stride of the memory access. A stride - of 0 indicates a uniform access (i.e. all work-items access the same item). +- lid_strides: A :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies + local strides for each local id in the memory access index. 
Local ids not + found will not be present in ``lid_strides.keys()``. Uniform access (i.e. + work-items within a sub-group access the same item) is indicated by setting + ``lid_strides[0]=0``, but may also occur when no local id 0 is found, in + which case the 0 key will not be present in lid_strides. - direction: A :class:`str` that specifies the direction of memory access as **load** or **store**. @@ -1674,13 +1679,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, 'load', 'g', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, 'store', 'e', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, 'load', 'a', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, 'store', 'c', CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1698,7 +1703,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... 
).group_by('direction') @@ -1716,12 +1721,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: The lines of output above might look like:: - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1730,7 +1735,7 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec. 
Since we have not tagged any of the inames or parallelized the kernel across work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map` -considers the memory accesses *uniform*, so the *stride* of each access is 0. +finds no local id strides, leaving ``lid_strides`` empty for each memory access. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time. @@ -1740,12 +1745,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), 1, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), 1, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, a, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, store, e, workitem) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1755,13 +1760,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'load', 'g', CG.WORKITEM) ... 
].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,8 +1777,8 @@ array accesses has not changed: ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses with stride greater than 1, we'll -switch the inner and outer tags in our parallelization of the kernel: +To produce *nonconsecutive* array accesses with local id 0 stride greater than 1, +we'll switch the inner and outer tags in our parallelization of the kernel: .. doctest:: @@ -1781,12 +1786,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), 128, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), 128, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, a, workitem) : ... 
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, store, e, workitem) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1795,13 +1800,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1819,7 +1824,7 @@ criteria are more complicated than a simple list of allowable values: >>> def f(key): ... from loopy.types import to_loopy_type ... return key.dtype == to_loopy_type(np.float32) and \ - ... key.stride > 1 + ... 
key.lid_strides[0] > 1 >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict) >>> print(count) 2097152 diff --git a/loopy/statistics.py b/loopy/statistics.py index e27a0f482885658888c97081e4fc1d97fcd149fd..5e929b61830ef3dc286728ee63a659a0b16d19c3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -269,7 +269,7 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_map(knl) def filter_func(key): - return key.stride > 1 and key.stride <= 4: + return key.lid_strides[0] > 1 and key.lid_strides[0] <= 4 filtered_map = mem_map.filter_by_func(filter_func) tot = filtered_map.eval_and_sum(params) @@ -374,16 +374,16 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} s1_g_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], + mtype=['global'], lid_strides={0: 1}, direction=['load']).eval_and_sum(params) s2_g_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], + mtype=['global'], lid_strides={0: 2}, direction=['load']).eval_and_sum(params) s1_g_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], + mtype=['global'], lid_strides={0: 1}, direction=['store']).eval_and_sum(params) s2_g_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], + mtype=['global'], lid_strides={0: 2}, direction=['store']).eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -440,7 +440,7 @@ class ToCountMap(object): params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_map(knl) filtered_map = mem_map.filter_by(direction=['load'], - variable=['a','g']) + variable=['a', 'g']) tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to, e.g., predict performance) @@ -529,7 +529,7 @@ class Op(Record): count_granularity=count_granularity) def __hash__(self): - return hash(str(self)) + return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness @@ -553,10 +553,16 @@ class 
MemAccess(Record): A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. - .. attribute:: stride + .. attribute:: lid_strides - An :class:`int` that specifies stride of the memory access. A stride of - 0 indicates a uniform access (i.e. all work-items access the same item). + A :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that + specifies local strides for each local id in the memory access index. + Local ids not found will not be present in ``lid_strides.keys()``. + Uniform access (i.e. work-items within a sub-group access the same + item) is indicated by setting ``lid_strides[0]=0``, but may also occur + when no local id 0 is found, in which case the 0 key will not be + present in lid_strides. .. attribute:: direction @@ -583,12 +589,12 @@ class MemAccess(Record): """ - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, + def __init__(self, mtype=None, dtype=None, lid_strides=None, direction=None, variable=None, count_granularity=None): - #TODO currently giving all lmem access stride=None - if (mtype == 'local') and (stride is not None): - raise NotImplementedError("MemAccess: stride must be None when " + #TODO currently giving all lmem access lid_strides=None + if mtype == 'local' and lid_strides is not None: + raise NotImplementedError("MemAccess: lid_strides must be None when " "mtype is 'local'") #TODO currently giving all lmem access variable=None @@ -602,24 +608,26 @@ class MemAccess(Record): % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, + Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, direction=direction, variable=variable, count_granularity=count_granularity) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - stride=stride, direction=direction, variable=variable, - 
count_granularity=count_granularity) + lid_strides=lid_strides, direction=direction, + variable=variable, count_granularity=count_granularity) def __hash__(self): - return hash(str(self)) + # Note that this means lid_strides must be sorted in self.__repr__() + return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, - self.stride, + None if self.lid_strides is None else dict( + sorted(six.iteritems(self.lid_strides))), self.direction, self.variable, self.count_granularity) @@ -870,7 +878,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap() return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), stride=0, + dtype=self.type_inf(expr), lid_strides={}, variable=name, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) @@ -896,86 +904,81 @@ class GlobalMemAccessCounter(MemAccessCounter): from loopy.kernel.data import LocalIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - # find min tag axis - import sys - min_tag_axis = sys.maxsize - local_id_found = False + # find all local index tags and corresponding inames + lid_to_iname = {} for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): - local_id_found = True - if tag.axis < min_tag_axis: - min_tag_axis = tag.axis + lid_to_iname[tag.axis] = iname + + if not lid_to_iname: + + # no local id found, count as uniform access + # Note, a few different cases may be considered uniform: + # lid_strides={} if no local ids were found, + # lid_strides={1:1, 2:32} if no local id 0 was found, + # lid_strides={0:0, ...} if a local id 0 is found and its stride is 0 + warn_with_kernel(self.knl, "no_lid_found", + "GlobalSubscriptCounter: No local id found, " + "setting lid_strides to {}. 
Expression: %s" + % (expr)) - if not local_id_found: - # count as uniform access return ToCountMap({MemAccess( mtype='global', - dtype=self.type_inf(expr), stride=0, + dtype=self.type_inf(expr), lid_strides={}, variable=name, count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) - if min_tag_axis != 0: - warn_with_kernel(self.knl, "unknown_gmem_stride", - "GlobalSubscriptCounter: Memory access minimum " - "tag axis %d != 0, stride unknown, using " - "sys.maxsize." % (min_tag_axis)) - return ToCountMap({MemAccess( - mtype='global', - dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + # create lid_strides dict (strides are coefficents in flattened index) + # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C + # come from flattened index [... + C*lid2 + B*lid1 + A*lid0] - # get local_id associated with minimum tag axis - min_lid = None - for iname in my_inames: - tag = self.knl.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): - if tag.axis == min_tag_axis: - min_lid = iname - break # there will be only one min local_id - - # found local_id associated with minimum tag axis - - total_stride = 0 - # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable - for idx, axis_tag in zip(index, array.dim_tags): - from loopy.symbolic import simplify_using_aff - from loopy.diagnostic import ExpressionNotAffineError - try: - coeffs = CoefficientCollector()( - simplify_using_aff(self.knl, idx)) - except ExpressionNotAffineError: - total_stride = None - break - # check if he contains the lid 0 guy - try: - coeff_min_lid = coeffs[Variable(min_lid)] - except KeyError: - # does not contain min_lid - continue - # found coefficient of min_lid - # now determine stride - if isinstance(axis_tag, FixedStrideArrayDimTag): - stride = 
axis_tag.stride - else: - continue + lid_strides = {} + + for ltag, iname in six.iteritems(lid_to_iname): + ltag_stride = 0 + # check coefficient of this lid for each axis + for idx, axis_tag in zip(index, array.dim_tags): + + from loopy.symbolic import simplify_using_aff + from loopy.diagnostic import ExpressionNotAffineError + try: + coeffs = CoefficientCollector()( + simplify_using_aff(self.knl, idx)) + except ExpressionNotAffineError: + ltag_stride = None + break + + # check if idx contains this lid + try: + coeff_lid = coeffs[Variable(lid_to_iname[ltag])] + except KeyError: + # idx does not contain this lid + continue + + # found coefficient of this lid + # now determine stride + if isinstance(axis_tag, FixedStrideArrayDimTag): + stride = axis_tag.stride + else: + continue - total_stride += stride*coeff_min_lid + ltag_stride += stride*coeff_lid + lid_strides[ltag] = ltag_stride - count_granularity = CountGranularity.WORKITEM if total_stride is not 0 \ - else CountGranularity.SUBGROUP + count_granularity = CountGranularity.WORKITEM if ( + 0 in lid_strides and lid_strides[0] != 0 + ) else CountGranularity.SUBGROUP return ToCountMap({MemAccess( mtype='global', dtype=self.type_inf(expr), - stride=total_stride, + lid_strides=dict(sorted(six.iteritems(lid_strides))), variable=name, count_granularity=count_granularity ): 1} @@ -1386,7 +1389,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_g_ld_a = mem_map[MemAccess( mtype='global', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='load', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1394,7 +1397,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_g_st_a = mem_map[MemAccess( mtype='global', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='store', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1402,7 +1405,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, 
f32_s1_l_ld_x = mem_map[MemAccess( mtype='local', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='load', variable='x', count_granularity=CountGranularity.WORKITEM) @@ -1410,7 +1413,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, f32_s1_l_st_x = mem_map[MemAccess( mtype='local', dtype=np.float32, - stride=1, + lid_strides={0: 1}, direction='store', variable='x', count_granularity=CountGranularity.WORKITEM) @@ -1532,14 +1535,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, + access_counter_l(insn.assignee) ).with_set_attributes(direction="store") - # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): access_map = ( access_map + ToCountMap({key: val}) * get_insn_count(knl, insn.id, key.count_granularity)) - #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee.count_map): @@ -1547,7 +1548,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map + ToCountMap({key: val}) * get_insn_count(knl, insn.id, key.count_granularity)) - # for now, don't count writes to local mem + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass else: @@ -1560,7 +1561,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (MemAccess( mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, + lid_strides=mem_access.lid_strides, direction=mem_access.direction, variable=mem_access.variable, count_granularity=mem_access.count_granularity), diff --git a/test/test_statistics.py b/test/test_statistics.py index ea0bdb62bb75d8a5bcf7dd987c00c33b848091fd..e42c43f60179321114fb695978cc1c91f182e8ee 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -269,19 +269,19 @@ def test_mem_access_counter_basic(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, 
- stride=0, direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -290,11 +290,11 @@ def test_mem_access_counter_basic(): assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -328,11 +328,11 @@ def test_mem_access_counter_reduction(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -340,7 +340,7 @@ def test_mem_access_counter_reduction(): assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - 
stride=0, direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -430,19 +430,19 @@ def test_mem_access_counter_specialops(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -451,11 +451,11 @@ def test_mem_access_counter_specialops(): assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -502,19 +502,19 @@ def test_mem_access_counter_bitwise(): subgroups_per_group = div_ceil(group_size, subgroup_size) i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='a', + lid_strides={}, direction='load', variable='a', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, 
direction='load', variable='b', + lid_strides={}, direction='load', variable='b', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - stride=0, direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -522,11 +522,11 @@ def test_mem_access_counter_bitwise(): assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='c', + lid_strides={}, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) @@ -567,24 +567,24 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g', + lid_strides={}, direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h', + lid_strides={}, direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='x', + lid_strides={}, direction='load', variable='x', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', + lid_strides={0: Variable('m')}, 
direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -611,11 +611,11 @@ def test_mem_access_counter_mixed(): assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e', + lid_strides={}, direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m'), direction='store', + lid_strides={0: Variable('m')}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -655,22 +655,22 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', + lid_strides={0: Variable('m')}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -678,12 +678,12 @@ def test_mem_access_counter_nonconsec(): assert f32nonconsec == 3*n*m*ell 
f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='store', + lid_strides={0: Variable('m')}, direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m')*Variable('ell'), + lid_strides={0: Variable('m')*Variable('ell')}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -694,20 +694,20 @@ def test_mem_access_counter_nonconsec(): subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', - np.float64, stride=Variable('m'), + np.float64, lid_strides={0: Variable('m')}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', - np.float64, stride=Variable('m'), + np.float64, lid_strides={0: Variable('m')}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='a', count_granularity=CG.WORKITEM) @@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec(): f32nonconsec += mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), + lid_strides={0: Variable('m')*Variable('ell')}, direction='load', variable='b', count_granularity=CG.WORKITEM) @@ -747,30 +747,30 @@ def test_mem_access_counter_consec(): params = {'n': n, 'm': m, 'ell': ell} f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='g', + lid_strides={0: 1}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='h', + lid_strides={0: 1}, direction='load', variable='h', count_granularity=CG.WORKITEM) 
].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a', + lid_strides={0: 1}, direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=1, direction='load', variable='b', + lid_strides={0: 1}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='store', variable='e', + lid_strides={0: 1}, direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c', + lid_strides={0: 1}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell @@ -853,7 +853,6 @@ def test_barrier_counter_barriers(): def test_all_counters_parallel_matmul(): - bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i