From 561dbbbb590bbb731bb93ac0b00f4c0d2a6fdef6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 2 Apr 2018 14:16:04 -0500 Subject: [PATCH 1/3] collecting global id strides in MemAccessCounter, updated tests and tutorial --- doc/tutorial.rst | 88 +++++++------- loopy/statistics.py | 103 +++++++++++----- test/test_statistics.py | 256 +++++++++++++++++++++++++--------------- 3 files changed, 283 insertions(+), 164 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index af8c8281c..5d4e972ee 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1661,7 +1661,7 @@ Each line of output will look roughly like:: - dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the data type accessed. 
-- stride: A :class:`dict` of **{** :class:`int` **:** +- lid_strides: A :class:`dict` of **{** :class:`int` **:** :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies local strides for each local id in the memory access index. Local ids not found will not be present in ``lid_strides.keys()``. Uniform access (i.e. @@ -1669,6 +1669,11 @@ Each line of output will look roughly like:: ``lid_strides[0]=0``, but may also occur when no local id 0 is found, in which case the 0 key will not be present in lid_strides. +- gid_strides: A :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies + global strides for each global id in the memory access index. Global ids not + found will not be present in ``gid_strides.keys()``. + - direction: A :class:`str` that specifies the direction of memory access as **load** or **store**. @@ -1679,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, 'load', 'g', CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, 'store', 'e', CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, 'load', 'a', CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, 'store', 'c', CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... 
(f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1703,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None, None) : ... - MemAccess(None, None, None, store, None, None) : ... + MemAccess(None, None, None, None, load, None, None) : ... + MemAccess(None, None, None, None, store, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1721,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: The lines of output above might look like:: - MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + 
MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1735,9 +1740,10 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec. Since we have not tagged any of the inames or parallelized the kernel across work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map` -finds no local id strides, leaving ``lid_strides`` empty for each memory access. -Now we'll parallelize the kernel and count the array accesses again. The -resulting :class:`islpy.PwQPolynomial` will be more complicated this time. +finds no local or global id strides, leaving ``lid_strides`` and ``gid_strides`` +empty for each memory access. Now we'll parallelize the kernel and count the array +accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated +this time. .. doctest:: @@ -1745,12 +1751,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, g, workitem) : ... 
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1760,13 +1766,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... 
(f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1786,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1800,13 +1806,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM) ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5e929b618..7cb70026d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -564,6 +564,13 @@ class MemAccess(Record): when no local id 0 is found, in which case the 0 key will not be present in lid_strides. + .. attribute:: gid_strides + + A :class:`dict` of **{** :class:`int` **:** + :class:`pymbolic.primitives.Expression` or :class:`int` **}** that + specifies global strides for each global id in the memory access index. + Global ids not found will not be present in ``gid_strides.keys()``. + ..
attribute:: direction A :class:`str` that specifies the direction of memory access as @@ -589,14 +596,19 @@ class MemAccess(Record): """ - def __init__(self, mtype=None, dtype=None, lid_strides=None, direction=None, - variable=None, count_granularity=None): + def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, + direction=None, variable=None, count_granularity=None): #TODO currently giving all lmem access lid_strides=None if mtype == 'local' and lid_strides is not None: raise NotImplementedError("MemAccess: lid_strides must be None when " "mtype is 'local'") + #TODO currently giving all lmem access gid_strides=None + if mtype == 'local' and gid_strides is not None: + raise NotImplementedError("MemAccess: gid_strides must be None when " + "mtype is 'local'") + #TODO currently giving all lmem access variable=None if (mtype == 'local') and (variable is not None): raise NotImplementedError("MemAccess: variable must be None when " @@ -609,25 +621,29 @@ class MemAccess(Record): if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, - direction=direction, variable=variable, - count_granularity=count_granularity) + gid_strides=gid_strides, direction=direction, + variable=variable, count_granularity=count_granularity) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, direction=direction, - variable=variable, count_granularity=count_granularity) + lid_strides=lid_strides, gid_strides=gid_strides, + direction=direction, variable=variable, + count_granularity=count_granularity) def __hash__(self): - # Note that this means lid_strides must be sorted in self.__repr__() + # Note that this means lid_strides and gid_strides must be sorted + # in self.__repr__() return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( + return
"MemAccess(%s, %s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, None if self.lid_strides is None else dict( sorted(six.iteritems(self.lid_strides))), + None if self.gid_strides is None else dict( + sorted(six.iteritems(self.gid_strides))), self.direction, self.variable, self.count_granularity) @@ -879,7 +895,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), lid_strides={}, - variable=name, + gid_strides={}, variable=name, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) @@ -901,34 +917,18 @@ class GlobalMemAccessCounter(MemAccessCounter): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, GroupIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - # find all local index tags and corresponding inames + # find all local and global index tags and corresponding inames lid_to_iname = {} + gid_to_iname = {} for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): lid_to_iname[tag.axis] = iname - - if not lid_to_iname: - - # no local id found, count as uniform access - # Note, a few different cases may be considered uniform: - # lid_strides={} if no local ids were found, - # lid_strides={1:1, 2:32} if no local id 0 was found, - # lid_strides={0:0, ...} if a local id 0 is found and its stride is 0 - warn_with_kernel(self.knl, "no_lid_found", - "GlobalSubscriptCounter: No local id found, " - "setting lid_strides to {}. 
Expression: %s" - % (expr)) - - return ToCountMap({MemAccess( - mtype='global', - dtype=self.type_inf(expr), lid_strides={}, - variable=name, - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.index) + elif isinstance(tag, GroupIndexTag): + gid_to_iname[tag.axis] = iname # create lid_strides dict (strides are coefficents in flattened index) # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C @@ -939,7 +939,6 @@ class GlobalMemAccessCounter(MemAccessCounter): from pymbolic.primitives import Variable lid_strides = {} - for ltag, iname in six.iteritems(lid_to_iname): ltag_stride = 0 # check coefficient of this lid for each axis @@ -971,6 +970,42 @@ class GlobalMemAccessCounter(MemAccessCounter): ltag_stride += stride*coeff_lid lid_strides[ltag] = ltag_stride + # create gid_strides dict (strides are coefficients in flattened index) + # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C + # come from flattened index [... + C*gid2 + B*gid1 + A*gid0] + + gid_strides = {} + for gtag, iname in six.iteritems(gid_to_iname): + gtag_stride = 0 + # check coefficient of this gid for each axis + for idx, axis_tag in zip(index, array.dim_tags): + + from loopy.symbolic import simplify_using_aff + from loopy.diagnostic import ExpressionNotAffineError + try: + coeffs = CoefficientCollector()( + simplify_using_aff(self.knl, idx)) + except ExpressionNotAffineError: + gtag_stride = None + break + + # check if idx contains this gid + try: + coeff_gid = coeffs[Variable(gid_to_iname[gtag])] + except KeyError: + # idx does not contain this gid + continue + + # found coefficient of this gid + # now determine stride + if isinstance(axis_tag, FixedStrideArrayDimTag): + stride = axis_tag.stride + else: + continue + + gtag_stride += stride*coeff_gid + gid_strides[gtag] = gtag_stride + count_granularity = CountGranularity.WORKITEM if ( 0 in lid_strides and lid_strides[0] != 0 ) else CountGranularity.SUBGROUP @@ -979,6 +1014,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
mtype='global', dtype=self.type_inf(expr), lid_strides=dict(sorted(six.iteritems(lid_strides))), + gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, count_granularity=count_granularity ): 1} @@ -1390,6 +1426,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, mtype='global', dtype=np.float32, lid_strides={0: 1}, + gid_strides={0: 256}, direction='load', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1398,6 +1435,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, mtype='global', dtype=np.float32, lid_strides={0: 1}, + gid_strides={0: 256}, direction='store', variable='a', count_granularity=CountGranularity.WORKITEM) @@ -1406,6 +1444,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, mtype='local', dtype=np.float32, lid_strides={0: 1}, + gid_strides={0: 256}, direction='load', variable='x', count_granularity=CountGranularity.WORKITEM) @@ -1414,6 +1453,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, mtype='local', dtype=np.float32, lid_strides={0: 1}, + gid_strides={0: 256}, direction='store', variable='x', count_granularity=CountGranularity.WORKITEM) @@ -1562,6 +1602,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, lid_strides=mem_access.lid_strides, + gid_strides=mem_access.gid_strides, direction=mem_access.direction, variable=mem_access.variable, count_granularity=mem_access.count_granularity), diff --git a/test/test_statistics.py b/test/test_statistics.py index e42c43f60..0f57c8f20 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -269,20 +269,24 @@ def test_mem_access_counter_basic(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + 
direction='load', variable='a', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='b', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='g', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='h', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group @@ -290,12 +294,14 @@ def test_mem_access_counter_basic(): assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='c', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='e', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group @@ -328,20 +334,23 @@ def test_mem_access_counter_reduction(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32l = mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='a', + 
count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='b', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='c', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group @@ -430,20 +439,24 @@ def test_mem_access_counter_specialops(): subgroups_per_group = div_ceil(group_size, subgroup_size) f32 = mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='a', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='b', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='g', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - lid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='h', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: 
(count-per-sub-group)*n_workgroups*subgroups_per_group @@ -451,12 +464,14 @@ def test_mem_access_counter_specialops(): assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='c', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='e', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group @@ -502,32 +517,38 @@ def test_mem_access_counter_bitwise(): subgroups_per_group = div_ceil(group_size, subgroup_size) i32 = mem_map[lp.MemAccess('global', np.int32, - lid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='a', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='b', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='g', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - lid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='load', variable='h', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group assert i32 == 
(4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, - lid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='c', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - lid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + lid_strides={}, gid_strides={}, + direction='store', variable='e', + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group @@ -567,24 +588,31 @@ def test_mem_access_counter_mixed(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='load', variable='g', + lid_strides={}, gid_strides={}, + direction='load', variable='g', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='load', variable='h', + lid_strides={}, gid_strides={}, + direction='load', variable='h', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - lid_strides={}, direction='load', variable='x', + lid_strides={}, gid_strides={}, + direction='load', variable='x', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, direction='load', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*group_size_0}, + direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')}, direction='load', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*group_size_0}, + direction='load', 
variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -611,11 +639,14 @@ def test_mem_access_counter_mixed(): assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - lid_strides={}, direction='store', variable='e', + lid_strides={}, gid_strides={}, + direction='store', variable='e', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: Variable('m')}, direction='store', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*group_size_0}, + direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -645,7 +676,8 @@ def test_mem_access_counter_nonconsec(): name="nonconsec", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - knl = lp.split_iname(knl, "i", 16) + lsize0 = 16 + knl = lp.split_iname(knl, "i", lsize0) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, @@ -655,37 +687,52 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, direction='load', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*lsize0}, + direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, direction='load', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*lsize0}, + direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - direction='load', variable='a', - count_granularity=CG.WORKITEM) + f32nonconsec = mem_map[lp.MemAccess( + 
'global', np.dtype(np.float32), + lid_strides={0: Variable('m')*Variable('ell')}, + gid_strides={0: Variable('m')*Variable('ell')*lsize0}, + direction='load', variable='a', + count_granularity=CG.WORKITEM + ) ].eval_with_dict(params) - f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: Variable('m')*Variable('ell')}, - direction='load', variable='b', - count_granularity=CG.WORKITEM) + f32nonconsec += mem_map[lp.MemAccess( + 'global', np.dtype(np.float32), + lid_strides={0: Variable('m')*Variable('ell')}, + gid_strides={0: Variable('m')*Variable('ell')*lsize0}, + direction='load', variable='b', + count_granularity=CG.WORKITEM + ) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: Variable('m')}, direction='store', + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*lsize0}, + direction='store', variable='e', count_granularity=CG.WORKITEM) ].eval_with_dict(params) - f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: Variable('m')*Variable('ell')}, - direction='store', variable='c', - count_granularity=CG.WORKITEM) + f32nonconsec = mem_map[lp.MemAccess( + 'global', np.float32, + lid_strides={0: Variable('m')*Variable('ell')}, + gid_strides={0: Variable('m')*Variable('ell')*lsize0}, + direction='store', variable='c', + count_granularity=CG.WORKITEM + ) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -694,13 +741,17 @@ def test_mem_access_counter_nonconsec(): subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', - np.float64, lid_strides={0: Variable('m')}, + np.float64, + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*lsize0}, direction='load', variable='g', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', - np.float64, lid_strides={0: Variable('m')}, + 
np.float64, + lid_strides={0: Variable('m')}, + gid_strides={0: Variable('m')*lsize0}, direction='load', variable='h', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -708,6 +759,7 @@ def test_mem_access_counter_nonconsec(): 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, + gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='a', count_granularity=CG.WORKITEM) @@ -716,6 +768,7 @@ def test_mem_access_counter_nonconsec(): 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, + gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='b', count_granularity=CG.WORKITEM) @@ -746,33 +799,48 @@ def test_mem_access_counter_consec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f64consec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: 1}, direction='load', variable='g', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) - f64consec += mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: 1}, direction='load', variable='h', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) - f32consec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1}, direction='load', variable='a', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) - f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - lid_strides={0: 1}, direction='load', variable='b', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) + f64consec = mem_map[lp.MemAccess( + 'global', np.float64, + lid_strides={0: 1}, gid_strides={0: Variable('m')}, + direction='load', variable='g', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) + f64consec += mem_map[lp.MemAccess( + 'global', np.float64, + lid_strides={0: 1}, gid_strides={0: Variable('m')}, + direction='load', variable='h', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) + f32consec = mem_map[lp.MemAccess( + 'global', np.float32, + lid_strides={0: 1}, 
+ gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, + direction='load', variable='a', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) + f32consec += mem_map[lp.MemAccess( + 'global', np.dtype(np.float32), + lid_strides={0: 1}, + gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, + direction='load', variable='b', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell - f64consec = mem_map[lp.MemAccess('global', np.float64, - lid_strides={0: 1}, direction='store', variable='e', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) - f32consec = mem_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1}, direction='store', variable='c', - count_granularity=CG.WORKITEM) - ].eval_with_dict(params) + f64consec = mem_map[lp.MemAccess( + 'global', np.float64, + lid_strides={0: 1}, gid_strides={0: Variable('m')}, + direction='store', variable='e', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) + f32consec = mem_map[lp.MemAccess( + 'global', np.float32, + lid_strides={0: 1}, + gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, + direction='store', variable='c', + count_granularity=CG.WORKITEM) + ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -898,11 +966,14 @@ def test_all_counters_parallel_matmul(): f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('ell')}, + gid_strides={1: bsize}, direction='load', variable='b', count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - lid_strides={0: 1, 1: Variable('m')}, direction='load', + lid_strides={0: 1, 1: Variable('m')}, + gid_strides={0: Variable('m')*bsize}, + direction='load', variable='a', count_granularity=CG.WORKITEM) ].eval_with_dict(params) @@ -911,6 +982,7 @@ def test_all_counters_parallel_matmul(): f32coal = 
mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('ell')}, + gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', count_granularity=CG.WORKITEM) ].eval_with_dict(params) -- GitLab From 6d8f6f701f6703ee3315fbe1d467c9ab9761f4ca Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 2 Apr 2018 14:21:03 -0500 Subject: [PATCH 2/3] fixing flake8 issues --- loopy/statistics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7cb70026d..1fe55111c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -937,6 +937,8 @@ class GlobalMemAccessCounter(MemAccessCounter): from loopy.symbolic import CoefficientCollector from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable + from loopy.symbolic import simplify_using_aff + from loopy.diagnostic import ExpressionNotAffineError lid_strides = {} for ltag, iname in six.iteritems(lid_to_iname): @@ -944,8 +946,6 @@ class GlobalMemAccessCounter(MemAccessCounter): # check coefficient of this lid for each axis for idx, axis_tag in zip(index, array.dim_tags): - from loopy.symbolic import simplify_using_aff - from loopy.diagnostic import ExpressionNotAffineError try: coeffs = CoefficientCollector()( simplify_using_aff(self.knl, idx)) @@ -980,8 +980,6 @@ class GlobalMemAccessCounter(MemAccessCounter): # check coefficient of this gid for each axis for idx, axis_tag in zip(index, array.dim_tags): - from loopy.symbolic import simplify_using_aff - from loopy.diagnostic import ExpressionNotAffineError try: coeffs = CoefficientCollector()( simplify_using_aff(self.knl, idx)) -- GitLab From c6445465d65a7eedf41ecbd04ad5c92c090a6b94 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 2 Apr 2018 18:04:47 -0500 Subject: [PATCH 3/3] function get_iname_strides() eliminating repeated code --- loopy/statistics.py | 107 +++++++++++++++++--------------------------- 1 file changed, 42 
insertions(+), 65 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 1fe55111c..c4f8c9e26 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -930,9 +930,13 @@ class GlobalMemAccessCounter(MemAccessCounter): elif isinstance(tag, GroupIndexTag): gid_to_iname[tag.axis] = iname - # create lid_strides dict (strides are coefficents in flattened index) - # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C - # come from flattened index [... + C*lid2 + B*lid1 + A*lid0] + # create lid_strides and gid_strides dicts + + # strides are coefficients in flattened index, i.e., we want + # lid_strides = {0:l0, 1:l1, 2:l2, ...} and + # gid_strides = {0:g0, 1:g1, 2:g2, ...}, + # where l0, l1, l2, g0, g1, and g2 come from flattened index + # [... + g2*gid2 + g1*gid1 + g0*gid0 + ... + l2*lid2 + l1*lid1 + l0*lid0] from loopy.symbolic import CoefficientCollector from loopy.kernel.array import FixedStrideArrayDimTag @@ -940,69 +944,42 @@ class GlobalMemAccessCounter(MemAccessCounter): from loopy.symbolic import simplify_using_aff from loopy.diagnostic import ExpressionNotAffineError - lid_strides = {} - for ltag, iname in six.iteritems(lid_to_iname): - ltag_stride = 0 - # check coefficient of this lid for each axis - for idx, axis_tag in zip(index, array.dim_tags): - - try: - coeffs = CoefficientCollector()( - simplify_using_aff(self.knl, idx)) - except ExpressionNotAffineError: - ltag_stride = None - break - - # check if idx contains this lid - try: - coeff_lid = coeffs[Variable(lid_to_iname[ltag])] - except KeyError: - # idx does not contain this lid - continue - - # found coefficient of this lid - # now determine stride - if isinstance(axis_tag, FixedStrideArrayDimTag): - stride = axis_tag.stride - else: - continue - - ltag_stride += stride*coeff_lid - lid_strides[ltag] = ltag_stride - - # create gid_strides dict (strides are coefficents in flattened index) - # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C - # come from flattened index [...
+ C*gid2 + B*gid1 + A*gid0] - - gid_strides = {} - for gtag, iname in six.iteritems(gid_to_iname): - gtag_stride = 0 - # check coefficient of this gid for each axis - for idx, axis_tag in zip(index, array.dim_tags): - - try: - coeffs = CoefficientCollector()( - simplify_using_aff(self.knl, idx)) - except ExpressionNotAffineError: - gtag_stride = None - break - - # check if idx contains this gid - try: - coeff_gid = coeffs[Variable(gid_to_iname[gtag])] - except KeyError: - # idx does not contain this gid - continue - - # found coefficient of this gid - # now determine stride - if isinstance(axis_tag, FixedStrideArrayDimTag): - stride = axis_tag.stride - else: - continue + def get_iname_strides(tag_to_iname_dict): + tag_to_stride_dict = {} + for tag, iname in six.iteritems(tag_to_iname_dict): + total_iname_stride = 0 + # find total stride of this iname for each axis + for idx, axis_tag in zip(index, array.dim_tags): + # collect index coefficients + try: + coeffs = CoefficientCollector()( + simplify_using_aff(self.knl, idx)) + except ExpressionNotAffineError: + total_iname_stride = None + break + + # check if idx contains this iname + try: + coeff = coeffs[Variable(tag_to_iname_dict[tag])] + except KeyError: + # idx does not contain this iname + continue + + # found coefficient of this iname + # now determine stride + if isinstance(axis_tag, FixedStrideArrayDimTag): + axis_tag_stride = axis_tag.stride + else: + continue + + total_iname_stride += axis_tag_stride*coeff + + tag_to_stride_dict[tag] = total_iname_stride + + return tag_to_stride_dict - gtag_stride += stride*coeff_gid - gid_strides[gtag] = gtag_stride + lid_strides = get_iname_strides(lid_to_iname) + gid_strides = get_iname_strides(gid_to_iname) count_granularity = CountGranularity.WORKITEM if ( 0 in lid_strides and lid_strides[0] != 0 -- GitLab