From fe4ed770ab4d037c53888d96290bb163ac56e33c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 28 Nov 2018 18:58:17 -0600 Subject: [PATCH 1/2] added variable_tag to MemAccess; GlobalMemAccessCounter tracking variable tags for tagged global variables --- loopy/statistics.py | 21 ++++++++++++--- test/test_statistics.py | 59 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b467e3334..9ce2bb081 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -581,6 +581,11 @@ class MemAccess(Record): A :class:`str` that specifies the variable name of the data accessed. + .. attribute:: variable_tag + + A :class:`str` that specifies the variable tag of a + :class:`pymbolic.primitives.TaggedVariable`. + .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted @@ -597,7 +602,8 @@ class MemAccess(Record): """ def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, - direction=None, variable=None, count_granularity=None): + direction=None, variable=None, variable_tag=None, + count_granularity=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -607,12 +613,14 @@ class MemAccess(Record): if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, - variable=variable, count_granularity=count_granularity) + variable=variable, variable_tag=variable_tag, + count_granularity=count_granularity) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, + variable_tag=variable_tag, count_granularity=count_granularity) def __hash__(self): @@ -622,7 +630,7 @@ class MemAccess(Record): def __repr__(self): # Record.__repr__ 
overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -631,6 +639,7 @@ class MemAccess(Record): sorted(six.iteritems(self.gid_strides))), self.direction, self.variable, + self.variable_tag, self.count_granularity) # }}} @@ -985,6 +994,10 @@ class GlobalMemAccessCounter(MemAccessCounter): def map_subscript(self, expr): name = expr.aggregate.name + try: + var_tag = expr.aggregate.tag + except AttributeError: + var_tag = None if name in self.knl.arg_dict: array = self.knl.arg_dict[name] @@ -1013,6 +1026,7 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, + variable_tag=var_tag, count_granularity=count_granularity ): 1} ) + self.rec(expr.index_tuple) @@ -1634,6 +1648,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, gid_strides=mem_access.gid_strides, direction=mem_access.direction, variable=mem_access.variable, + variable_tag=mem_access.variable_tag, count_granularity=mem_access.count_granularity), ct) for mem_access, ct in six.iteritems(access_map.count_map)), diff --git a/test/test_statistics.py b/test/test_statistics.py index 41b44b5a7..b29edf1ed 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1060,6 +1060,65 @@ def test_all_counters_parallel_matmul(): assert local_mem_s == m*2/bsize*n_subgroups +def test_mem_access_tagged_variables(): + bsize = 16 + knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Wed, 28 Nov 2018 19:08:11 -0600 Subject: [PATCH 2/2] updated tutorial w/variable tag printing --- doc/tutorial.rst | 74 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 73f5dea75..397f34a98 100644 --- a/doc/tutorial.rst +++ 
b/doc/tutorial.rst @@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1684,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1708,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None) : ... - MemAccess(None, None, None, None, store, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... 
].eval_with_dict(param_dict) @@ -1726,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: The lines of output above might look like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1751,12 +1751,12 @@ this time. ... 
outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1766,13 +1766,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1792,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... 
With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1806,13 +1806,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab