From 47f60c3ec535c5785d378d8839e62a0828716a6d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 10:53:22 -0500 Subject: [PATCH 1/5] Stats part of the changes --- doc/tutorial.rst | 82 +++++++-------- loopy/statistics.py | 60 ++++++++--- test/test_statistics.py | 217 +++++++++++++++++++++++++--------------- 3 files changed, 224 insertions(+), 135 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 2a9756b20..c98fe8d0c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1581,12 +1581,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1643,15 +1643,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1686,13 +1686,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1710,13 +1710,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1753,12 +1753,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1768,13 +1768,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1794,12 +1794,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1808,13 +1808,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1848,14 +1848,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1908,8 +1908,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/loopy/statistics.py b/loopy/statistics.py index 2c3d4f36f..92ea5f696 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -83,7 +83,7 @@ __doc__ = """ def get_kernel_parameter_space(kernel): return isl.Space.create_from_names(kernel.isl_context, - set=[], params=kernel.outer_params()).params() + set=[], params=sorted(list(kernel.outer_params()))).params() def get_kernel_zero_pwqpolynomial(kernel): @@ -160,7 +160,7 @@ class GuardedPwQPolynomial(object): return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -218,7 +218,7 @@ class ToCountMap(object): def __mul__(self, other): return self.copy(dict( - (index, value*other) + (index, other*value) for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ @@ -232,7 +232,8 @@ class ToCountMap(object): def __str__(self): return "\n".join( "%s: %s" % (k, v) - for k, v in six.iteritems(self.count_map)) + for k, v in sorted(six.iteritems(self.count_map), + key=lambda k: str(k))) def __len__(self): return len(self.count_map) @@ -501,11 +502,13 @@ class ToCountPolynomialMap(ToCountMap): #TODO test and document def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result + raise NotImplementedError() + # FIXME: Not sure what you are trying to achieve here. + # result = self.copy() + # for key, val in self.items(): + # result[key] = val.eval_with_dict(params) + # result.val_type = int + # return result def eval_and_sum(self, params=None): """Add all counts and evaluate with provided parameter dict *params* @@ -575,6 +578,18 @@ def subst_into_to_count_map(space, tcm, subst_dict): # }}} +def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." + " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + + result = "" + for key in sorted(m.keys(), key=lambda k: str(k)): + result += ("%s : %s\n" % (key, m[key])) + return result + + # {{{ CountGranularity class CountGranularity(object): @@ -810,8 +825,10 @@ class CounterBase(CombineMapper): from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) - self.zero = get_kernel_zero_pwqpolynomial(self.knl) - self.one = self.zero + 1 + zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space) + one_qpoly = zero_qpoly + 1 + self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) + self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly) @property @memoize_method @@ -840,7 +857,6 @@ class CounterBase(CombineMapper): if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) - assert len(clbl.subkernel.args) == len(expr.parameters) arg_dict = dict( (arg.name, value) for arg, value in zip( @@ -911,7 +927,8 @@ class ExpressionOpCounter(CounterBase): self.count_within_subscripts = count_within_subscripts # FIXME: Revert to SUBGROUP - arithmetic_count_granularity = CountGranularity.WORKITEM + # KK: Trying that now... + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return sum(values) @@ -1179,7 +1196,9 @@ class MemAccessCounterBase(CounterBase): class LocalMemAccessCounter(MemAccessCounterBase): # FIXME: Revert to SUBGROUP - local_mem_count_granularity = CountGranularity.WORKITEM + # KK: Trying that now... + # local_mem_count_granularity = CountGranularity.WORKITEM + local_mem_count_granularity = CountGranularity.SUBGROUP def count_var_access(self, dtype, name, index): count_map = {} @@ -1280,7 +1299,8 @@ class GlobalMemAccessCounter(MemAccessCounterBase): self.knl, array, index_tuple) # FIXME: Revert to subgroup - global_access_count_granularity = CountGranularity.WORKITEM + # global_access_count_granularity = CountGranularity.WORKITEM + global_access_count_granularity = CountGranularity.SUBGROUP # Account for broadcasts once per subgroup count_granularity = CountGranularity.WORKITEM if ( @@ -1734,6 +1754,16 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=count_within_subscripts, subgroup_size=subgroup_size) + # FIXME: Maybe we want this, but the current structure of + # ToCountPolynomialMap doesn't allow it. + return sum(_get_op_map_for_single_kernel( + clbl.subkernel, program.callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) for clbl in + program.callables_table.values() if isinstance(clbl, + CallableKernel)) + # }}} diff --git a/test/test_statistics.py b/test/test_statistics.py index cadca9fc1..ef5450599 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -67,12 +67,15 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -99,8 +102,9 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups @@ -134,11 +138,13 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -172,17 +178,21 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups @@ -270,7 +280,7 @@ def test_op_counter_triangular_domain(): knl, subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.SUBGROUP)] + )[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -316,22 +326,26 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -341,12 +355,14 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -380,12 +396,14 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -394,7 +412,8 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -483,22 +502,26 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -508,12 +531,14 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -560,22 +585,26 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -584,12 +613,14 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -631,31 +662,36 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='x', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -682,14 +718,16 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -732,30 +770,32 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*lsize0}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='a', - count_granularity=CG.WORKITEM - ) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='b', - count_granularity=CG.WORKITEM - ) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -765,15 +805,16 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*lsize0}, direction='store', variable='e', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='store', variable='c', - count_granularity=CG.WORKITEM - ) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -786,7 +827,8 @@ def test_mem_access_counter_nonconsec(): lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', @@ -794,7 +836,8 @@ def test_mem_access_counter_nonconsec(): lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -803,7 +846,8 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -812,7 +856,8 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -844,27 +889,31 @@ def test_mem_access_counter_consec(): 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess( 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell @@ -873,14 +922,16 @@ def test_mem_access_counter_consec(): 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='store', variable='e', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -1006,16 +1057,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP) + lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.SUBGROUP) + lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP) + lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) + lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1028,13 +1079,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={1: bsize}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('m')}, gid_strides={0: Variable('m')*bsize}, direction='load', - variable='a', count_granularity=CG.WORKITEM) + variable='a', count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1044,7 +1097,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell @@ -1063,14 +1117,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable='a_fetch', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={0: 1}, gid_strides={}, variable='b_fetch', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1158,7 +1214,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction='load', variable='b', variable_tag='mmbload', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={1: Variable('m')}, @@ -1166,7 +1223,8 @@ def test_mem_access_tagged_variables(): direction='load', variable='a', variable_tag='mmaload', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1179,7 +1237,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', variable_tag='mmresult', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell -- GitLab From 20d9310fc2faa35c2f6fd483a21f98b9b9b94a01 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 11:05:47 -0500 Subject: [PATCH 2/5] removes unnecessary comments --- loopy/statistics.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 92ea5f696..f9a4b62bc 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -218,7 +218,7 @@ class ToCountMap(object): def __mul__(self, other): return self.copy(dict( - (index, other*value) + (index, value*other) for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ @@ -503,7 +503,7 @@ class ToCountPolynomialMap(ToCountMap): #TODO test and document def eval(self, params): raise NotImplementedError() - # FIXME: Not sure what you are trying to achieve here. + # FIXME: Not sure what's the goal here, I get a PyLint error. # result = self.copy() # for key, val in self.items(): # result[key] = val.eval_with_dict(params) @@ -926,7 +926,7 @@ class ExpressionOpCounter(CounterBase): knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - # FIXME: Revert to SUBGROUP + # FIXME(AK): Revert to SUBGROUP # KK: Trying that now... arithmetic_count_granularity = CountGranularity.SUBGROUP @@ -1195,7 +1195,7 @@ class MemAccessCounterBase(CounterBase): # {{{ LocalMemAccessCounter class LocalMemAccessCounter(MemAccessCounterBase): - # FIXME: Revert to SUBGROUP + # FIXME(AK): Revert to SUBGROUP # KK: Trying that now... # local_mem_count_granularity = CountGranularity.WORKITEM local_mem_count_granularity = CountGranularity.SUBGROUP @@ -1298,7 +1298,7 @@ class GlobalMemAccessCounter(MemAccessCounterBase): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - # FIXME: Revert to subgroup + # FIXME(AK): Revert to subgroup # global_access_count_granularity = CountGranularity.WORKITEM global_access_count_granularity = CountGranularity.SUBGROUP @@ -1754,16 +1754,6 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=count_within_subscripts, subgroup_size=subgroup_size) - # FIXME: Maybe we want this, but the current structure of - # ToCountPolynomialMap doesn't allow it. - return sum(_get_op_map_for_single_kernel( - clbl.subkernel, program.callables_table, - count_redundant_work=count_redundant_work, - count_within_subscripts=count_within_subscripts, - subgroup_size=subgroup_size) for clbl in - program.callables_table.values() if isinstance(clbl, - CallableKernel)) - # }}} -- GitLab From 1f90b5590cdf4e3eca32cbbfb1926ff7fc65dba9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 20:01:42 -0500 Subject: [PATCH 3/5] removes unhelpful comments --- loopy/statistics.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f9a4b62bc..39f43ef5d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -926,8 +926,6 @@ class ExpressionOpCounter(CounterBase): knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - # FIXME(AK): Revert to SUBGROUP - # KK: Trying that now... arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): @@ -1195,9 +1193,6 @@ class MemAccessCounterBase(CounterBase): # {{{ LocalMemAccessCounter class LocalMemAccessCounter(MemAccessCounterBase): - # FIXME(AK): Revert to SUBGROUP - # KK: Trying that now... - # local_mem_count_granularity = CountGranularity.WORKITEM local_mem_count_granularity = CountGranularity.SUBGROUP def count_var_access(self, dtype, name, index): @@ -1298,8 +1293,6 @@ class GlobalMemAccessCounter(MemAccessCounterBase): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - # FIXME(AK): Revert to subgroup - # global_access_count_granularity = CountGranularity.WORKITEM global_access_count_granularity = CountGranularity.SUBGROUP # Account for broadcasts once per subgroup -- GitLab From e86a16d4cfb26c79f01fe2c7a4ec244f04c3cfc0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Sep 2019 00:10:05 -0500 Subject: [PATCH 4/5] removes `eval`, since no one uses it and its not documented --- loopy/statistics.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 39f43ef5d..06ca06283 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -500,16 +500,6 @@ class ToCountPolynomialMap(ToCountMap): return type(self)(space, count_map) - #TODO test and document - def eval(self, params): - raise NotImplementedError() - # FIXME: Not sure what's the goal here, I get a PyLint error. - # result = self.copy() - # for key, val in self.items(): - # result[key] = val.eval_with_dict(params) - # result.val_type = int - # return result - def eval_and_sum(self, params=None): """Add all counts and evaluate with provided parameter dict *params* -- GitLab From b7e98ffa321b9f6063ecb8d518c6b11d6f675056 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 15:14:25 -0500 Subject: [PATCH 5/5] reverts back pwqpolynomial initialization --- loopy/statistics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 06ca06283..86f39e55b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -814,11 +814,8 @@ class CounterBase(CombineMapper): from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) - - zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space) - one_qpoly = zero_qpoly + 1 - self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) - self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 @property @memoize_method -- GitLab