From 47f60c3ec535c5785d378d8839e62a0828716a6d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 16 Sep 2019 10:53:22 -0500
Subject: [PATCH 1/5] Stats: kernel-name count keys, SUBGROUP granularity, doc/test updates

---
 doc/tutorial.rst        |  82 +++++++--------
 loopy/statistics.py     |  60 ++++++++---
 test/test_statistics.py | 217 +++++++++++++++++++++++++---------------
 3 files changed, 224 insertions(+), 135 deletions(-)
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 2a9756b20..c98fe8d0c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1581,12 +1581,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
     >>> from loopy.statistics import CountGranularity as CG
-    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
-    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict)
-    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
-    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
-    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
-    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict)
     >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
@@ -1643,15 +1643,15 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ...
     <BLANKLINE>
 
 Each line of output will look roughly like::
 
 
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1686,13 +1686,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1710,13 +1710,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ...
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, None, load, None, None, None) : ...
-    MemAccess(None, None, None, None, store, None, None, None) : ...
+    MemAccess(None, None, None, None, load, None, None, None, None) : ...
+    MemAccess(None, None, None, None, store, None, None, None, None) : ...
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1753,12 +1753,12 @@ this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access consecutive array
@@ -1768,13 +1768,13 @@ array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1794,12 +1794,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access *nonconsecutive*
@@ -1808,13 +1808,13 @@ changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1848,14 +1848,14 @@ kernel from the previous example:
 
     >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(lp.stringify_stats_mapping(sync_map))
-    kernel_launch : { 1 }
+    Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 }
     <BLANKLINE>
 
 We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
+    >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict)
     >>> print("Kernel launch count: %s" % launch_count)
     Kernel launch count: 1
 
@@ -1908,8 +1908,8 @@ count the barriers using :func:`loopy.get_synchronization_map`:
 
     >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(lp.stringify_stats_mapping(sync_map))
-    barrier_local : { 1000 }
-    kernel_launch : { 1 }
+    Sync(barrier_local, loopy_kernel) : { 1000 }
+    Sync(kernel_launch, loopy_kernel) : { 1 }
     <BLANKLINE>
 
 Based on the kernel code printed above, we would expect each work-item to
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 2c3d4f36f..92ea5f696 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -83,7 +83,7 @@ __doc__ = """
 
 def get_kernel_parameter_space(kernel):
     return isl.Space.create_from_names(kernel.isl_context,
-            set=[], params=kernel.outer_params()).params()
+            set=[], params=sorted(list(kernel.outer_params()))).params()
 
 
 def get_kernel_zero_pwqpolynomial(kernel):
@@ -160,7 +160,7 @@ class GuardedPwQPolynomial(object):
         return str(self.pwqpolynomial)
 
     def __repr__(self):
-        return repr(self.pwqpolynomial)
+        return "Guarded" + repr(self.pwqpolynomial)
 
 # }}}
 
@@ -218,7 +218,7 @@ class ToCountMap(object):
 
     def __mul__(self, other):
         return self.copy(dict(
-            (index, value*other)
+            (index, other*value)
             for index, value in six.iteritems(self.count_map)))
 
     __rmul__ = __mul__
@@ -232,7 +232,8 @@ class ToCountMap(object):
     def __str__(self):
         return "\n".join(
                 "%s: %s" % (k, v)
-                for k, v in six.iteritems(self.count_map))
+                for k, v in sorted(six.iteritems(self.count_map),
+                    key=lambda k: str(k)))
 
     def __len__(self):
         return len(self.count_map)
@@ -501,11 +502,13 @@ class ToCountPolynomialMap(ToCountMap):
 
     #TODO test and document
     def eval(self, params):
-        result = self.copy()
-        for key, val in self.items():
-            result[key] = val.eval_with_dict(params)
-        result.val_type = int
-        return result
+        raise NotImplementedError()
+        # FIXME: Intent unclear; disabled until tested and documented.
+        # result = self.copy()
+        # for key, val in self.items():
+        #     result[key] = val.eval_with_dict(params)
+        # result.val_type = int
+        # return result
 
     def eval_and_sum(self, params=None):
         """Add all counts and evaluate with provided parameter dict *params*
@@ -575,6 +578,18 @@ def subst_into_to_count_map(space, tcm, subst_dict):
 # }}}
 
 
+def stringify_stats_mapping(m):
+    # Deprecated: render *m* as "key : value" lines, sorted by str(key).
+    from warnings import warn
+    warn("stringify_stats_mapping is deprecated and will be removed in 2020."
+            " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2)
+
+    result = ""
+    for key in sorted(m.keys(), key=lambda k: str(k)):
+        result += ("%s : %s\n" % (key, m[key]))
+    return result
+
+
 # {{{ CountGranularity
 
 class CountGranularity(object):
@@ -810,8 +825,10 @@ class CounterBase(CombineMapper):
         from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl, callables_table)
 
-        self.zero = get_kernel_zero_pwqpolynomial(self.knl)
-        self.one = self.zero + 1
+        zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space)
+        one_qpoly = zero_qpoly + 1
+        self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly)
+        self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly)
 
     @property
     @memoize_method
@@ -840,7 +857,6 @@ class CounterBase(CombineMapper):
         if isinstance(clbl, CallableKernel):
             sub_result = self.kernel_rec(clbl.subkernel)
 
-            assert len(clbl.subkernel.args) == len(expr.parameters)
             arg_dict = dict(
                     (arg.name, value)
                     for arg, value in zip(
@@ -911,7 +927,8 @@ class ExpressionOpCounter(CounterBase):
         self.count_within_subscripts = count_within_subscripts
 
     # FIXME: Revert to SUBGROUP
-    arithmetic_count_granularity = CountGranularity.WORKITEM
+    # KK: Trying SUBGROUP granularity now; drop the FIXME above if this holds.
+    arithmetic_count_granularity = CountGranularity.SUBGROUP
 
     def combine(self, values):
         return sum(values)
@@ -1179,7 +1196,9 @@ class MemAccessCounterBase(CounterBase):
 
 class LocalMemAccessCounter(MemAccessCounterBase):
     # FIXME: Revert to SUBGROUP
-    local_mem_count_granularity = CountGranularity.WORKITEM
+    # KK: Trying SUBGROUP now; the previous WORKITEM setting is kept below.
+    # local_mem_count_granularity = CountGranularity.WORKITEM
+    local_mem_count_granularity = CountGranularity.SUBGROUP
 
     def count_var_access(self, dtype, name, index):
         count_map = {}
@@ -1280,7 +1299,8 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
                                         self.knl, array, index_tuple)
 
         # FIXME: Revert to subgroup
-        global_access_count_granularity = CountGranularity.WORKITEM
+        # global_access_count_granularity = CountGranularity.WORKITEM
+        global_access_count_granularity = CountGranularity.SUBGROUP
 
         # Account for broadcasts once per subgroup
         count_granularity = CountGranularity.WORKITEM if (
@@ -1734,6 +1754,16 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
             count_within_subscripts=count_within_subscripts,
             subgroup_size=subgroup_size)
 
+    # FIXME: Maybe we want to sum the op maps of all callable kernels, but the
+    # current structure of ToCountPolynomialMap doesn't allow it.
+    return sum(_get_op_map_for_single_kernel(
+            clbl.subkernel, program.callables_table,
+            count_redundant_work=count_redundant_work,
+            count_within_subscripts=count_within_subscripts,
+            subgroup_size=subgroup_size) for clbl in
+            program.callables_table.values() if isinstance(clbl,
+                CallableKernel))
+
 # }}}
 
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index cadca9fc1..ef5450599 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -67,12 +67,15 @@ def test_op_counter_basic():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
-    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32add == f32mul == f32div == n*m*ell*n_subgroups
@@ -99,8 +102,9 @@ def test_op_counter_reduction():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32add == f32mul == n*m*ell*n_subgroups
@@ -134,11 +138,13 @@ def test_op_counter_logic():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32mul == n*m*n_subgroups
@@ -172,17 +178,21 @@ def test_op_counter_specialops():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
-    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
-    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP, knl.name)].eval_with_dict(
+            params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
-    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
-    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP, knl.name)
                     ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
     assert f32div == 2*n*m*ell*n_subgroups
@@ -270,7 +280,7 @@ def test_op_counter_triangular_domain():
                     knl,
                     subgroup_size=SGS,
                     count_redundant_work=True
-                    )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
+                    )[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -316,22 +326,26 @@ def test_mem_access_counter_basic():
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='a',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='b',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                     ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='g',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='h',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -341,12 +355,14 @@ def test_mem_access_counter_basic():
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
                         direction='store', variable='c',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
                         direction='store', variable='e',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -380,12 +396,14 @@ def test_mem_access_counter_reduction():
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='a',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='b',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -394,7 +412,8 @@ def test_mem_access_counter_reduction():
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
                         direction='store', variable='c',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -483,22 +502,26 @@ def test_mem_access_counter_specialops():
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='a',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='b',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
                         direction='load', variable='g',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                         lid_strides={}, gid_strides={},
                         direction='load', variable='h',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -508,12 +531,14 @@ def test_mem_access_counter_specialops():
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='store', variable='c',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
                         lid_strides={}, gid_strides={},
                         direction='store', variable='e',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -560,22 +585,26 @@ def test_mem_access_counter_bitwise():
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='a',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='b',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='g',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                         lid_strides={}, gid_strides={},
                         direction='load', variable='h',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -584,12 +613,14 @@ def test_mem_access_counter_bitwise():
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
                         direction='store', variable='c',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
                         direction='store', variable='e',
-                        count_granularity=CG.SUBGROUP)
+                        count_granularity=CG.SUBGROUP,
+                        kernel_name=knl.name)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -631,31 +662,36 @@ def test_mem_access_counter_mixed():
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='g',
-                                count_granularity=CG.SUBGROUP)
+                                count_granularity=CG.SUBGROUP,
+                                kernel_name=knl.name)
                          ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='h',
-                                count_granularity=CG.SUBGROUP)
+                                count_granularity=CG.SUBGROUP,
+                                kernel_name=knl.name)
                           ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='x',
-                                count_granularity=CG.SUBGROUP)
+                                count_granularity=CG.SUBGROUP,
+                                kernel_name=knl.name)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 lid_strides={0: Variable('m')},
                                 gid_strides={0: Variable('m')*group_size_0},
                                 direction='load',
                                 variable='a',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                 lid_strides={0: Variable('m')},
                                 gid_strides={0: Variable('m')*group_size_0},
                                 direction='load',
                                 variable='b',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                             ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -682,14 +718,16 @@ def test_mem_access_counter_mixed():
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='store', variable='e',
-                                count_granularity=CG.SUBGROUP)
+                                count_granularity=CG.SUBGROUP,
+                                kernel_name=knl.name)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                 lid_strides={0: Variable('m')},
                                 gid_strides={0: Variable('m')*group_size_0},
                                 direction='store',
                                 variable='c',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                            ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_subgroups
@@ -732,30 +770,32 @@ def test_mem_access_counter_nonconsec():
                                 gid_strides={0: Variable('m')*lsize0},
                                 direction='load',
                                 variable='g',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                            ].eval_with_dict(params)
     f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={0: Variable('m')},
                                 gid_strides={0: Variable('m')*lsize0},
                                 direction='load',
                                 variable='h',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                             ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess(
                             'global', np.dtype(np.float32),
                             lid_strides={0: Variable('m')*Variable('ell')},
                             gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                             direction='load', variable='a',
-                            count_granularity=CG.WORKITEM
-                            )
+                            count_granularity=CG.WORKITEM,
+                            kernel_name=knl.name)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess(
                             'global', np.dtype(np.float32),
                             lid_strides={0: Variable('m')*Variable('ell')},
                             gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                             direction='load', variable='b',
-                            count_granularity=CG.WORKITEM
-                            )
+                            count_granularity=CG.WORKITEM,
+                            kernel_name=knl.name)
                             ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
@@ -765,15 +805,16 @@ def test_mem_access_counter_nonconsec():
                                 gid_strides={0: Variable('m')*lsize0},
                                 direction='store',
                                 variable='e',
-                                count_granularity=CG.WORKITEM)
+                                count_granularity=CG.WORKITEM,
+                                kernel_name=knl.name)
                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess(
                             'global', np.float32,
                             lid_strides={0: Variable('m')*Variable('ell')},
                             gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                             direction='store', variable='c',
-                            count_granularity=CG.WORKITEM
-                            )
+                            count_granularity=CG.WORKITEM,
+                            kernel_name=knl.name)
                            ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*ell
@@ -786,7 +827,8 @@ def test_mem_access_counter_nonconsec():
                     lid_strides={0: Variable('m')},
                     gid_strides={0: Variable('m')*lsize0},
                     direction='load', variable='g',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f64nonconsec += mem_map64[lp.MemAccess(
                     'global',
@@ -794,7 +836,8 @@ def test_mem_access_counter_nonconsec():
                     lid_strides={0: Variable('m')},
                     gid_strides={0: Variable('m')*lsize0},
                     direction='load', variable='h',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f32nonconsec = mem_map64[lp.MemAccess(
                     'global',
@@ -803,7 +846,8 @@ def test_mem_access_counter_nonconsec():
                     gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                     direction='load',
                     variable='a',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f32nonconsec += mem_map64[lp.MemAccess(
                     'global',
@@ -812,7 +856,8 @@ def test_mem_access_counter_nonconsec():
                     gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                     direction='load',
                     variable='b',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
@@ -844,27 +889,31 @@ def test_mem_access_counter_consec():
                     'global', np.float64,
                     lid_strides={0: 1}, gid_strides={0: Variable('m')},
                     direction='load', variable='g',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f64consec += mem_map[lp.MemAccess(
                     'global', np.float64,
                     lid_strides={0: 1}, gid_strides={0: Variable('m')},
                     direction='load', variable='h',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess(
                     'global', np.float32,
                     lid_strides={0: 1},
                     gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
                     direction='load', variable='a',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f32consec += mem_map[lp.MemAccess(
                     'global', np.dtype(np.float32),
                     lid_strides={0: 1},
                     gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
                     direction='load', variable='b',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     assert f64consec == 2*n*m*ell
     assert f32consec == 3*n*m*ell
@@ -873,14 +922,16 @@ def test_mem_access_counter_consec():
                     'global', np.float64,
                     lid_strides={0: 1}, gid_strides={0: Variable('m')},
                     direction='store', variable='e',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess(
                     'global', np.float32,
                     lid_strides={0: 1},
                     gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
                     direction='store', variable='c',
-                    count_granularity=CG.WORKITEM)
+                    count_granularity=CG.WORKITEM,
+                    kernel_name=knl.name)
                     ].eval_with_dict(params)
     assert f64consec == n*m*ell
     assert f32consec == n*m*ell
@@ -1006,16 +1057,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     f32mul = op_map[
-                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
+                        lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)
                         ].eval_with_dict(params)
     f32add = op_map[
-                        lp.Op(np.float32, 'add', CG.SUBGROUP)
+                        lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        lp.Op(np.int32, 'add', CG.SUBGROUP)
+                        lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
+                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name)
                         ].eval_with_dict(params)
 
     # (count-per-sub-group)*n_subgroups
@@ -1028,13 +1079,15 @@ def test_all_counters_parallel_matmul():
                              lid_strides={0: 1, 1: Variable('ell')},
                              gid_strides={1: bsize},
                              direction='load', variable='b',
-                             count_granularity=CG.WORKITEM)
+                             count_granularity=CG.WORKITEM,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
     f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={0: 1, 1: Variable('m')},
                              gid_strides={0: Variable('m')*bsize},
                              direction='load',
-                             variable='a', count_granularity=CG.WORKITEM)
+                             variable='a', count_granularity=CG.WORKITEM,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
 
     assert f32s1lb == n*m*ell/bsize
@@ -1044,7 +1097,8 @@ def test_all_counters_parallel_matmul():
                              lid_strides={0: 1, 1: Variable('ell')},
                              gid_strides={0: Variable('ell')*bsize, 1: bsize},
                              direction='store', variable='c',
-                             count_granularity=CG.WORKITEM)
+                             count_granularity=CG.WORKITEM,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
 
     assert f32coal == n*ell
@@ -1063,14 +1117,16 @@ def test_all_counters_parallel_matmul():
                                                lid_strides={1: 16},
                                                gid_strides={},
                                                variable='a_fetch',
-                                               count_granularity=CG.SUBGROUP)
+                                               count_granularity=CG.SUBGROUP,
+                                               kernel_name=knl.name)
                                   ].eval_with_dict(params)
     local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                                direction='load',
                                                lid_strides={0: 1},
                                                gid_strides={},
                                                variable='b_fetch',
-                                               count_granularity=CG.SUBGROUP)
+                                               count_granularity=CG.SUBGROUP,
+                                               kernel_name=knl.name)
                                   ].eval_with_dict(params)
 
     # (count-per-sub-group)*n_subgroups
@@ -1158,7 +1214,8 @@ def test_mem_access_tagged_variables():
                              gid_strides={1: bsize},
                              direction='load', variable='b',
                              variable_tag='mmbload',
-                             count_granularity=CG.WORKITEM)
+                             count_granularity=CG.WORKITEM,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
     f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={1: Variable('m')},
@@ -1166,7 +1223,8 @@ def test_mem_access_tagged_variables():
                              direction='load',
                              variable='a',
                              variable_tag='mmaload',
-                             count_granularity=CG.SUBGROUP)
+                             count_granularity=CG.SUBGROUP,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
 
     assert f32s1lb == n*m*ell
@@ -1179,7 +1237,8 @@ def test_mem_access_tagged_variables():
                              gid_strides={0: Variable('ell')*bsize, 1: bsize},
                              direction='store', variable='c',
                              variable_tag='mmresult',
-                             count_granularity=CG.WORKITEM)
+                             count_granularity=CG.WORKITEM,
+                             kernel_name=knl.name)
                              ].eval_with_dict(params)
 
     assert f32coal == n*ell
-- 
GitLab


From 20d9310fc2faa35c2f6fd483a21f98b9b9b94a01 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 16 Sep 2019 11:05:47 -0500
Subject: [PATCH 2/5] removes unnecessary comments

---
 loopy/statistics.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 92ea5f696..f9a4b62bc 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -218,7 +218,7 @@ class ToCountMap(object):
 
     def __mul__(self, other):
         return self.copy(dict(
-            (index, other*value)
+            (index, value*other)
             for index, value in six.iteritems(self.count_map)))
 
     __rmul__ = __mul__
@@ -503,7 +503,7 @@ class ToCountPolynomialMap(ToCountMap):
     #TODO test and document
     def eval(self, params):
         raise NotImplementedError()
-        # FIXME: Not sure what you are trying to achieve here.
+        # FIXME: Not sure what's the goal here, I get a PyLint error.
         # result = self.copy()
         # for key, val in self.items():
         #     result[key] = val.eval_with_dict(params)
@@ -926,7 +926,7 @@ class ExpressionOpCounter(CounterBase):
                 knl, callables_table, kernel_rec)
         self.count_within_subscripts = count_within_subscripts
 
-    # FIXME: Revert to SUBGROUP
+    # FIXME(AK): Revert to SUBGROUP
     # KK: Trying that now...
     arithmetic_count_granularity = CountGranularity.SUBGROUP
 
@@ -1195,7 +1195,7 @@ class MemAccessCounterBase(CounterBase):
 # {{{ LocalMemAccessCounter
 
 class LocalMemAccessCounter(MemAccessCounterBase):
-    # FIXME: Revert to SUBGROUP
+    # FIXME(AK): Revert to SUBGROUP
     # KK: Trying that now...
     # local_mem_count_granularity = CountGranularity.WORKITEM
     local_mem_count_granularity = CountGranularity.SUBGROUP
@@ -1298,7 +1298,7 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
         lid_strides, gid_strides = _get_lid_and_gid_strides(
                                         self.knl, array, index_tuple)
 
-        # FIXME: Revert to subgroup
+        # FIXME(AK): Revert to subgroup
         # global_access_count_granularity = CountGranularity.WORKITEM
         global_access_count_granularity = CountGranularity.SUBGROUP
 
@@ -1754,16 +1754,6 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
             count_within_subscripts=count_within_subscripts,
             subgroup_size=subgroup_size)
 
-    # FIXME: Maybe we want this, but the current structure of
-    # ToCountPolynomialMap doesn't allow it.
-    return sum(_get_op_map_for_single_kernel(
-            clbl.subkernel, program.callables_table,
-            count_redundant_work=count_redundant_work,
-            count_within_subscripts=count_within_subscripts,
-            subgroup_size=subgroup_size) for clbl in
-            program.callables_table.values() if isinstance(clbl,
-                CallableKernel))
-
 # }}}
 
 
-- 
GitLab


From 1f90b5590cdf4e3eca32cbbfb1926ff7fc65dba9 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 16 Sep 2019 20:01:42 -0500
Subject: [PATCH 3/5] removes unhelpful comments

---
 loopy/statistics.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index f9a4b62bc..39f43ef5d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -926,8 +926,6 @@ class ExpressionOpCounter(CounterBase):
                 knl, callables_table, kernel_rec)
         self.count_within_subscripts = count_within_subscripts
 
-    # FIXME(AK): Revert to SUBGROUP
-    # KK: Trying that now...
     arithmetic_count_granularity = CountGranularity.SUBGROUP
 
     def combine(self, values):
@@ -1195,9 +1193,6 @@ class MemAccessCounterBase(CounterBase):
 # {{{ LocalMemAccessCounter
 
 class LocalMemAccessCounter(MemAccessCounterBase):
-    # FIXME(AK): Revert to SUBGROUP
-    # KK: Trying that now...
-    # local_mem_count_granularity = CountGranularity.WORKITEM
     local_mem_count_granularity = CountGranularity.SUBGROUP
 
     def count_var_access(self, dtype, name, index):
@@ -1298,8 +1293,6 @@ class GlobalMemAccessCounter(MemAccessCounterBase):
         lid_strides, gid_strides = _get_lid_and_gid_strides(
                                         self.knl, array, index_tuple)
 
-        # FIXME(AK): Revert to subgroup
-        # global_access_count_granularity = CountGranularity.WORKITEM
         global_access_count_granularity = CountGranularity.SUBGROUP
 
         # Account for broadcasts once per subgroup
-- 
GitLab


From e86a16d4cfb26c79f01fe2c7a4ec244f04c3cfc0 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 17 Sep 2019 00:10:05 -0500
Subject: [PATCH 4/5] removes `eval`, since no one uses it and it's not
 documented

---
 loopy/statistics.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 39f43ef5d..06ca06283 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -500,16 +500,6 @@ class ToCountPolynomialMap(ToCountMap):
 
         return type(self)(space, count_map)
 
-    #TODO test and document
-    def eval(self, params):
-        raise NotImplementedError()
-        # FIXME: Not sure what's the goal here, I get a PyLint error.
-        # result = self.copy()
-        # for key, val in self.items():
-        #     result[key] = val.eval_with_dict(params)
-        # result.val_type = int
-        # return result
-
     def eval_and_sum(self, params=None):
         """Add all counts and evaluate with provided parameter dict *params*
 
-- 
GitLab


From b7e98ffa321b9f6063ecb8d518c6b11d6f675056 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 18 Sep 2019 15:14:25 -0500
Subject: [PATCH 5/5] reverts back pwqpolynomial initialization

---
 loopy/statistics.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 06ca06283..86f39e55b 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -814,11 +814,8 @@ class CounterBase(CombineMapper):
 
         from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl, callables_table)
-
-        zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space)
-        one_qpoly = zero_qpoly + 1
-        self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly)
-        self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly)
+        self.zero = get_kernel_zero_pwqpolynomial(self.knl)
+        self.one = self.zero + 1
 
     @property
     @memoize_method
-- 
GitLab