From 3e1faa27cd6984091c68ddb79d47eea1b8060b7c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:25:35 -0600 Subject: [PATCH 01/59] added truediv to guarded poly --- loopy/statistics.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index a2dcb6846..dd9f3854b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,6 +89,14 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ + def __truediv__(self, other): + if not isinstance(other, int): + raise ValueError("GuardedPwQPolynomial.__truediv__ only valid for " + "type int. Attempted to divide by %s" % (type(other))) + return GuardedPwQPolynomial( + self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), + self.valid_domain) + def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) -- GitLab From f89abc95bce233ba02aece3e39ecb6a82fed2ef8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:31:04 -0600 Subject: [PATCH 02/59] added count_granularity to Op --- loopy/statistics.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index dd9f3854b..7be64eb67 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -484,8 +484,9 @@ class Op(object): # FIXME: This could be done much more briefly by inheriting from Record. 
- def __init__(self, dtype=None, name=None): + def __init__(self, dtype=None, name=None, count_granularity='thread'): self.name = name + self.count_granularity = count_granularity if dtype is None: self.dtype = dtype else: @@ -497,13 +498,16 @@ class Op(object): (self.dtype is None or other.dtype is None or self.dtype == other.dtype) and (self.name is None or other.name is None or - self.name == other.name)) + self.name == other.name) and + (self.count_granularity is None or + other.count_granularity is None or + self.count_granularity == other.count_granularity)) def __hash__(self): return hash(str(self)) def __repr__(self): - return "Op(%s, %s)" % (self.dtype, self.name) + return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} -- GitLab From feee7c9628bf6929b8839597a0e51fc135128732 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:37:11 -0600 Subject: [PATCH 03/59] added count_granularity to MemAccess --- loopy/statistics.py | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7be64eb67..b023b7317 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -547,11 +547,13 @@ class MemAccess(object): # FIXME: This could be done much more briefly by inheriting from Record. 
def __init__(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None): + variable=None, count_granularity='thread'): self.mtype = mtype self.stride = stride self.direction = direction self.variable = variable + self.count_granularity = count_granularity + if dtype is None: self.dtype = dtype else: @@ -569,14 +571,16 @@ class MemAccess(object): "mtype is 'local'") def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None): + variable=None, count_granularity=None): return MemAccess( mtype=mtype if mtype is not None else self.mtype, dtype=dtype if dtype is not None else self.dtype, stride=stride if stride is not None else self.stride, direction=direction if direction is not None else self.direction, variable=variable if variable is not None else self.variable, - ) + count_granularity=count_granularity + if count_granularity is not None + else self.count_granularity) def __eq__(self, other): return isinstance(other, MemAccess) and ( @@ -589,34 +593,23 @@ class MemAccess(object): (self.direction is None or other.direction is None or self.direction == other.direction) and (self.variable is None or other.variable is None or - self.variable == other.variable)) + self.variable == other.variable) and + (self.count_granularity is None or + other.count_granularity is None or + self.count_granularity == other.count_granularity) + ) def __hash__(self): return hash(str(self)) def __repr__(self): - if self.mtype is None: - mtype = 'None' - else: - mtype = self.mtype - if self.dtype is None: - dtype = 'None' - else: - dtype = str(self.dtype) - if self.stride is None: - stride = 'None' - else: - stride = str(self.stride) - if self.direction is None: - direction = 'None' - else: - direction = self.direction - if self.variable is None: - variable = 'None' - else: - variable = self.variable - return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ - + direction + ", " + variable + ")" + return "MemAccess(%s, %s, %s, %s, %s, %s)" 
% ( + self.mtype, + self.dtype, + self.stride, + self.direction, + self.variable, + self.count_granularity) # }}} -- GitLab From f82702defa97bc2dc4ef2a6b65c9ab0644d2f998 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:48:00 -0600 Subject: [PATCH 04/59] setting count_granularity in MemAccessCounter --- loopy/statistics.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b023b7317..45907f579 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -877,7 +877,8 @@ class GlobalMemAccessCounter(MemAccessCounter): # count as uniform access return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, - variable=name): 1} + variable=name, + count_granularity='warp'): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -931,8 +932,15 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), - stride=total_stride, variable=name): 1} + count_granularity = 'thread' if total_stride is not 0 else 'warp' + + return ToCountMap({MemAccess( + mtype='global', + dtype=self.type_inf(expr), + stride=total_stride, + variable=name, + count_granularity=count_granularity + ): 1} ) + self.rec(expr.index) # }}} -- GitLab From 8b4750d5d24bce3f43071885e9c51a0238b420df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:59:40 -0600 Subject: [PATCH 05/59] get_mem_access_map() using count_granularity in counting --- loopy/statistics.py | 62 +++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 45907f579..f2989e1aa 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1326,13 +1326,37 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): cache_holder = CacheHolder() - @memoize_in(cache_holder, "insn_count") - def 
get_insn_count(knl, insn_id, uniform=False): + #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? + def get_insn_count(knl, insn_id, + disregard_local_axes=False, + count_granularity='thread'): insn = knl.id_to_insn[insn_id] - return count_insn_runs( - knl, insn, disregard_local_axes=uniform, + ct = count_insn_runs( + knl, insn, disregard_local_axes=disregard_local_axes, count_redundant_work=count_redundant_work) + if count_granularity == 'thread': + return ct + elif count_granularity == 'warp': + return ct/wsize + elif count_granularity == 'group': + from loopy.symbolic import aff_to_expr + _, local_size = knl.get_grid_size_upper_bounds() + group_threads = 1 + for size in local_size: + try: + s = aff_to_expr(size) + except AttributeError: + raise LoopyError("Cannot count insn with group granularity, " + "group size is not integer: %s" + % (local_size)) + group_threads *= s + return ct/group_threads + else: + raise ValueError("get_insn_count: count_granularity '%s' is" + "not allowed. count_granularity must be 'group', " + "'warp', or 'thread'." 
% (count_granularity)) + knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -1358,23 +1382,21 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): - is_uniform = (key.mtype == 'global' and - isinstance(key.stride, int) and - key.stride == 0) + access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, is_uniform)) + * get_insn_count(knl, insn.id, + count_granularity=key.count_granularity)) #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee_g.count_map): - is_uniform = (key.mtype == 'global' and - isinstance(key.stride, int) and - key.stride == 0) + access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, is_uniform)) + * get_insn_count(knl, insn.id, + count_granularity=key.count_granularity)) # for now, don't count writes to local mem elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1384,13 +1406,15 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): if numpy_types: # FIXME: Don't modify in-place - access_map.count_map = dict((MemAccess(mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable), - count) - for mem_access, count in six.iteritems(access_map.count_map)) + access_map.count_map = dict( + (MemAccess( + mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable + ), count) + for mem_access, count in six.iteritems(access_map.count_map)) return access_map -- GitLab From 62932b143e3f9f50d27d2d01c606225ac8c52c47 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 15:08:14 -0600 Subject: [PATCH 06/59] added wsize argument to get_mem_access_map() 
for count_granularity --- loopy/statistics.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f2989e1aa..71c16214d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1258,7 +1258,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): +def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, + wsize=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1321,6 +1322,13 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): """ from loopy.preprocess import preprocess_kernel, infer_unknown_types + if wsize is None: + wsize = 32 + warn_with_kernel(knl, "get_mem_access_map_assumes_warpsize", + "get_mem_access_map: No warp size passed, " + "assuming warp size is %d." + % (wsize)) + class CacheHolder(object): pass -- GitLab From 3c84a24103cab1e1e5d84864b5d656a22cc63e86 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 15:33:22 -0600 Subject: [PATCH 07/59] updated stats tests to use/test count_granularity --- test/test_statistics.py | 95 +++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index eeb4a5a28..8c3c16c0d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -232,7 +232,8 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + n = 512 m = 256 ell = 128 @@ -249,8 +250,8 @@ def test_mem_access_counter_basic(): f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', 
variable='h') ].eval_with_dict(params) - assert f32l == 3*n*m*ell - assert f64l == 2*n*m + assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform + assert f64l == 2*n*m/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') @@ -258,8 +259,8 @@ def test_mem_access_counter_basic(): f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert f32s == n*m*ell - assert f64s == n*m + assert f32s == n*m*ell/32 # /warpsize because these are considered uniform + assert f64s == n*m/32 # /warpsize because these are considered uniform def test_mem_access_counter_reduction(): @@ -272,7 +273,7 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -283,12 +284,12 @@ def test_mem_access_counter_reduction(): f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - assert f32l == 2*n*m*ell + assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - assert f32s == n*ell + assert f32s == n*ell/32 # /warpsize because these are considered uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -312,7 +313,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, 
count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -329,9 +330,9 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m - assert f64_g_l == n*m - assert f64_g_s == n*m + assert f32_g_l == 2*n*m/32 # /warpsize because these are considered uniform + assert f64_g_l == n*m/32 # /warpsize because these are considered uniform + assert f64_g_s == n*m/32 # /warpsize because these are considered uniform def test_mem_access_counter_specialops(): @@ -348,7 +349,7 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -365,8 +366,8 @@ def test_mem_access_counter_specialops(): f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert f32 == 2*n*m*ell - assert f64 == 2*n*m + assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert f64 == 2*n*m/32 # /warpsize because these are considered uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c') @@ -374,13 +375,13 @@ def test_mem_access_counter_specialops(): f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert f32 == n*m*ell - assert f64 == n*m + assert f32 == n*m*ell/32 # /warpsize because these are considered uniform + assert f64 == n*m/32 # /warpsize because these are considered uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == n*m*ell + n*m + assert tot == (n*m*ell + n*m)/32 # 
/warpsize for uniform def test_mem_access_counter_bitwise(): @@ -400,7 +401,7 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -417,7 +418,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert i32 == 4*n*m+2*n*m*ell + assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') @@ -425,7 +426,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert i32 == n*m+n*m*ell + assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform def test_mem_access_counter_mixed(): @@ -446,7 +447,7 @@ def test_mem_access_counter_mixed(): knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) # noqa n = 512 m = 256 ell = 128 @@ -468,8 +469,8 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/bsize - assert f32uniform == n*m*ell/bsize + assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform + assert f32uniform == n*m*ell/32 # /warpsize for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -479,7 +480,7 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) - assert f64uniform == n*m*ell/bsize + assert f64uniform == n*m*ell/32 # 
/warpsize because these are considered uniform assert f32nonconsec == n*m*ell @@ -534,6 +535,34 @@ def test_mem_access_counter_nonconsec(): assert f64nonconsec == n*m assert f32nonconsec == n*m*ell + mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=64) + f64nonconsec = mem_map64[lp.MemAccess( + 'global', + np.float64, stride=Variable('m'), + direction='load', variable='g') + ].eval_with_dict(params) + f64nonconsec += mem_map64[lp.MemAccess( + 'global', + np.float64, stride=Variable('m'), + direction='load', variable='h') + ].eval_with_dict(params) + f32nonconsec = mem_map64[lp.MemAccess( + 'global', + np.dtype(np.float32), + stride=Variable('m')*Variable('ell'), + direction='load', + variable='a') + ].eval_with_dict(params) + f32nonconsec += mem_map64[lp.MemAccess( + 'global', + np.dtype(np.float32), + stride=Variable('m')*Variable('ell'), + direction='load', + variable='b') + ].eval_with_dict(params) + assert f64nonconsec == 2*n*m + assert f32nonconsec == 3*n*m*ell + def test_mem_access_counter_consec(): @@ -750,22 +779,22 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) loads_a = mem_map.filter_by(direction=['load'], variable=['a'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell + assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'] ).eval_and_sum(params) - assert global_stores == n*m*ell + n*m + assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == 4*n*m*ell*3 + 8*n*m*2 - assert st_bytes == 4*n*m*ell + 8*n*m + assert ld_bytes == 
(4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform + assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -773,8 +802,8 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell - assert f64lall == 2*n*m + assert f32lall == 3*n*m*ell/32 # /warpsize because these are considered uniform + assert f64lall == 2*n*m/32 # /warpsize because these are considered uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -807,7 +836,7 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m + assert s1f64l == 2*n*m/32 # /warpsize because these are considered uniform def test_strided_footprint(): -- GitLab From 70d17d3bfca757a2fb247c98d74dc9c16eb581c2 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 19:53:11 -0600 Subject: [PATCH 08/59] updated tutorial doctests for addition of count_granularity, still needs explanations --- doc/tutorial.rst | 118 +++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7196dad86..8e05cf0f4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1552,12 +1552,12 @@ information provided. 
Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), add, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'thread')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1615,9 +1615,9 @@ together into keys containing only the specified fields: >>> op_map_dtype = op_map.group_by('dtype') >>> print(lp.stringify_stats_mapping(op_map_dtype)) - Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), None, None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), None, None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), None, None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) ... 
].eval_with_dict(param_dict) @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1669,20 +1669,20 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'warp') ... 
].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'warp') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'warp') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'warp') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) - f32 ld a: 1048576 - f32 st c: 524288 - f64 ld g: 65536 - f64 st e: 65536 + f32 ld a: 32768 + f32 st c: 16384 + f64 ld g: 2048 + f64 st e: 2048 :class:`loopy.ToCountMap` also makes it easy to determine the total amount of data moved in bytes. Suppose we want to know the total amount of global @@ -1693,26 +1693,26 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } + 
MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 + 3/8 * l) * n * m : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 + 1/8 * l) * n * m : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') ... ].eval_with_dict(param_dict) >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) - bytes loaded: 7340032 - bytes stored: 2621440 + bytes loaded: 229376 + bytes stored: 81920 One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1731,12 +1731,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... 
outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e, thread) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access consecutive array @@ -1746,13 +1746,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'thread') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'thread') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'thread') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'thread') ... 
].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,12 +1772,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e, thread) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access *nonconsecutive* @@ -1786,13 +1786,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'thread') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'thread') ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'thread') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'thread') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab From 152e17eabfdce30ab1163e32128598675eec708b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 20:09:53 -0600 Subject: [PATCH 09/59] changed default count granularity to None instead of thread, updated tests and everything else accordingly --- loopy/statistics.py | 84 +++++++++---- test/test_statistics.py | 256 +++++++++++++++++++++++++--------------- 2 files changed, 221 insertions(+), 119 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 71c16214d..0776eb1c3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -484,7 +484,7 @@ class Op(object): # FIXME: This could be done much more briefly by inheriting from Record. - def __init__(self, dtype=None, name=None, count_granularity='thread'): + def __init__(self, dtype=None, name=None, count_granularity=None): self.name = name self.count_granularity = count_granularity if dtype is None: @@ -547,7 +547,7 @@ class MemAccess(object): # FIXME: This could be done much more briefly by inheriting from Record. 
def __init__(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity='thread'): + variable=None, count_granularity=None): self.mtype = mtype self.stride = stride self.direction = direction @@ -571,7 +571,7 @@ class MemAccess(object): "mtype is 'local'") def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity=None): + variable=None, count_granularity=None): return MemAccess( mtype=mtype if mtype is not None else self.mtype, dtype=dtype if dtype is not None else self.dtype, @@ -692,7 +692,8 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function)): 1} + name='func:'+str(expr.function), + count_granularity='thread'): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -702,20 +703,27 @@ class ExpressionOpCounter(CounterBase): assert expr.children return ToCountMap( {Op(dtype=self.type_inf(expr), - name='add'): len(expr.children)-1} + name='add', + count_granularity='thread'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1}) + return sum(ToCountMap({Op(dtype=self.type_inf(expr), + name='mul', + count_granularity='thread'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1}) + ToCountMap({Op(dtype=self.type_inf(expr), + name='mul', + count_granularity='thread'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='div', + count_granularity='thread'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -723,23 +731,31 @@ class ExpressionOpCounter(CounterBase): map_remainder = 
map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='pow', + count_granularity='thread'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='shift', + count_granularity='thread'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='bw', + count_granularity='thread'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): + return ToCountMap({Op(dtype=self.type_inf(expr), + name='bw', + count_granularity='thread'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -761,7 +777,9 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): + return ToCountMap({Op(dtype=self.type_inf(expr), + name='maxmin', + count_granularity='thread'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -802,7 +820,8 @@ class LocalMemAccessCounter(MemAccessCounter): array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): - sub_map[MemAccess(mtype='local', dtype=dtype)] = 1 + sub_map[MemAccess(mtype='local', dtype=dtype, + count_granularity='thread')] = 1 return sub_map def map_variable(self, expr): @@ -838,7 +857,8 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, - variable=name): 1} + variable=name, + count_granularity='thread'): 1} ) + 
self.rec(expr.index) def map_subscript(self, expr): @@ -888,7 +908,8 @@ class GlobalMemAccessCounter(MemAccessCounter): "sys.maxsize." % (min_tag_axis)) return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name): 1} + stride=sys.maxsize, variable=name, + count_granularity='thread'): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -1218,8 +1239,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add', count_granularity='thread')].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread')].eval_with_dict(params) # (now use these counts to predict performance) @@ -1247,7 +1268,10 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): % type(insn).__name__) if numpy_types: - op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + op_map.count_map = dict((Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), count) for op, count in six.iteritems(op_map.count_map)) @@ -1296,25 +1320,29 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, dtype=np.float32, stride=1, direction='load', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess(mtype='global', dtype=np.float32, stride=1, direction='store', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='load', - variable='x') + variable='x', + count_granularity='thread') ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='store', - 
variable='x') + variable='x', + count_granularity='thread') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1343,7 +1371,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=disregard_local_axes, count_redundant_work=count_redundant_work) - if count_granularity == 'thread': + if count_granularity is None: + warn_with_kernel(knl, "get_insn_count_assumes_granularity", + "get_insn_count: No count granularity passed for " + "MemAccess, assuming thread granularity.") + return ct + elif count_granularity == 'thread': return ct elif count_granularity == 'warp': return ct/wsize @@ -1420,7 +1453,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, - variable=mem_access.variable + variable=mem_access.variable, + count_granularity=mem_access.count_granularity ), count) for mem_access, count in six.iteritems(access_map.count_map)) diff --git a/test/test_statistics.py b/test/test_statistics.py index 8c3c16c0d..a5132b94f 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -54,11 +54,13 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'thread') + ].eval_with_dict(params) + i32add = 
op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m assert i32add == n*m*2 @@ -79,8 +81,9 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'thread') + ].eval_with_dict(params) assert f32add == f32mul == n*m*ell op_map_dtype = op_map.group_by('dtype') @@ -108,10 +111,12 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'thread') + ].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? 
assert f64add == n*m @@ -138,14 +143,18 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', 'thread')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'thread') + ].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'thread') + ].eval_with_dict(params) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'thread') + ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell assert f64add == 3*n*m @@ -174,12 +183,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 
'shift')].eval_with_dict(params) + i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', 'thread')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'thread')].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'thread') + ].eval_with_dict(params) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') + ].eval_with_dict(params) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') + ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell assert i64bw == 2*n*m @@ -208,7 +220,10 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map( + knl, + count_redundant_work=True + )[lp.Op(np.float64, 'mul', 'thread')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -231,7 +246,7 @@ def test_mem_access_counter_basic(): name="basic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 @@ -239,25 +254,31 @@ def test_mem_access_counter_basic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64l += 
mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform assert f64l == 2*n*m/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert f32s == n*m*ell/32 # /warpsize because these are considered uniform assert f64s == n*m/32 # /warpsize because these are considered uniform @@ -279,15 +300,18 @@ def test_mem_access_counter_reduction(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) assert f32s == n*ell/32 # /warpsize because these are considered uniform @@ -355,30 +379,37 @@ def test_mem_access_counter_specialops(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + 
count_granularity='warp') ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform assert f64 == 2*n*m/32 # /warpsize because these are considered uniform f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert f32 == n*m*ell/32 # /warpsize because these are considered uniform assert f64 == n*m/32 # /warpsize because these are considered uniform - filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) + filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], + count_granularity='warp') #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) assert tot == (n*m*ell + n*m)/32 # /warpsize for uniform @@ -407,24 +438,30 @@ def test_mem_access_counter_bitwise(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - 
stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform @@ -453,32 +490,39 @@ def test_mem_access_counter_mixed(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='x') + stride=0, direction='load', variable='x', + count_granularity='warp') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', - variable='a') + stride=Variable('m'), direction='load', + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec 
+= mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', - variable='b') + stride=Variable('m'), direction='load', + variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform assert f32uniform == n*m*ell/32 # /warpsize for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m'), direction='store', - variable='c') + stride=Variable('m'), direction='store', + variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64uniform == n*m*ell/32 # /warpsize because these are considered uniform assert f32nonconsec == n*m*ell @@ -506,31 +550,37 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', - variable='g') + stride=Variable('m'), direction='load', + variable='g', + count_granularity='thread') ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', - variable='h') + stride=Variable('m'), direction='load', + variable='h', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), - direction='load', variable='a') + stride=Variable('m')*Variable('ell'), + direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), - direction='load', variable='b') + stride=Variable('m')*Variable('ell'), + direction='load', variable='b', + 
count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='store', - variable='e') + stride=Variable('m'), direction='store', + variable='e', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m')*Variable('ell'), - direction='store', variable='c') + stride=Variable('m')*Variable('ell'), + direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -539,26 +589,30 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), - direction='load', variable='g') + direction='load', variable='g', + count_granularity='thread') ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), - direction='load', variable='h') + direction='load', variable='h', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', - variable='b') + variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -586,25 +640,31 @@ def test_mem_access_counter_consec(): params = {'n': n, 'm': m, 'ell': ell} f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='g') + stride=1, direction='load', variable='g', + count_granularity='thread') ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, - stride=1, 
direction='load', variable='h') + stride=1, direction='load', variable='h', + count_granularity='thread') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') + stride=1, direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=1, direction='load', variable='b') + stride=1, direction='load', variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='store', variable='e') + stride=1, direction='store', variable='e', + count_granularity='thread') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c') + stride=1, direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -688,16 +748,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul') + lp.Op(np.float32, 'mul', 'thread') ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add') + lp.Op(np.float32, 'add', 'thread') ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add') + lp.Op(np.int32, 'add', 'thread') ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul') + lp.Op(np.dtype(np.int32), 'mul', 'thread') ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -705,17 +765,20 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_mem_access_map(knl, count_redundant_work=True) f32s1lb = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='b') + stride=1, direction='load', variable='b', + count_granularity='thread') ].eval_with_dict(params) f32s1la = 
op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') + stride=1, direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize f32coal = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c') + stride=1, direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f32coal == n*ell @@ -723,7 +786,8 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load') + direction='load', + count_granularity='thread') ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -773,7 +837,7 @@ def test_summations_and_filters(): name="basic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) n = 512 m = 256 ell = 128 @@ -781,17 +845,21 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) - loads_a = mem_map.filter_by(direction=['load'], variable=['a'] + loads_a = mem_map.filter_by(direction=['load'], variable=['a'], + count_granularity=['warp'] ).eval_and_sum(params) assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform - global_stores = mem_map.filter_by(mtype=['global'], direction=['store'] + global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], + count_granularity=['warp'] ).eval_and_sum(params) assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], + count_granularity=['warp'] ).to_bytes().eval_and_sum(params) - st_bytes = 
mem_map.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], + count_granularity=['warp'] ).to_bytes().eval_and_sum(params) assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform -- GitLab From 88e505b077c86819027226118457622e92b2c625 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 21:38:43 -0600 Subject: [PATCH 10/59] flake8 fixes --- loopy/statistics.py | 11 ++++++----- test/test_statistics.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0776eb1c3..fd1e2039c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -27,7 +27,6 @@ import six import loopy as lp from islpy import dim_type import islpy as isl -from pytools import memoize_in from pymbolic.mapper import CombineMapper from functools import reduce from loopy.kernel.data import ( @@ -1239,8 +1238,10 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='thread')].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add', count_granularity='thread') + ].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread') + ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1360,8 +1361,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - cache_holder = CacheHolder() - + #cache_holder = CacheHolder() + #from pytools import memoize_in #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, disregard_local_axes=False, diff --git a/test/test_statistics.py b/test/test_statistics.py index a5132b94f..b3f4d2226 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -191,7 +191,7 @@ def test_op_counter_bitwise(): i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') ].eval_with_dict(params) i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') - ].eval_with_dict(params) + ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell assert i64bw == 2*n*m -- GitLab From c5cff697ae74e5127dd3b5358562c641d0d53896 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 21:42:22 -0600 Subject: [PATCH 11/59] factoring out m in polynomial --- doc/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8e05cf0f4..c45e711f5 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1703,8 +1703,8 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 + 3/8 * l) * n * m : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 + 1/8 * l) * n * m : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 * m + 3/8 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 * m + 1/8 * m * l) * n : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... 
].eval_with_dict(param_dict) -- GitLab From bc1b9f71a8c46c78ed0c66d6682a2f5f481d0a55 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 13 Jan 2018 17:08:42 -0600 Subject: [PATCH 12/59] changed dependency url --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8f36d125..91e81ee51 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ setup(name="loo.py", }, dependency_links=[ - "hg+https://bitbucket.org/inducer/f2py#egg=f2py==0.3.1" + "git+https://github.com/pearu/f2py.git" ], scripts=["bin/loopy"], -- GitLab From 777eb04ad97768ff26d9567cfbbc095482dd329f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:02:18 -0600 Subject: [PATCH 13/59] renamed warp->subgroup --- loopy/statistics.py | 24 ++++---- test/test_statistics.py | 129 ++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 76 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index fd1e2039c..e3dc2f503 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -897,7 +897,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='warp'): 1} + count_granularity='subgroup'): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -952,7 +952,7 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'thread' if total_stride is not 0 else 'warp' + count_granularity = 'thread' if total_stride is not 0 else 'subgroup' return ToCountMap({MemAccess( mtype='global', @@ -1284,7 +1284,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): # {{{ get_mem_access_map def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - wsize=None): + subgroup_size=None): """Count the number of memory accesses in a loopy kernel. 
:arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1351,12 +1351,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - if wsize is None: - wsize = 32 - warn_with_kernel(knl, "get_mem_access_map_assumes_warpsize", - "get_mem_access_map: No warp size passed, " - "assuming warp size is %d." - % (wsize)) + if subgroup_size is None: + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", + "get_mem_access_map: No subgroup size passed, " + "assuming subgroup size is %d." + % (subgroup_size)) class CacheHolder(object): pass @@ -1379,8 +1379,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, return ct elif count_granularity == 'thread': return ct - elif count_granularity == 'warp': - return ct/wsize + elif count_granularity == 'subgroup': + return ct/subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1397,7 +1397,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, else: raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity must be 'group', " - "'warp', or 'thread'." % (count_granularity)) + "'subgroup', or 'thread'." 
% (count_granularity)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) diff --git a/test/test_statistics.py b/test/test_statistics.py index b3f4d2226..b93e26264 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -247,7 +247,7 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 @@ -255,33 +255,33 @@ def test_mem_access_counter_basic(): params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform - assert f64l == 2*n*m/32 # /warpsize because these are considered uniform + assert f32l == 3*n*m*ell/32 # /subgroup_size because these are uniform + assert f64l == 2*n*m/32 # /subgroup_size because these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - 
count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*m*ell/32 # /warpsize because these are considered uniform - assert f64s == n*m/32 # /warpsize because these are considered uniform + assert f32s == n*m*ell/32 # /subgroup_size because these are uniform + assert f64s == n*m/32 # /subgroup_size because these are uniform def test_mem_access_counter_reduction(): @@ -294,26 +294,26 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert f32l == 2*n*m*ell/32 # /subgroup_size because these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*ell/32 # /warpsize because these are considered uniform + assert f32s == n*ell/32 # /subgroup_size because these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -337,7 +337,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + 
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 @@ -354,9 +354,9 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m/32 # /warpsize because these are considered uniform - assert f64_g_l == n*m/32 # /warpsize because these are considered uniform - assert f64_g_s == n*m/32 # /warpsize because these are considered uniform + assert f32_g_l == 2*n*m/32 # /subgroup_size because these are uniform + assert f64_g_l == n*m/32 # /subgroup_size because these are uniform + assert f64_g_s == n*m/32 # /subgroup_size because these are uniform def test_mem_access_counter_specialops(): @@ -373,46 +373,46 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform - assert f64 == 2*n*m/32 # /warpsize because 
these are considered uniform + assert f32 == 2*n*m*ell/32 # /subgroup_size because these are uniform + assert f64 == 2*n*m/32 # /subgroup_size because these are uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == n*m*ell/32 # /warpsize because these are considered uniform - assert f64 == n*m/32 # /warpsize because these are considered uniform + assert f32 == n*m*ell/32 # /subgroup_size because these are uniform + assert f64 == n*m/32 # /subgroup_size because these are uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity='warp') + count_granularity='subgroup') #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == (n*m*ell + n*m)/32 # /warpsize for uniform + assert tot == (n*m*ell + n*m)/32 # /subgroup_size for uniform def test_mem_access_counter_bitwise(): @@ -432,38 +432,38 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity='warp') + 
count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform + assert i32 == (4*n*m+2*n*m*ell)/32 # /subgroup_size for uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform + assert i32 == (n*m+n*m*ell)/32 # /subgroup_size because these are uniform def test_mem_access_counter_mixed(): @@ -484,22 +484,22 @@ def test_mem_access_counter_mixed(): knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) # noqa n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', @@ -511,20 +511,20 @@ def 
test_mem_access_counter_mixed(): variable='b', count_granularity='thread') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform - assert f32uniform == n*m*ell/32 # /warpsize for uniform + assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform + assert f32uniform == n*m*ell/32 # /subgroup_size for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', count_granularity='thread') ].eval_with_dict(params) - assert f64uniform == n*m*ell/32 # /warpsize because these are considered uniform + assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform assert f32nonconsec == n*m*ell @@ -585,7 +585,8 @@ def test_mem_access_counter_nonconsec(): assert f64nonconsec == n*m assert f32nonconsec == n*m*ell - mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=64) + mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), @@ -843,26 +844,26 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert loads_a == 2*n*m*ell/32 # /subgroup_size because these are uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['warp'] + count_granularity=['subgroup'] 
).eval_and_sum(params) - assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform + assert global_stores == (n*m*ell + n*m)/32 # /subgroup_size for uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform - assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /subgroup_size for uniform + assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /subgroup_size for uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -870,8 +871,8 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell/32 # /warpsize because these are considered uniform - assert f64lall == 2*n*m/32 # /warpsize because these are considered uniform + assert f32lall == 3*n*m*ell/32 # /subgroup_size because these are uniform + assert f64lall == 2*n*m/32 # /subgroup_size because these are uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -904,7 +905,7 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m/32 # /warpsize because these are considered uniform + assert s1f64l == 2*n*m/32 # /subgroup_size because these are uniform def test_strided_footprint(): -- GitLab From 80c6f8b4ad33f31cd5e748103e2c895601ad0f5a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:12:16 -0600 
Subject: [PATCH 14/59] renaming thread->workitem --- loopy/statistics.py | 62 +++++++++++------------ test/test_statistics.py | 107 ++++++++++++++++++++-------------------- 2 files changed, 85 insertions(+), 84 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index e3dc2f503..765c75a8f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -529,7 +529,7 @@ class MemAccess(object): .. attribute:: stride An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all threads access the same item). + indicates a uniform access (i.e. all work items access the same item). .. attribute:: direction @@ -692,7 +692,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - count_granularity='thread'): 1} + count_granularity='workitem'): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -703,7 +703,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity='thread'): len(expr.children)-1} + count_granularity='workitem'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): @@ -711,18 +711,18 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='thread'): 1}) + count_granularity='workitem'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='thread'): -1}) + count_granularity='workitem'): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -732,14 +732,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, 
expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -748,13 +748,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='thread'): + count_granularity='workitem'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -778,7 +778,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity='thread'): + count_granularity='workitem'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -820,7 +820,7 @@ class LocalMemAccessCounter(MemAccessCounter): if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, - count_granularity='thread')] = 1 + count_granularity='workitem')] = 1 return sub_map def map_variable(self, expr): @@ -857,7 +857,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='thread'): 1} + count_granularity='workitem'): 1} ) + self.rec(expr.index) def map_subscript(self, expr): @@ -908,7 +908,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=sys.maxsize, variable=name, - count_granularity='thread'): 
1} + count_granularity='workitem'): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -952,7 +952,7 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'thread' if total_stride is not 0 else 'subgroup' + count_granularity = 'workitem' if total_stride is not 0 else 'subgroup' return ToCountMap({MemAccess( mtype='global', @@ -1238,9 +1238,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='thread') + f32add = op_map[Op(np.float32, 'add', count_granularity='workitem') ].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread') + f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1322,28 +1322,28 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess(mtype='global', dtype=np.float32, stride=1, direction='store', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='load', variable='x', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='store', variable='x', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1366,7 +1366,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, disregard_local_axes=False, - count_granularity='thread'): + count_granularity='workitem'): insn = knl.id_to_insn[insn_id] ct = count_insn_runs( knl, insn, disregard_local_axes=disregard_local_axes, @@ -1375,16 +1375,16 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " - "MemAccess, assuming thread granularity.") + "MemAccess, assuming workitem granularity.") return ct - elif count_granularity == 'thread': + elif count_granularity == 'workitem': return ct elif count_granularity == 'subgroup': return ct/subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() - group_threads = 1 + group_workitems = 1 for size in local_size: try: s = aff_to_expr(size) @@ -1392,12 +1392,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, raise LoopyError("Cannot count insn with group granularity, " "group size is not integer: %s" % (local_size)) - group_threads *= s - return ct/group_threads + group_workitems *= s + return ct/group_workitems else: raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity must be 'group', " - "'subgroup', or 'thread'." % (count_granularity)) + "'subgroup', or 'workitem'." % (count_granularity)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -1468,14 +1468,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl): - """Count the number of synchronization events each thread encounters in a + """Count the number of synchronization events each work item encounters in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. 
:return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per - thread. + work item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1684,7 +1684,7 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each thread encounters in a + """Count the number of synchronization events each work item encounters in a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. diff --git a/test/test_statistics.py b/test/test_statistics.py index b93e26264..f8735553f 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -54,12 +54,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'thread') + f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -81,8 +81,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'thread') + f32add = op_map[lp.Op(np.float32, 'add', 
'workitem')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'workitem') ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -111,11 +111,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'thread') + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -143,17 +143,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', 'thread')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'thread') + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', 'workitem')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) - f64rsq = 
op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'thread') + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'workitem') ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'thread') + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'workitem') ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -183,14 +183,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', 'thread')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'thread')].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'thread') + i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', 'workitem')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'workitem') + ].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'workitem') ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'workitem') ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'workitem') ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -223,7 +224,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', 'thread')] + )[lp.Op(np.float64, 'mul', 'workitem')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -504,12 +505,12 @@ def test_mem_access_counter_mixed(): f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') 
].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform assert f32uniform == n*m*ell/32 # /subgroup_size for uniform @@ -522,7 +523,7 @@ def test_mem_access_counter_mixed(): f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform assert f32nonconsec == n*m*ell @@ -552,22 +553,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -575,12 +576,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = 
mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -591,13 +592,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -605,7 +606,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -613,7 +614,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -642,30 +643,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', 
variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -749,16 +750,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', 'thread') + lp.Op(np.float32, 'mul', 'workitem') ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', 'thread') + lp.Op(np.float32, 'add', 'workitem') ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', 'thread') + lp.Op(np.int32, 'add', 'workitem') ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', 'thread') + lp.Op(np.dtype(np.int32), 'mul', 'workitem') ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -767,11 +768,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32s1la = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -779,7 +780,7 @@ def test_all_counters_parallel_matmul(): f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f32coal == n*ell @@ -788,7 +789,7 @@ def 
test_all_counters_parallel_matmul(): count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 -- GitLab From 46f9acabeb46295fb9780843fae2806437461862 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:47:13 -0600 Subject: [PATCH 15/59] renaming thread->workitem in tutorial --- doc/tutorial.rst | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index c45e711f5..b94708ed3 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1552,12 +1552,12 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), div, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), mul, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), mul, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), add, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add, workitem) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul, workitem) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add, workitem) : [m, l, 
n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'thread')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'workitem')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1657,7 +1657,7 @@ we'll continue using the kernel from the previous example: data type accessed. - stride: An :class:`int` that specifies stride of the memory access. A stride - of 0 indicates a uniform access (i.e. all threads access the same item). + of 0 indicates a uniform access (i.e. all work-items access the same item). - direction: A :class:`str` that specifies the direction of memory access as **load** or **store**. @@ -1720,7 +1720,7 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec. 
~~~~~~~~~~~ Since we have not tagged any of the inames or parallelized the kernel across -threads (which would have produced iname tags), :func:`loopy.get_mem_access_map` +work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map` considers the memory accesses *uniform*, so the *stride* of each access is 0. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time. @@ -1731,28 +1731,28 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e, workitem) : [m, l, n] -> { ... } -With this parallelization, consecutive threads will access consecutive array +With this parallelization, consecutive work-items will access consecutive array elements in memory. 
The polynomials are a bit more complicated now due to the parallelization, but when we evaluate them, we see that the total number of array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'thread') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'workitem') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'thread') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'thread') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'thread') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'workitem') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,27 +1772,27 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : [m, l, n] -> { ... 
} + MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e, workitem) : [m, l, n] -> { ... } -With this parallelization, consecutive threads will access *nonconsecutive* +With this parallelization, consecutive work-items will access *nonconsecutive* array elements in memory. The total number of array accesses still has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'thread') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'workitem') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'thread') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'thread') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'thread') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'workitem') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1819,7 +1819,7 @@ Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`loopy.get_synchronization_map` counts the number of synchronization -events per **thread** in a kernel. First, we'll call this function on the +events per **work-item** in a kernel. First, we'll call this function on the kernel from the previous example: .. 
doctest:: @@ -1877,8 +1877,8 @@ Now to make things more interesting, we'll create a kernel with barriers: } } -In this kernel, when a thread performs the second instruction it uses data -produced by *different* threads during the first instruction. Because of this, +In this kernel, when a work-item performs the second instruction it uses data +produced by *different* work-items during the first instruction. Because of this, barriers are required for correct execution, so loopy inserts them. Now we'll count the barriers using :func:`loopy.get_synchronization_map`: @@ -1890,7 +1890,7 @@ count the barriers using :func:`loopy.get_synchronization_map`: kernel_launch : { 1 } -Based on the kernel code printed above, we would expect each thread to +Based on the kernel code printed above, we would expect each work-item to encounter 50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_map`. In this case, the number of barriers does not depend on any inames, so we can pass an empty dictionary to -- GitLab From 9ba5d96d5f7c529b33694eed29745f04fff819b3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:48:34 -0600 Subject: [PATCH 16/59] renaming warp->subgroup in tutorial --- doc/tutorial.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index b94708ed3..7a2fb04fc 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } 
- MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1669,13 +1669,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'warp') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'subgroup') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'warp') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'subgroup') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'warp') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'subgroup') ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'warp') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'subgroup') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1693,12 +1693,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } 
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') -- GitLab From 61595cb3a877980c4827b0b7a355d3d69f9a02df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 07:45:40 -0600 Subject: [PATCH 17/59] inheriting from record in Op and MemAccess --- loopy/statistics.py | 73 ++++++++++----------------------------------- 1 file changed, 16 insertions(+), 57 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 765c75a8f..4987b27df 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -32,6 +32,7 @@ from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, temp_var_scope) from loopy.diagnostic import warn_with_kernel, LoopyError +from pytools import Record __doc__ = """ @@ -466,7 +467,7 @@ def stringify_stats_mapping(m): # {{{ Op descriptor -class Op(object): +class Op(Record): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -481,26 +482,14 @@ class Op(object): """ - # FIXME: This could be done much more briefly by inheriting from Record. 
- def __init__(self, dtype=None, name=None, count_granularity=None): - self.name = name - self.count_granularity = count_granularity if dtype is None: - self.dtype = dtype + Record.__init__(self, dtype=dtype, name=name, + count_granularity=count_granularity) else: from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) - - def __eq__(self, other): - return isinstance(other, Op) and ( - (self.dtype is None or other.dtype is None or - self.dtype == other.dtype) and - (self.name is None or other.name is None or - self.name == other.name) and - (self.count_granularity is None or - other.count_granularity is None or - self.count_granularity == other.count_granularity)) + Record.__init__(self, dtype=to_loopy_type(dtype), name=name, + count_granularity=count_granularity) def __hash__(self): return hash(str(self)) @@ -513,7 +502,7 @@ class Op(object): # {{{ MemAccess descriptor -class MemAccess(object): +class MemAccess(Record): """A descriptor for a type of memory access. .. attribute:: mtype @@ -547,17 +536,6 @@ class MemAccess(object): def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): - self.mtype = mtype - self.stride = stride - self.direction = direction - self.variable = variable - self.count_granularity = count_granularity - - if dtype is None: - self.dtype = dtype - else: - from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) #TODO currently giving all lmem access stride=None if (mtype == 'local') and (stride is not None): @@ -569,34 +547,15 @@ class MemAccess(object): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity=None): - return MemAccess( - mtype=mtype if mtype is not None else self.mtype, - dtype=dtype if dtype is not None else self.dtype, - stride=stride if stride is not None else self.stride, - direction=direction if 
direction is not None else self.direction, - variable=variable if variable is not None else self.variable, - count_granularity=count_granularity - if count_granularity is not None - else self.count_granularity) - - def __eq__(self, other): - return isinstance(other, MemAccess) and ( - (self.mtype is None or other.mtype is None or - self.mtype == other.mtype) and - (self.dtype is None or other.dtype is None or - self.dtype == other.dtype) and - (self.stride is None or other.stride is None or - self.stride == other.stride) and - (self.direction is None or other.direction is None or - self.direction == other.direction) and - (self.variable is None or other.variable is None or - self.variable == other.variable) and - (self.count_granularity is None or - other.count_granularity is None or - self.count_granularity == other.count_granularity) - ) + if dtype is None: + Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, + direction=direction, variable=variable, + count_granularity=count_granularity) + else: + from loopy.types import to_loopy_type + Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), stride=stride, + direction=direction, variable=variable, + count_granularity=count_granularity) def __hash__(self): return hash(str(self)) -- GitLab From 3d8945c39cd8652962b93223a62c3e74ea34febf Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 08:35:13 -0600 Subject: [PATCH 18/59] line too long, shortened --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 4987b27df..3c88a56fd 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -553,8 +553,8 @@ class MemAccess(Record): count_granularity=count_granularity) else: from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), stride=stride, - direction=direction, variable=variable, + Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), + 
stride=stride, direction=direction, variable=variable, count_granularity=count_granularity) def __hash__(self): -- GitLab From db6d9a4aa0612f6082fe0d5baf106b6db396f159 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 00:22:55 -0600 Subject: [PATCH 19/59] updated docstrings --- loopy/statistics.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3c88a56fd..05009ce49 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -478,7 +478,12 @@ class Op(Record): .. attribute:: name A :class:`str` that specifies the kind of arithmetic operation as - *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + *add*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + + .. attribute:: count_granularity + + A :class:`str` that specifies whether this operation should be counted + once per *workitem*, *subgroup*, or *group*. """ @@ -530,6 +535,11 @@ class MemAccess(Record): A :class:`str` that specifies the variable name of the data accessed. + .. attribute:: count_granularity + + A :class:`str` that specifies whether this operation should be counted + once per *workitem*, *subgroup*, or *group*. + """ # FIXME: This could be done much more briefly by inheriting from Record. @@ -1259,6 +1269,11 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg subgroup_size: A :class:`int` that specifies the sub-group size. This + is used, e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. The default + subgroup_size is 32. + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1380,6 +1395,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, direction="store") # FIXME: (!!!!) 
for now, don't count writes to local mem + # (^this is updated in a branch that will be merged soon) # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): -- GitLab From 93bda9b2ae96242ba3b41dd20f10e1560dd395e8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 00:28:44 -0600 Subject: [PATCH 20/59] removing finished TODO --- loopy/statistics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 05009ce49..dbbdb97da 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -542,8 +542,6 @@ class MemAccess(Record): """ - # FIXME: This could be done much more briefly by inheriting from Record. - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): -- GitLab From d1433a2ab7cc087e94eaffd3a23038a7c1f4e1df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 01:02:18 -0600 Subject: [PATCH 21/59] no longer modifying maps in place when converting to numpy types --- loopy/statistics.py | 50 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index dbbdb97da..6c7f20d36 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1236,14 +1236,19 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): % type(insn).__name__) if numpy_types: - op_map.count_map = dict((Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - count) - for op, count in six.iteritems(op_map.count_map)) - - return op_map + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity + ) + , ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map # }}} @@ -1420,19 +1425,22 @@ def get_mem_access_map(knl, numpy_types=True, 
count_redundant_work=False, % type(insn).__name__) if numpy_types: - # FIXME: Don't modify in-place - access_map.count_map = dict( - (MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable, - count_granularity=mem_access.count_granularity - ), count) - for mem_access, count in six.iteritems(access_map.count_map)) - - return access_map + return ToCountMap( + init_dict=dict( + (MemAccess( + mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable, + count_granularity=mem_access.count_granularity + ) + , ct) + for mem_access, ct in six.iteritems(access_map.count_map)), + val_type=access_map.val_type + ) + else: + return access_map # }}} -- GitLab From dcd7259fc0a0ba6c243b5e5b7587348201e6768e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 03:50:15 -0600 Subject: [PATCH 22/59] fixing formatting problems --- loopy/statistics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6c7f20d36..a56be22a3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1241,8 +1241,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): (Op( dtype=op.dtype.numpy_dtype, name=op.name, - count_granularity=op.count_granularity - ) + count_granularity=op.count_granularity) , ct) for op, ct in six.iteritems(op_map.count_map)), val_type=op_map.val_type @@ -1433,8 +1432,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable, - count_granularity=mem_access.count_granularity - ) + count_granularity=mem_access.count_granularity) , ct) for mem_access, ct in six.iteritems(access_map.count_map)), val_type=access_map.val_type -- GitLab From 
6dfc346bc735f8165bfdd81b0578042663b0292f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 04:16:58 -0600 Subject: [PATCH 23/59] ensuring count_granularity values are valid in Op.__init__ and MemAccess.__init__ --- loopy/statistics.py | 18 ++++++++++++++++-- test/test_statistics.py | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a56be22a3..4dac09c0d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -487,7 +487,13 @@ class Op(Record): """ + count_granularity_options = ["workitem", "subgroup", "group", None] + def __init__(self, dtype=None, name=None, count_granularity=None): + if not count_granularity in self.count_granularity_options: + raise ValueError("Op.__init__: count_granularity '%s' is" + "not allowed. count_granularity options: %s" + % (count_granularity, self.count_granularity_options)) if dtype is None: Record.__init__(self, dtype=dtype, name=name, count_granularity=count_granularity) @@ -542,6 +548,8 @@ class MemAccess(Record): """ + count_granularity_options = ["workitem", "subgroup", "group", None] + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -555,6 +563,11 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") + if not count_granularity in self.count_granularity_options: + raise ValueError("Op.__init__: count_granularity '%s' is" + "not allowed. 
count_granularity options: %s" + % (count_granularity, self.count_granularity_options)) + if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, direction=direction, variable=variable, @@ -1371,9 +1384,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, group_workitems *= s return ct/group_workitems else: + # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" - "not allowed. count_granularity must be 'group', " - "'subgroup', or 'workitem'." % (count_granularity)) + "not allowed. count_granularity options: %s" + % (count_granularity, MemAccess.count_granularity_options)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) diff --git a/test/test_statistics.py b/test/test_statistics.py index f8735553f..82f9f0886 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -672,6 +672,31 @@ def test_mem_access_counter_consec(): assert f32consec == n*m*ell +def test_count_granularity_val_checks(): + + try: + lp.MemAccess(count_granularity='workitem') + lp.MemAccess(count_granularity='subgroup') + lp.MemAccess(count_granularity='group') + lp.MemAccess(count_granularity=None) + assert True + lp.MemAccess(count_granularity='bushel') + assert False + except ValueError: + assert True + + try: + lp.Op(count_granularity='workitem') + lp.Op(count_granularity='subgroup') + lp.Op(count_granularity='group') + lp.Op(count_granularity=None) + assert True + lp.Op(count_granularity='bushel') + assert False + except ValueError: + assert True + + def test_barrier_counter_nobarriers(): knl = lp.make_kernel( -- GitLab From 2705f321105f65ffa94eafa7c9530add3e062ec1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 04:38:17 -0600 Subject: [PATCH 24/59] fixing more flake8 issues --- loopy/statistics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py 
b/loopy/statistics.py index 4dac09c0d..acd2755ba 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -490,7 +490,7 @@ class Op(Record): count_granularity_options = ["workitem", "subgroup", "group", None] def __init__(self, dtype=None, name=None, count_granularity=None): - if not count_granularity in self.count_granularity_options: + if count_granularity not in self.count_granularity_options: raise ValueError("Op.__init__: count_granularity '%s' is" "not allowed. count_granularity options: %s" % (count_granularity, self.count_granularity_options)) @@ -563,7 +563,7 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - if not count_granularity in self.count_granularity_options: + if count_granularity not in self.count_granularity_options: raise ValueError("Op.__init__: count_granularity '%s' is" "not allowed. count_granularity options: %s" % (count_granularity, self.count_granularity_options)) @@ -1254,8 +1254,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): (Op( dtype=op.dtype.numpy_dtype, name=op.name, - count_granularity=op.count_granularity) - , ct) + count_granularity=op.count_granularity), + ct) for op, ct in six.iteritems(op_map.count_map)), val_type=op_map.val_type ) @@ -1446,8 +1446,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable, - count_granularity=mem_access.count_granularity) - , ct) + count_granularity=mem_access.count_granularity), + ct) for mem_access, ct in six.iteritems(access_map.count_map)), val_type=access_map.val_type ) -- GitLab From 8c75f8eeeeefb7075bfd89c4b534125be0e15664 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 03:19:28 -0600 Subject: [PATCH 25/59] changed truediv->floordiv, added ceildiv --- loopy/statistics.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git 
a/loopy/statistics.py b/loopy/statistics.py index acd2755ba..341c6aaa9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,14 +89,23 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ - def __truediv__(self, other): + def __floordiv__(self, other): if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.__truediv__ only valid for " + raise ValueError("GuardedPwQPolynomial.__floordiv__ only valid for " "type int. Attempted to divide by %s" % (type(other))) return GuardedPwQPolynomial( self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), self.valid_domain) + def ceildiv(self, other): + if not isinstance(other, int): + raise ValueError("GuardedPwQPolynomial.ceildiv only valid for " + "type int. Attempted to divide by %s" % (type(other))) + return GuardedPwQPolynomial( + (self.pwqpolynomial + other - 1).scale_val(isl.Val(1).div(isl.Val(other))), + self.valid_domain) + + def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) @@ -1369,7 +1378,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == 'workitem': return ct elif count_granularity == 'subgroup': - return ct/subgroup_size + return ct//subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1382,7 +1391,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "group size is not integer: %s" % (local_size)) group_workitems *= s - return ct/group_workitems + return ct//group_workitems else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" -- GitLab From 0c6fa60190ac8b225410e6bf5e513048540fcec1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:23:00 -0600 Subject: [PATCH 26/59] fixed count_granularity rounding behavior for groups not evenly divisible by subgroups, updated tests 
--- loopy/statistics.py | 45 ++++++----- test/test_statistics.py | 168 +++++++++++++++++++++++++++++++--------- 2 files changed, 158 insertions(+), 55 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 341c6aaa9..936a840b1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1166,6 +1166,7 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): + insn_inames = knl.insn_inames(insn) if disregard_local_axes: @@ -1363,35 +1364,41 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, #from pytools import memoize_in #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? def get_insn_count(knl, insn_id, - disregard_local_axes=False, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] - ct = count_insn_runs( - knl, insn, disregard_local_axes=disregard_local_axes, - count_redundant_work=count_redundant_work) if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " "MemAccess, assuming workitem granularity.") - return ct - elif count_granularity == 'workitem': - return ct + count_granularity == 'workitem' + + if count_granularity == 'workitem': + return count_insn_runs( + knl, insn, count_redundant_work=count_redundant_work) + + ct_disregard_local = count_insn_runs( + knl, insn, disregard_local_axes=True, + count_redundant_work=count_redundant_work) + + if count_granularity == 'group': + return ct_disregard_local elif count_granularity == 'subgroup': - return ct//subgroup_size - elif count_granularity == 'group': + # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() - group_workitems = 1 - for size in local_size: - try: + global_size, local_size = knl.get_grid_size_upper_bounds() + group_size = 1 + if local_size: + for size in local_size: s = 
aff_to_expr(size) - except AttributeError: - raise LoopyError("Cannot count insn with group granularity, " - "group size is not integer: %s" - % (local_size)) - group_workitems *= s - return ct//group_workitems + if not isinstance(s, int): + raise LoopyError("Cannot count insn with subgroup granularity, " + "group size is not integer: %s" + % (local_size)) + group_size *= s + + from pytools import div_ceil + return ct_disregard_local*div_ceil(group_size, subgroup_size) else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" diff --git a/test/test_statistics.py b/test/test_statistics.py index 82f9f0886..c2fb4ffe2 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -30,6 +30,7 @@ from pyopencl.tools import ( # noqa import loopy as lp from loopy.types import to_loopy_type import numpy as np +from pytools import div_ceil from pymbolic.primitives import Variable @@ -248,12 +249,21 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -270,8 +280,10 @@ def test_mem_access_counter_basic(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 3*n*m*ell/32 # /subgroup_size because these are uniform - assert f64l == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32l == 
(3*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -281,8 +293,10 @@ def test_mem_access_counter_basic(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*m*ell/32 # /subgroup_size because these are uniform - assert f64s == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32s == (n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64s == (n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_reduction(): @@ -295,11 +309,20 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -308,13 +331,17 @@ def test_mem_access_counter_reduction(): stride=0, direction='load', variable='b', count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 2*n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*ell/32 # 
/subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32s == (n*ell)*n_groups*subgroups_per_group # these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -338,12 +365,20 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), @@ -355,9 +390,11 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m/32 # /subgroup_size because these are uniform - assert f64_g_l == n*m/32 # /subgroup_size because these are uniform - assert f64_g_s == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + assert f64_g_l == (n*m)*n_groups*subgroups_per_group # these are uniform + assert f64_g_s == (n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_specialops(): @@ -374,11 +411,20 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + 
subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -395,8 +441,10 @@ def test_mem_access_counter_specialops(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == 2*n*m*ell/32 # /subgroup_size because these are uniform - assert f64 == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64 == (2*n*m)*n_groups*subgroups_per_group # these are uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -406,14 +454,17 @@ def test_mem_access_counter_specialops(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == n*m*ell/32 # /subgroup_size because these are uniform - assert f64 == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32 == (n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64 == (n*m)*n_groups*subgroups_per_group # these are uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity='subgroup') - #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == (n*m*ell + n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_bitwise(): @@ -433,11 +484,19 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, 
count_redundant_work=True, subgroup_size=32) + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -454,7 +513,9 @@ def test_mem_access_counter_bitwise(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (4*n*m+2*n*m*ell)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group # these are uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', @@ -464,7 +525,9 @@ def test_mem_access_counter_bitwise(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (n*m+n*m*ell)/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_mixed(): @@ -478,18 +541,28 @@ def test_mem_access_counter_mixed(): """ ], name="mixed", assumptions="n,m,ell >= 1") + knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) - bsize = 16 - knl = lp.split_iname(knl, "j", bsize) + + bsize0 = 65 + subgroup_size = 32 + + knl = lp.split_iname(knl, "j", bsize0) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) # noqa n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = div_ceil(ell, bsize0) + group_size = bsize0 + subgroups_per_group = div_ceil(group_size, subgroup_size) + + 
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', count_granularity='subgroup') @@ -512,8 +585,11 @@ def test_mem_access_counter_mixed(): variable='b', count_granularity='workitem') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform - assert f32uniform == n*m*ell/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f64uniform == (2*n*m)*n_groups*subgroups_per_group # /subgroup_size for uniform + assert f32uniform == (m*n)*n_groups*subgroups_per_group # /subgroup_size for uniform + assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -525,7 +601,9 @@ def test_mem_access_counter_mixed(): variable='c', count_granularity='workitem') ].eval_with_dict(params) - assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f64uniform == m*n*n_groups*subgroups_per_group # /subgroup_size for uniform assert f32nonconsec == n*m*ell @@ -865,22 +943,34 @@ def test_summations_and_filters(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + + subgroup_size = 32 + n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], count_granularity=['subgroup'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert loads_a == 
(2*n*m*ell)*n_groups*subgroups_per_group # these are uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).eval_and_sum(params) - assert global_stores == (n*m*ell + n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=['subgroup'] @@ -888,8 +978,10 @@ def test_summations_and_filters(): st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /subgroup_size for uniform - assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group # these are uniform + assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # these are uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -897,8 +989,10 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell/32 # /subgroup_size because these are uniform - assert f64lall == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64lall == (2*n*m)*n_groups*subgroups_per_group # these are uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -931,7 +1025,9 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = 
mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert s1f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform def test_strided_footprint(): -- GitLab From aa7fb37a3155311a31efbecf80c9cc889405b7d0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:24:07 -0600 Subject: [PATCH 27/59] removed now-unused div functions in GuardedPwQPolynomial --- loopy/statistics.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 936a840b1..19fa3d71a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,23 +89,6 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ - def __floordiv__(self, other): - if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.__floordiv__ only valid for " - "type int. Attempted to divide by %s" % (type(other))) - return GuardedPwQPolynomial( - self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), - self.valid_domain) - - def ceildiv(self, other): - if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.ceildiv only valid for " - "type int. 
Attempted to divide by %s" % (type(other))) - return GuardedPwQPolynomial( - (self.pwqpolynomial + other - 1).scale_val(isl.Val(1).div(isl.Val(other))), - self.valid_domain) - - def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) -- GitLab From f591dac06746a5c49e3174c5ab9eeb227c1f99a0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:46:56 -0600 Subject: [PATCH 28/59] fixed flake8 issues --- loopy/statistics.py | 4 +- test/test_statistics.py | 86 ++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 19fa3d71a..92762156b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1375,8 +1375,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): - raise LoopyError("Cannot count insn with subgroup granularity, " - "group size is not integer: %s" + raise LoopyError("Cannot count insn with subgroup " + "granularity, group size is not integer: %s" % (local_size)) group_size *= s diff --git a/test/test_statistics.py b/test/test_statistics.py index c2fb4ffe2..7d1b6df0d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -281,9 +281,9 @@ def test_mem_access_counter_basic(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group + assert f64l == (2*n*m)*n_groups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -294,9 +294,9 @@ def test_mem_access_counter_basic(): 
count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32s == (n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64s == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32s == (n*m*ell)*n_groups*subgroups_per_group + assert f64s == (n*m)*n_groups*subgroups_per_group def test_mem_access_counter_reduction(): @@ -332,16 +332,16 @@ def test_mem_access_counter_reduction(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32s == (n*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32s == (n*ell)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -391,10 +391,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group # these are uniform - assert f64_g_l == (n*m)*n_groups*subgroups_per_group # these are uniform - assert f64_g_s == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group + assert f64_g_l == (n*m)*n_groups*subgroups_per_group + assert f64_g_s == (n*m)*n_groups*subgroups_per_group def 
test_mem_access_counter_specialops(): @@ -442,9 +442,9 @@ def test_mem_access_counter_specialops(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64 == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group + assert f64 == (2*n*m)*n_groups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -455,16 +455,16 @@ def test_mem_access_counter_specialops(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32 == (n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64 == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32 == (n*m*ell)*n_groups*subgroups_per_group + assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity='subgroup') tot = filtered_map.eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group def test_mem_access_counter_bitwise(): @@ -514,8 +514,8 @@ def test_mem_access_counter_bitwise(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, 
direction='store', variable='c', @@ -526,8 +526,8 @@ def test_mem_access_counter_bitwise(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group def test_mem_access_counter_mixed(): @@ -586,9 +586,9 @@ def test_mem_access_counter_mixed(): count_granularity='workitem') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f64uniform == (2*n*m)*n_groups*subgroups_per_group # /subgroup_size for uniform - assert f32uniform == (m*n)*n_groups*subgroups_per_group # /subgroup_size for uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f64uniform == (2*n*m)*n_groups*subgroups_per_group + assert f32uniform == (m*n)*n_groups*subgroups_per_group assert f32nonconsec == 3*n*m*ell @@ -602,8 +602,8 @@ def test_mem_access_counter_mixed(): count_granularity='workitem') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f64uniform == m*n*n_groups*subgroups_per_group # /subgroup_size for uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f64uniform == m*n*n_groups*subgroups_per_group assert f32nonconsec == n*m*ell @@ -962,15 +962,15 @@ def test_summations_and_filters(): count_granularity=['subgroup'] ).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert global_stores == (n*m*ell + 
n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=['subgroup'] @@ -979,9 +979,9 @@ def test_summations_and_filters(): count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group # these are uniform - assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group + assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -990,9 +990,9 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64lall == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group + assert f64lall == (2*n*m)*n_groups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -1026,8 +1026,8 @@ def test_summations_and_filters(): key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert s1f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert s1f64l == (2*n*m)*n_groups*subgroups_per_group def 
test_strided_footprint(): -- GitLab From d1df544477c2073c44db1dfc31dc9a5a14fc31e1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:50:01 -0600 Subject: [PATCH 29/59] fixed flake8 issue --- test/test_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 7d1b6df0d..25ae3b2da 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -991,7 +991,7 @@ def test_summations_and_filters(): ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group + assert f32lall == (3*n*m*ell)*n_groups*subgroups_per_group assert f64lall == (2*n*m)*n_groups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) -- GitLab From fffeb48b647e631257a4f7211f7304b5f67e7461 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 17:18:49 -0600 Subject: [PATCH 30/59] updated doctests for count_granularity rounding changes --- doc/tutorial.rst | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7a2fb04fc..5fd4f72cb 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, 
h, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1679,10 +1679,10 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) - f32 ld a: 32768 - f32 st c: 16384 - f64 ld g: 2048 - f64 st e: 2048 + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 :class:`loopy.ToCountMap` also makes it easy to determine the total amount of data moved in bytes. 
Suppose we want to know the total amount of global @@ -1693,26 +1693,26 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... 
).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 * m + 3/8 * m * l) * n : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 * m + 1/8 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') ... ].eval_with_dict(param_dict) >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) - bytes loaded: 229376 - bytes stored: 81920 + bytes loaded: 7340032 + bytes stored: 2621440 One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. 
-- GitLab From 0df9212cd9b9d4d55240af65263fecfe30c95bf2 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 25 Jan 2018 15:12:22 -0600 Subject: [PATCH 31/59] renamed kernel --- test/test_statistics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 25ae3b2da..a507c1bd0 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -540,7 +540,7 @@ def test_mem_access_counter_mixed(): e[i, k] = g[i,k]*(2+h[i,k]) """ ], - name="mixed", assumptions="n,m,ell >= 1") + name="mixed_knl", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, @@ -589,7 +589,6 @@ def test_mem_access_counter_mixed(): # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert f64uniform == (2*n*m)*n_groups*subgroups_per_group assert f32uniform == (m*n)*n_groups*subgroups_per_group - assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, -- GitLab From ca62d04904552dbeedf5db8f4a115925002565c3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 25 Jan 2018 15:13:26 -0600 Subject: [PATCH 32/59] putting unnecessary default arg back into call to count_insn_runs --- loopy/statistics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 92762156b..d08046876 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1343,9 +1343,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - #cache_holder = CacheHolder() - #from pytools import memoize_in - #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? + # cache_holder = CacheHolder() + # from pytools import memoize_in + # @memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] @@ -1358,7 +1358,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity == 'workitem': return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work) + knl, insn, count_redundant_work=count_redundant_work, + disregard_local_axes=False) ct_disregard_local = count_insn_runs( knl, insn, disregard_local_axes=True, -- GitLab From 9b760d4ad8a848217a3c0e02f71ba947470dce5f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jan 2018 08:29:20 -0600 Subject: [PATCH 33/59] made test_mem_access_counter_mixed handle non-barvinok counting --- test/test_statistics.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index a507c1bd0..0c51bb7e5 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -531,7 +531,6 @@ def test_mem_access_counter_bitwise(): def test_mem_access_counter_mixed(): - knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i Date: Tue, 30 Jan 2018 16:28:02 -0600 Subject: [PATCH 34/59] added warning, get_insn_count uses upper bound for group size --- loopy/statistics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d08046876..bd61cda3f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1370,7 +1370,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == 'subgroup': # get the group size from loopy.symbolic import aff_to_expr - global_size, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds() group_size = 1 if local_size: for size in local_size: @@ -1381,6 +1381,13 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, % (local_size)) group_size *= s + warn_with_kernel(knl, 
"insn_count_subgroups_upper_bound", + "get_insn_count: when counting instruction %s with " + "count_granularity=subgroup, using upper bound for group size " + "(%d workitems) to compute subgroups per group. If kernel has " + "multiple device programs, actual subgroup count may be lower." + % (insn_id, group_size)) + from pytools import div_ceil return ct_disregard_local*div_ceil(group_size, subgroup_size) else: -- GitLab From b26368374d754861f2ff22a204fa4a8adc70bbfe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 30 Jan 2018 16:42:35 -0600 Subject: [PATCH 35/59] re-worded warning --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index bd61cda3f..6eb17aca1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1384,8 +1384,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=subgroup, using upper bound for group size " - "(%d workitems) to compute subgroups per group. If kernel has " - "multiple device programs, actual subgroup count may be lower." + "(%d workitems) to compute subgroups per group. When multiple " + "device programs present, actual subgroup count may be lower." 
% (insn_id, group_size)) from pytools import div_ceil -- GitLab From ef79671a3f3ad2116df2704dfba78183d9b5770f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 15:50:24 -0600 Subject: [PATCH 36/59] made docstring comment more precise about usage of counts --- loopy/statistics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6eb17aca1..2019a5791 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -225,7 +225,7 @@ class ToCountMap(object): variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -273,7 +273,7 @@ class ToCountMap(object): filtered_map = mem_map.filter_by_func(filter_func) tot = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -328,7 +328,7 @@ class ToCountMap(object): f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -385,7 +385,7 @@ class ToCountMap(object): mtype=['global'], stride=[2], direction=['store']).eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -442,7 +442,7 @@ class ToCountMap(object): variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ return self.sum().eval_with_dict(params) @@ -1216,7 +1216,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') ].eval_with_dict(params) - # (now use these counts to predict performance) + # (now 
use these counts to, e.g., predict performance) """ @@ -1328,7 +1328,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, count_granularity='workitem') ].eval_with_dict(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1488,7 +1488,7 @@ def get_synchronization_map(knl): params = {'n': 512, 'm': 256, 'l': 128} barrier_ct = sync_map['barrier_local'].eval_with_dict(params) - # (now use this count to predict performance) + # (now use this count to, e.g., predict performance) """ -- GitLab From d56044f8ce19abfb2f529aa11d97f514e8e311a6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:12:54 -0600 Subject: [PATCH 37/59] can't pass kwarg to get_insn_count when using @memoize_in, so passing key.count_granularity without keyword --- loopy/statistics.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2019a5791..f6ca1eac5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1343,11 +1343,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - # cache_holder = CacheHolder() - # from pytools import memoize_in - # @memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
- def get_insn_count(knl, insn_id, - count_granularity='workitem'): + cache_holder = CacheHolder() + from pytools import memoize_in + @memoize_in(cache_holder, "insn_count") + def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1426,8 +1425,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, - count_granularity=key.count_granularity)) + * get_insn_count(knl, insn.id, key.count_granularity)) #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee_g.count_map): @@ -1435,8 +1433,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, - count_granularity=key.count_granularity)) + * get_insn_count(knl, insn.id, key.count_granularity)) # for now, don't count writes to local mem elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass -- GitLab From 7e237cf03780e696faf5063c495e2e31f58003f0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:34:25 -0600 Subject: [PATCH 38/59] added unused subgroup_size argument to get_op_map and get_syncronization_map for consistency and potential future use --- loopy/statistics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f6ca1eac5..7babc7b6b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1179,7 +1179,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False): +def get_op_map(knl, numpy_types=True, count_redundant_work=False, + subgroup_size=None): """Count the number of operations in a loopy kernel. 
@@ -1463,7 +1464,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # {{{ get_synchronization_map -def get_synchronization_map(knl): +def get_synchronization_map(knl, subgroup_size=None): """Count the number of synchronization events each work item encounters in a loopy kernel. -- GitLab From 6fbf29d71924533b040b5b1a3c3f48d15774f208 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:36:22 -0600 Subject: [PATCH 39/59] fixed flake8 issue --- loopy/statistics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7babc7b6b..64b849a43 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1346,6 +1346,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, cache_holder = CacheHolder() from pytools import memoize_in + @memoize_in(cache_holder, "insn_count") def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] -- GitLab From f790e4397c82262793bc23e4ce436af14dc52630 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 9 Feb 2018 00:59:09 -0600 Subject: [PATCH 40/59] comment documenting reason for overriding Record.__repr__ --- loopy/statistics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 64b849a43..a6b461e88 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -498,6 +498,7 @@ class Op(Record): return hash(str(self)) def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} @@ -574,6 +575,7 @@ class MemAccess(Record): return hash(str(self)) def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, -- GitLab From 479d89b822281ffc895b76f967679186a1e29123 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 20:47:39 -0600 Subject: 
[PATCH 41/59] made CountGranularity class to contain cg strings --- loopy/statistics.py | 173 ++++++++++++++++++++++++-------------- test/test_statistics.py | 179 ++++++++++++++++++++-------------------- 2 files changed, 199 insertions(+), 153 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a6b461e88..2305144ac 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -40,6 +40,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -457,6 +458,31 @@ def stringify_stats_mapping(m): return result +class CountGranularity: + """Strings specifying whether an operation should be counted once per + *work-item*, *sub-group*, or *group*. + + .. attribute :: WORKITEM + + A :class:`str` that specifies that an operation should be counted + once per *work-item*. + + .. attribute :: SUBGROUP + + A :class:`str` that specifies that an operation should be counted + once per *sub-group*. + + .. attribute :: GROUP + + A :class:`str` that specifies that an operation should be counted + once per *group*. + + """ + WORKITEM = "workitem" + SUBGROUP = "subgroup" + GROUP = "group" + + # {{{ Op descriptor class Op(Record): @@ -479,7 +505,10 @@ class Op(Record): """ - count_granularity_options = ["workitem", "subgroup", "group", None] + count_granularity_options = [CountGranularity.WORKITEM, + CountGranularity.SUBGROUP, + CountGranularity.GROUP, + None] def __init__(self, dtype=None, name=None, count_granularity=None): if count_granularity not in self.count_granularity_options: @@ -522,7 +551,7 @@ class MemAccess(Record): .. attribute:: stride An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all work items access the same item). + indicates a uniform access (i.e. all work-items access the same item). .. 
attribute:: direction @@ -541,7 +570,10 @@ class MemAccess(Record): """ - count_granularity_options = ["workitem", "subgroup", "group", None] + count_granularity_options = [CountGranularity.WORKITEM, + CountGranularity.SUBGROUP, + CountGranularity.GROUP, + None] def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -666,7 +698,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - count_granularity='workitem'): 1} + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -677,7 +709,8 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity='workitem'): len(expr.children)-1} + count_granularity=CountGranularity.WORKITEM): + len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): @@ -685,18 +718,18 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='workitem'): 1}) + count_granularity=CountGranularity.WORKITEM): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='workitem'): -1}) + count_granularity=CountGranularity.WORKITEM): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -706,14 +739,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def 
map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -722,13 +755,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='workitem'): + count_granularity=CountGranularity.WORKITEM): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -752,7 +785,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity='workitem'): + count_granularity=CountGranularity.WORKITEM): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -794,7 +827,7 @@ class LocalMemAccessCounter(MemAccessCounter): if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, - count_granularity='workitem')] = 1 + count_granularity=CountGranularity.WORKITEM)] = 1 return sub_map def map_variable(self, expr): @@ -831,7 +864,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='workitem'): 1} + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) def map_subscript(self, expr): @@ -868,10 +901,11 @@ class GlobalMemAccessCounter(MemAccessCounter): if not local_id_found: # count as uniform access - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), stride=0, - variable=name, - count_granularity='subgroup'): 1} + return ToCountMap({MemAccess( + 
mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name, + count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -879,10 +913,11 @@ class GlobalMemAccessCounter(MemAccessCounter): "GlobalSubscriptCounter: Memory access minimum " "tag axis %d != 0, stride unknown, using " "sys.maxsize." % (min_tag_axis)) - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name, - count_granularity='workitem'): 1} + return ToCountMap({MemAccess( + mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name, + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -926,7 +961,8 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'workitem' if total_stride is not 0 else 'subgroup' + count_granularity = CountGranularity.WORKITEM if total_stride is not 0 \ + else CountGranularity.SUBGROUP return ToCountMap({MemAccess( mtype='global', @@ -1214,9 +1250,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='workitem') + f32add = op_map[Op(np.float32, + 'add', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') + f32mul = op_map[Op(np.float32, + 'mul', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) # (now use these counts to, e.g., predict performance) @@ -1302,33 +1342,37 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, params = {'n': 512, 'm': 256, 'l': 128} mem_map = get_mem_access_map(knl) - f32_s1_g_ld_a = mem_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='load', - variable='a', - count_granularity='workitem') + 
f32_s1_g_ld_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + stride=1, + direction='load', + variable='a', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_g_st_a = mem_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='store', - variable='a', - count_granularity='workitem') + f32_s1_g_st_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + stride=1, + direction='store', + variable='a', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='load', - variable='x', - count_granularity='workitem') + f32_s1_l_ld_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + stride=1, + direction='load', + variable='x', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_l_st_x = mem_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='store', - variable='x', - count_granularity='workitem') + f32_s1_l_st_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + stride=1, + direction='store', + variable='x', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) # (now use these counts to, e.g., predict performance) @@ -1350,16 +1394,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, from pytools import memoize_in @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_id, count_granularity='workitem'): + def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " - "MemAccess, assuming workitem granularity.") - count_granularity == 'workitem' + "MemAccess, assuming %s granularity." 
+ % (CountGranularity.WORKITEM)) + count_granularity == CountGranularity.WORKITEM - if count_granularity == 'workitem': + if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( knl, insn, count_redundant_work=count_redundant_work, disregard_local_axes=False) @@ -1368,9 +1413,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) - if count_granularity == 'group': + if count_granularity == CountGranularity.GROUP: return ct_disregard_local - elif count_granularity == 'subgroup': + elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1379,17 +1424,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): - raise LoopyError("Cannot count insn with subgroup " - "granularity, group size is not integer: %s" - % (local_size)) + raise LoopyError("Cannot count insn with %s granularity, " + "group size is not integer: %s" + % (CountGranularity.SUBGROUP, local_size)) group_size *= s warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " - "count_granularity=subgroup, using upper bound for group size " + "count_granularity=%s, using upper bound for group size " "(%d workitems) to compute subgroups per group. When multiple " "device programs present, actual subgroup count may be lower." 
- % (insn_id, group_size)) + % (insn_id, CountGranularity.SUBGROUP, group_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(group_size, subgroup_size) @@ -1469,14 +1514,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl, subgroup_size=None): - """Count the number of synchronization events each work item encounters in a + """Count the number of synchronization events each work-item encounters in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per - work item. + work-item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1685,7 +1730,7 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each work item encounters in a + """Count the number of synchronization events each work-item encounters in a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. 
diff --git a/test/test_statistics.py b/test/test_statistics.py index 0c51bb7e5..25c6dffee 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -31,6 +31,7 @@ import loopy as lp from loopy.types import to_loopy_type import numpy as np from pytools import div_ceil +from loopy.statistics import CountGranularity as cg from pymbolic.primitives import Variable @@ -55,12 +56,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'workitem') + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -82,8 +83,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'workitem') + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', cg.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -112,11 +113,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f64add = 
op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'workitem') + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', cg.WORKITEM)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -144,17 +145,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', 'workitem')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'workitem') + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', cg.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'workitem') + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', cg.WORKITEM) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'workitem') + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', cg.WORKITEM) ].eval_with_dict(params) assert f32div 
== 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -184,15 +185,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', 'workitem')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'workitem') + i32add = op_map[lp.Op(np.int32, 'add', cg.WORKITEM)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', cg.WORKITEM)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', cg.WORKITEM) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'workitem') + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', cg.WORKITEM) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'workitem') + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', cg.WORKITEM) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'workitem') + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', cg.WORKITEM) ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -225,7 +226,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', 'workitem')] + )[lp.Op(np.float64, 'mul', cg.WORKITEM)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -266,19 +267,19 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) 
].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -287,11 +288,11 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -325,11 +326,11 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -337,7 +338,7 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -427,19 +428,19 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) 
].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -448,11 +449,11 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -460,7 +461,7 @@ def test_mem_access_counter_specialops(): assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) tot = filtered_map.eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -499,19 +500,19 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', 
variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -519,11 +520,11 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -564,25 +565,25 @@ def test_mem_access_counter_mixed(): subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -608,12 +609,12 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='subgroup') + 
count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -652,22 +653,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -675,12 +676,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -691,13 +692,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - 
count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -705,7 +706,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -713,7 +714,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -742,30 +743,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec = 
mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -774,9 +775,9 @@ def test_mem_access_counter_consec(): def test_count_granularity_val_checks(): try: - lp.MemAccess(count_granularity='workitem') - lp.MemAccess(count_granularity='subgroup') - lp.MemAccess(count_granularity='group') + lp.MemAccess(count_granularity=cg.WORKITEM) + lp.MemAccess(count_granularity=cg.SUBGROUP) + lp.MemAccess(count_granularity=cg.GROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -785,9 +786,9 @@ def test_count_granularity_val_checks(): assert True try: - lp.Op(count_granularity='workitem') - lp.Op(count_granularity='subgroup') - lp.Op(count_granularity='group') + lp.Op(count_granularity=cg.WORKITEM) + lp.Op(count_granularity=cg.SUBGROUP) + lp.Op(count_granularity=cg.GROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -874,16 +875,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', 'workitem') + lp.Op(np.float32, 'mul', cg.WORKITEM) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', 'workitem') + lp.Op(np.float32, 'add', cg.WORKITEM) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', 'workitem') + lp.Op(np.int32, 'add', cg.WORKITEM) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', 'workitem') + lp.Op(np.dtype(np.int32), 'mul', cg.WORKITEM) ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -892,11 +893,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32s1la = 
op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -904,7 +905,7 @@ def test_all_counters_parallel_matmul(): f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f32coal == n*ell @@ -913,7 +914,7 @@ def test_all_counters_parallel_matmul(): count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -980,24 +981,24 @@ def test_summations_and_filters(): subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).to_bytes().eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group -- GitLab From c5a06ed8a6bb2f68096e3a4cbe4651950f02d1be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:23:17 -0600 Subject: 
[PATCH 42/59] attempt to get subgroup size from device, don't guess unless explicitly told --- loopy/statistics.py | 45 +++++++++++++++++++++++++++++++---------- test/test_statistics.py | 18 ++++++++++------- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2305144ac..2b5e3876e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,7 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -566,7 +566,7 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -1323,7 +1323,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, :arg subgroup_size: A :class:`int` that specifies the sub-group size. This is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. The default - subgroup_size is 32. + sub-group_size is 32. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1380,12 +1380,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - if subgroup_size is None: - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", - "get_mem_access_map: No subgroup size passed, " - "assuming subgroup size is %d." 
- % (subgroup_size)) + if not isinstance(subgroup_size, int): + # try to find subgroup_size + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: + from pyopencl.characterize import get_simd_group_size + subgroup_size_guess = get_simd_group_size(knl.target.device, None) + warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", + "subgroup_size passed: %s. Device: %s. Using " + "sub-group size given by get_simd_group_size(): %d" + % (subgroup_size, knl.target.device, + subgroup_size_guess)) + subgroup_size = subgroup_size_guess + elif subgroup_size == 'guess': + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", + "get_mem_access_map: 'guess' sub-group size passed, " + "no target device found, wildly guessing that " + "sub-group size is %d." + % (subgroup_size)) + + if subgroup_size is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed and no target device found. " + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") class CacheHolder(object): pass @@ -1432,8 +1455,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for group size " - "(%d workitems) to compute subgroups per group. When multiple " - "device programs present, actual subgroup count may be lower." + "(%d work-items) to compute sub-groups per group. When multiple " + "device programs present, actual sub-group count may be lower." 
% (insn_id, CountGranularity.SUBGROUP, group_size)) from pytools import div_ceil diff --git a/test/test_statistics.py b/test/test_statistics.py index 25c6dffee..9bfea34ab 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -645,7 +645,8 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) # noqa n = 512 m = 256 ell = 128 @@ -735,7 +736,8 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size='guess') n = 512 m = 256 ell = 128 @@ -889,13 +891,14 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*ell*2 - op_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) - f32s1lb = op_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', count_granularity=cg.WORKITEM) ].eval_with_dict(params) - f32s1la = op_map[lp.MemAccess('global', np.float32, + f32s1la = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -903,7 +906,7 @@ def test_all_counters_parallel_matmul(): assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = op_map[lp.MemAccess('global', np.float32, + f32coal = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -911,7 +914,8 @@ def 
test_all_counters_parallel_matmul(): assert f32coal == n*ell local_mem_map = lp.get_mem_access_map(knl, - count_redundant_work=True).filter_by(mtype=['local']) + count_redundant_work=True, + subgroup_size=32).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', count_granularity=cg.WORKITEM) -- GitLab From 8af713d354ab3b63c749c72c66b204856f02eb6c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:52:08 -0600 Subject: [PATCH 43/59] fixing flake8 issues --- test/test_statistics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 9bfea34ab..0687bff5a 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -895,21 +895,21 @@ def test_all_counters_parallel_matmul(): subgroup_size=32) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='load', variable='b', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='load', variable='a', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize f32coal = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='store', variable='c', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) assert f32coal == n*ell -- GitLab From 6ffb9e3a654c8aab0b456ade2adecd001c0d82ef Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:53:00 -0600 Subject: [PATCH 44/59] passing subgroup size to get_mem_access_map in tutorial --- 
doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 5fd4f72cb..dd0bf6367 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1636,7 +1636,7 @@ we'll continue using the kernel from the previous example: .. doctest:: - >>> mem_map = lp.get_mem_access_map(knl) + >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } @@ -1729,7 +1729,7 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. >>> knl_consec = lp.split_iname(knl, "k", 128, ... outer_tag="l.1", inner_tag="l.0") - >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : [m, l, n] -> { ... } MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : [m, l, n] -> { ... } @@ -1770,7 +1770,7 @@ switch the inner and outer tags in our parallelization of the kernel: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, ... outer_tag="l.0", inner_tag="l.1") - >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : [m, l, n] -> { ... } MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : [m, l, n] -> { ... 
} -- GitLab From 66693459ed20a65f6180a531e2690c37ae33e4ca Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:54:10 -0600 Subject: [PATCH 45/59] added CountGranularity to loopy.__init__ --- loopy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 5e8a3fb06..b6a72c021 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,8 +119,8 @@ from loopy.transform.add_barrier import add_barrier from loopy.type_inference import infer_unknown_types from loopy.preprocess import preprocess_kernel, realize_reduction from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op, - MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, +from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, + Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) -- GitLab From dd57c36050600bca28bcf630d6d33bd91c4d6cfe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:04:20 -0600 Subject: [PATCH 46/59] fixed flake8 issue --- test/test_statistics.py | 180 ++++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 0687bff5a..7a5d13949 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -31,7 +31,7 @@ import loopy as lp from loopy.types import to_loopy_type import numpy as np from pytools import div_ceil -from loopy.statistics import CountGranularity as cg +from loopy.statistics import CountGranularity as CG from pymbolic.primitives import Variable @@ -56,12 +56,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = 
op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', cg.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -83,8 +83,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', cg.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -113,11 +113,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', cg.WORKITEM)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', cg.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 
'add', CG.WORKITEM) ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -145,17 +145,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', cg.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', cg.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', cg.WORKITEM) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', cg.WORKITEM) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM) ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -185,15 +185,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', cg.WORKITEM)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', cg.WORKITEM)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', cg.WORKITEM) + i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 
'bw', CG.WORKITEM)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', cg.WORKITEM) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', cg.WORKITEM) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', cg.WORKITEM) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM) ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -226,7 +226,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', cg.WORKITEM)] + )[lp.Op(np.float64, 'mul', CG.WORKITEM)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -267,19 +267,19 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -288,11 +288,11 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + 
count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -326,11 +326,11 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -338,7 +338,7 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -428,19 +428,19 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ 
-449,11 +449,11 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -461,7 +461,7 @@ def test_mem_access_counter_specialops(): assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -500,19 +500,19 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -520,11 +520,11 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, 
direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -565,25 +565,25 @@ def test_mem_access_counter_mixed(): subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -609,12 +609,12 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -654,22 +654,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), 
direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -677,12 +677,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -693,13 +693,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -707,7 +707,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', 
variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -745,30 +745,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -777,9 +777,9 @@ def test_mem_access_counter_consec(): def test_count_granularity_val_checks(): try: - lp.MemAccess(count_granularity=cg.WORKITEM) - lp.MemAccess(count_granularity=cg.SUBGROUP) - lp.MemAccess(count_granularity=cg.GROUP) + 
lp.MemAccess(count_granularity=CG.WORKITEM) + lp.MemAccess(count_granularity=CG.SUBGROUP) + lp.MemAccess(count_granularity=CG.GROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -788,9 +788,9 @@ def test_count_granularity_val_checks(): assert True try: - lp.Op(count_granularity=cg.WORKITEM) - lp.Op(count_granularity=cg.SUBGROUP) - lp.Op(count_granularity=cg.GROUP) + lp.Op(count_granularity=CG.WORKITEM) + lp.Op(count_granularity=CG.SUBGROUP) + lp.Op(count_granularity=CG.GROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -877,16 +877,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', cg.WORKITEM) + lp.Op(np.float32, 'mul', CG.WORKITEM) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', cg.WORKITEM) + lp.Op(np.float32, 'add', CG.WORKITEM) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', cg.WORKITEM) + lp.Op(np.int32, 'add', CG.WORKITEM) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', cg.WORKITEM) + lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM) ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -896,11 +896,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -908,7 +908,7 @@ def test_all_counters_parallel_matmul(): f32coal = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) 
assert f32coal == n*ell @@ -918,7 +918,7 @@ def test_all_counters_parallel_matmul(): subgroup_size=32).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -985,24 +985,24 @@ def test_summations_and_filters(): subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group -- GitLab From d1a587c3d0ee6e922008a8afdb8c2dda10460ba6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:33:36 -0600 Subject: [PATCH 47/59] added CountGranularity to __all__ --- loopy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index b6a72c021..a09fdd184 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -239,8 +239,8 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess", - "get_op_poly", "get_op_map", 
"get_lmem_access_poly", + "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", + "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", "get_synchronization_poly", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", -- GitLab From 4af3e4bed1fd113afce903152c592b6968a39535 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:34:30 -0600 Subject: [PATCH 48/59] passing subgroup_size (now required) in test_gnuma_horiz_kernel --- test/test_numa_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index eff3dbd0e..d30a81e84 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -229,7 +229,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa print(lp.stringify_stats_mapping(op_map)) print("MEM") - gmem_map = lp.get_mem_access_map(hsv).to_bytes() + gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) hsv = lp.set_options(hsv, cl_build_options=[ -- GitLab From b03a9222826f0dd9b58aaa3b91fe7f4bb55d240a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:12:01 -0600 Subject: [PATCH 49/59] defined work-item, sub-group, and group --- loopy/statistics.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2b5e3876e..d9bbd4b24 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,12 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. 
A work-item is a single + instance of computation executing on a single processor (think 'thread'), + a collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the group sharing local memory. A sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. """ @@ -1320,10 +1325,15 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. This - is used, e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. The default - sub-group_size is 32. + :arg subgroup_size: A :class:`int` that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within a + work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., + when counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to None an attempt to find + the sub-group size using the device will be made. A :class:`string` 'guess' + may also be passed as the subgroup_size, in which case get_mem_access_map + will attempt to find the sub-group sizeusing the device and, if + unsuccessful, will make a wild guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
-- GitLab From b50249339cfde1279c5c98afbbe376a8c9df3408 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:35:37 -0600 Subject: [PATCH 50/59] made docstring indentation more consistent --- loopy/statistics.py | 133 ++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 66 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d9bbd4b24..a63ee41ad 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -208,13 +208,13 @@ class ToCountMap(object): def filter_by(self, **kwargs): """Remove items without specified key fields. - :arg kwargs: Keyword arguments matching fields in the keys of - the :class:`ToCountMap`, each given a list of - allowable values for that key field. + :arg kwargs: Keyword arguments matching fields in the keys of the + :class:`ToCountMap`, each given a list of allowable values for that + key field. :return: A :class:`ToCountMap` containing the subset of the items in - the original :class:`ToCountMap` that match the field values - passed. + the original :class:`ToCountMap` that match the field values + passed. Example usage:: @@ -256,11 +256,11 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :arg func: A function that takes a map key a parameter and - returns a :class:`bool`. + :arg func: A function that takes a map key a parameter and returns a + :class:`bool`. - :arg: A :class:`ToCountMap` containing the subset of the items in - the original :class:`ToCountMap` for which func(key) is true. + :arg: A :class:`ToCountMap` containing the subset of the items in the + original :class:`ToCountMap` for which func(key) is true. Example usage:: @@ -289,13 +289,13 @@ class ToCountMap(object): def group_by(self, *args): """Group map items together, distinguishing by only the key fields - passed in args. + passed in args. :arg args: Zero or more :class:`str` fields of map keys. 
- :return: A :class:`ToCountMap` containing the same total counts - grouped together by new keys that only contain the fields - specified in the arguments passed. + :return: A :class:`ToCountMap` containing the same total counts grouped + together by new keys that only contain the fields specified in the + arguments passed. Example usage:: @@ -363,8 +363,8 @@ class ToCountMap(object): """Convert counts to bytes using data type in map key. :return: A :class:`ToCountMap` mapping each original key to a - :class:`islpy.PwQPolynomial` with counts in bytes rather than - instances. + :class:`islpy.PwQPolynomial` with counts in bytes rather than + instances. Example usage:: @@ -404,8 +404,8 @@ class ToCountMap(object): def sum(self): """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of - counts. + :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the + sum of counts. """ @@ -431,7 +431,7 @@ class ToCountMap(object): parameter dict. :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. + :class:`ToCountMap` evaluated with the parameters provided. Example usage:: @@ -502,11 +502,12 @@ class Op(Record): A :class:`str` that specifies whether this operation should be counted once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think 'thread'), - a collection of which may be grouped together into a work-group. Each - work-group executes on a single compute unit with all work-items within - the group sharing local memory. A sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + instance of computation executing on a single processor (think + 'thread'), a collection of which may be grouped together into a + work-group. 
Each work-group executes on a single compute unit with all + work-items within the group sharing local memory. A sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. """ @@ -555,8 +556,8 @@ class MemAccess(Record): .. attribute:: stride - An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all work-items access the same item). + An :class:`int` that specifies stride of the memory access. A stride of + 0 indicates a uniform access (i.e. all work-items access the same item). .. attribute:: direction @@ -1229,15 +1230,15 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. - (Likely desirable for performance modeling, but undesirable for - code optimization.) + (Likely desirable for performance modeling, but undesirable for code + optimization.) :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1315,9 +1316,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. 
+ :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` @@ -1325,25 +1326,26 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. An OpenCL - sub-group is an implementation-dependent grouping of work-items within a - work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., - when counting a :class:`MemAccess` whose count_granularity specifies that it - should only be counted once per sub-group. If set to None an attempt to find - the sub-group size using the device will be made. A :class:`string` 'guess' - may also be passed as the subgroup_size, in which case get_mem_access_map - will attempt to find the sub-group sizeusing the device and, if - unsuccessful, will make a wild guess. + :arg subgroup_size: A :class:`int` that specifies the sub-group size. An + OpenCL sub-group is an implementation-dependent grouping of work-items + within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is + used, e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + None an attempt to find the sub-group size using the device will be + made. A :class:`string` 'guess' may also be passed as the + subgroup_size, in which case get_mem_access_map will attempt to find + the sub-group sizeusing the device and, if unsuccessful, will make a + wild guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. - - The :class:`MemAccess` specifies the characteristics of the - memory access. 
+ - The :class:`MemAccess` specifies the characteristics of the memory + access. - - The :class:`islpy.PwQPolynomial` holds the number of memory - accesses with the characteristics specified in the key (in terms - of the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1547,17 +1549,16 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl, subgroup_size=None): - """Count the number of synchronization events each work-item encounters in a - loopy kernel. + """Count the number of synchronization events each work-item encounters in + a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of events per - work-item. + :class:`islpy.PwQPolynomial` holding the number of events per work-item. - Possible keys include ``barrier_local``, ``barrier_global`` - (if supported by the target) and ``kernel_launch``. + Possible keys include ``barrier_local``, ``barrier_global`` + (if supported by the target) and ``kernel_launch``. Example usage:: @@ -1626,14 +1627,14 @@ def get_synchronization_map(knl, subgroup_size=None): # {{{ gather_access_footprints def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` - to :class:`islpy.Set` instances capturing which indices - of each the array *var_name* are read/written (where - *direction* is either ``read`` or ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for - accesses on which the footprint cannot be determined (e.g. 
- data-dependent or nonlinear indices) + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. + + :arg ignore_uncountable: If *False*, an error will be raised for accesses + on which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1685,9 +1686,9 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): read/written (where *direction* is either ``read`` or ``write`` on array *var_name* - :arg ignore_uncountable: If *True*, an error will be raised for - accesses on which the footprint cannot be determined (e.g. - data-dependent or nonlinear indices) + :arg ignore_uncountable: If *True*, an error will be raised for accesses on + which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1763,8 +1764,8 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each work-item encounters in a - loopy kernel. + """Count the number of synchronization events each work-item encounters in + a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. -- GitLab From 8406772c33b86457a5a72ddbde71de890d404fbe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:40:54 -0600 Subject: [PATCH 51/59] more subgroup_size and count_granularity doc --- loopy/statistics.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a63ee41ad..0607a769e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -572,7 +572,13 @@ class MemAccess(Record): .. 
attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. + once per *work-item*, *sub-group*, or *group*. A work-item is a single + instance of computation executing on a single processor (think + 'thread'), a collection of which may be grouped together into a + work-group. Each work-group executes on a single compute unit with all + work-items within the group sharing local memory. A sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. """ @@ -1240,6 +1246,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg subgroup_size: (currently unused) A :class:`int` that specifies the + sub-group size. An OpenCL sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA + warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` + whose count_granularity specifies that it should only be counted once + per sub-group. + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1554,6 +1567,13 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + :arg subgroup_size: (currently unused) A :class:`int` that specifies the + sub-group size. An OpenCL sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA + warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` + whose count_granularity specifies that it should only be counted once + per sub-group. + :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per work-item. 
-- GitLab From 4243a56391ecd42280a318c10f5fb19fc88fa9be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:55:38 -0600 Subject: [PATCH 52/59] renamed group->work-group to match opencl terminology --- loopy/statistics.py | 42 +++++++-------- test/test_statistics.py | 112 ++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 77 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0607a769e..ed21dd045 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -460,7 +460,7 @@ def stringify_stats_mapping(m): class CountGranularity: """Strings specifying whether an operation should be counted once per - *work-item*, *sub-group*, or *group*. + *work-item*, *sub-group*, or *work-group*. .. attribute :: WORKITEM @@ -472,15 +472,15 @@ class CountGranularity: A :class:`str` that specifies that an operation should be counted once per *sub-group*. - .. attribute :: GROUP + .. attribute :: WORKGROUP A :class:`str` that specifies that an operation should be counted - once per *group*. + once per *work-group*. """ WORKITEM = "workitem" SUBGROUP = "subgroup" - GROUP = "group" + WORKGROUP = "workgroup" # {{{ Op descriptor @@ -501,11 +501,11 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think + once per *work-item*, *sub-group*, or *work-group*. A work-item is a + single instance of computation executing on a single processor (think 'thread'), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all - work-items within the group sharing local memory. A sub-group is an + work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -513,7 +513,7 @@ class Op(Record): count_granularity_options = [CountGranularity.WORKITEM, CountGranularity.SUBGROUP, - CountGranularity.GROUP, + CountGranularity.WORKGROUP, None] def __init__(self, dtype=None, name=None, count_granularity=None): @@ -572,11 +572,11 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think + once per *work-item*, *sub-group*, or *work-group*. A work-item is a + single instance of computation executing on a single processor (think 'thread'), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all - work-items within the group sharing local memory. A sub-group is an + work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -584,7 +584,7 @@ class MemAccess(Record): count_granularity_options = [CountGranularity.WORKITEM, CountGranularity.SUBGROUP, - CountGranularity.GROUP, + CountGranularity.WORKGROUP, None] def __init__(self, mtype=None, dtype=None, stride=None, direction=None, @@ -1461,31 +1461,31 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) - if count_granularity == CountGranularity.GROUP: + if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() - group_size = 1 + workgroup_size = 1 if local_size: for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): raise LoopyError("Cannot count insn with %s granularity, " - "group size is not integer: %s" + "work-group size is not integer: %s" % (CountGranularity.SUBGROUP, local_size)) - group_size *= s + workgroup_size *= s warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " - "count_granularity=%s, using upper bound for group size " - "(%d work-items) to compute sub-groups per group. When multiple " - "device programs present, actual sub-group count may be lower." - % (insn_id, CountGranularity.SUBGROUP, group_size)) + "count_granularity=%s, using upper bound for work-group size " + "(%d work-items) to compute sub-groups per work-group. When " + "multiple device programs present, actual sub-group count may be" + "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil - return ct_disregard_local*div_ceil(group_size, subgroup_size) + return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" diff --git a/test/test_statistics.py b/test/test_statistics.py index 7a5d13949..bdc64cf83 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -261,7 +261,7 @@ def test_mem_access_counter_basic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -282,9 +282,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group - assert f64l == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group + assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -295,9 +295,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32s == (n*m*ell)*n_groups*subgroups_per_group - assert f64s == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group + assert f64s == (n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_reduction(): @@ -320,7 +320,7 @@ def test_mem_access_counter_reduction(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = 
div_ceil(group_size, subgroup_size) @@ -333,16 +333,16 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32s == (n*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32s == (n*ell)*n_workgroups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -376,7 +376,7 @@ def test_mem_access_counter_logic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -392,10 +392,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group - assert f64_g_l == (n*m)*n_groups*subgroups_per_group - assert f64_g_s == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group + assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group + assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_specialops(): @@ -422,7 +422,7 @@ def test_mem_access_counter_specialops(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -443,9 +443,9 @@ def 
test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group - assert f64 == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group + assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -456,16 +456,16 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32 == (n*m*ell)*n_groups*subgroups_per_group - assert f64 == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group + assert f64 == (n*m)*n_workgroups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_bitwise(): @@ -494,7 +494,7 @@ def test_mem_access_counter_bitwise(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -515,8 +515,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert i32 == 
(4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', @@ -527,8 +527,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group def test_mem_access_counter_mixed(): @@ -557,7 +557,7 @@ def test_mem_access_counter_mixed(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = div_ceil(ell, group_size_0) + n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -586,9 +586,9 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f64uniform == (2*n*m)*n_groups*subgroups_per_group - assert f32uniform == (m*n)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group + assert f32uniform == (m*n)*n_workgroups*subgroups_per_group expect_fallback = False import islpy as isl @@ -601,9 +601,9 @@ def test_mem_access_counter_mixed(): if expect_fallback: if ell < group_size_0: - assert f32nonconsec == 3*n*m*ell*n_groups + assert f32nonconsec == 3*n*m*ell*n_workgroups else: - assert f32nonconsec == 3*n*m*n_groups*group_size_0 + assert f32nonconsec == 3*n*m*n_workgroups*group_size_0 else: assert f32nonconsec == 3*n*m*ell @@ -617,14 +617,14 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f64uniform == m*n*n_groups*subgroups_per_group + # uniform: 
(count-per-sub-group)*n_workgroups*subgroups_per_group + assert f64uniform == m*n*n_workgroups*subgroups_per_group if expect_fallback: if ell < group_size_0: - assert f32nonconsec == n*m*ell*n_groups + assert f32nonconsec == n*m*ell*n_workgroups else: - assert f32nonconsec == n*m*n_groups*group_size_0 + assert f32nonconsec == n*m*n_workgroups*group_size_0 else: assert f32nonconsec == n*m*ell @@ -779,7 +779,7 @@ def test_count_granularity_val_checks(): try: lp.MemAccess(count_granularity=CG.WORKITEM) lp.MemAccess(count_granularity=CG.SUBGROUP) - lp.MemAccess(count_granularity=CG.GROUP) + lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -790,7 +790,7 @@ def test_count_granularity_val_checks(): try: lp.Op(count_granularity=CG.WORKITEM) lp.Op(count_granularity=CG.SUBGROUP) - lp.Op(count_granularity=CG.GROUP) + lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -977,7 +977,7 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -988,15 +988,15 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert global_stores == (n*m*ell + 
n*m)*n_workgroups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=[CG.SUBGROUP] @@ -1005,9 +1005,9 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group - assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group + assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -1016,9 +1016,9 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32lall == (3*n*m*ell)*n_groups*subgroups_per_group - assert f64lall == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group + assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -1052,8 +1052,8 @@ def test_summations_and_filters(): key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert s1f64l == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert s1f64l == (2*n*m)*n_workgroups*subgroups_per_group def test_strided_footprint(): -- GitLab From f8cf6fcf8025e4412f2327c4d7ece9b055734ffe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 22:50:40 -0600 Subject: [PATCH 53/59] 
added CountGranularity.ALL to list all granularities --- loopy/statistics.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index ed21dd045..5a5f85f65 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -478,9 +478,11 @@ class CountGranularity: once per *work-group*. """ + WORKITEM = "workitem" SUBGROUP = "subgroup" WORKGROUP = "workgroup" + ALL = [WORKITEM, SUBGROUP, WORKGROUP] # {{{ Op descriptor @@ -511,16 +513,11 @@ class Op(Record): """ - count_granularity_options = [CountGranularity.WORKITEM, - CountGranularity.SUBGROUP, - CountGranularity.WORKGROUP, - None] - def __init__(self, dtype=None, name=None, count_granularity=None): - if count_granularity not in self.count_granularity_options: - raise ValueError("Op.__init__: count_granularity '%s' is" + if count_granularity not in CountGranularity.ALL+[None]: + raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" - % (count_granularity, self.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: Record.__init__(self, dtype=dtype, name=name, count_granularity=count_granularity) @@ -582,11 +579,6 @@ class MemAccess(Record): """ - count_granularity_options = [CountGranularity.WORKITEM, - CountGranularity.SUBGROUP, - CountGranularity.WORKGROUP, - None] - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -600,10 +592,10 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - if count_granularity not in self.count_granularity_options: - raise ValueError("Op.__init__: count_granularity '%s' is" + if count_granularity not in CountGranularity.ALL+[None]: + raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. 
count_granularity options: %s" - % (count_granularity, self.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, @@ -1490,7 +1482,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity options: %s" - % (count_granularity, MemAccess.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) -- GitLab From 98d7b5464d6b8c0d693c6b583e1c21ea860f54f6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 22:59:47 -0600 Subject: [PATCH 54/59] using enum instead of string for count granularity in tutorial --- doc/tutorial.rst | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 52d6e169c..4efc13de4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1578,12 +1578,13 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'workitem')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(param_dict) + >>> from loopy.statistics import CountGranularity as CG + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1673,13 +1674,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'subgroup') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'subgroup') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'subgroup') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', CG.SUBGROUP) ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'subgroup') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1754,13 +1755,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'workitem') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'workitem') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'workitem') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'workitem') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1794,13 +1795,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'workitem') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'workitem') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', CG.WORKITEM) ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'workitem') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'workitem') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab From abf3f04b74dcfae12f2c967bbc38fdf200a27189 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 19:22:01 -0600 Subject: [PATCH 55/59] reorganized subgroup_size processing/guessing for clarity --- loopy/statistics.py | 66 ++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5a5f85f65..2f87734b0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1312,6 +1312,20 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, # }}} +def _find_subgroup_size_for_knl(knl): + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: + from pyopencl.characterize import get_simd_group_size + subgroup_size_guess = get_simd_group_size(knl.target.device, None) + warn_with_kernel(knl, "getting_subgroup_size_from_device", + "Device: %s. 
Using sub-group size given by " + "pyopencl.characterize.get_simd_group_size(): %d" + % (knl.target.device, subgroup_size_guess)) + return subgroup_size_guess + else: + return None + + # {{{ get_mem_access_map def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, @@ -1399,33 +1413,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if not isinstance(subgroup_size, int): # try to find subgroup_size - from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: - from pyopencl.characterize import get_simd_group_size - subgroup_size_guess = get_simd_group_size(knl.target.device, None) - warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", - "subgroup_size passed: %s. Device: %s. Using " - "sub-group size given by get_simd_group_size(): %d" - % (subgroup_size, knl.target.device, - subgroup_size_guess)) - subgroup_size = subgroup_size_guess - elif subgroup_size == 'guess': - # unable to get subgroup_size from device, so guess - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size passed, " - "no target device found, wildly guessing that " - "sub-group size is %d." - % (subgroup_size)) + subgroup_size_guess = _find_subgroup_size_for_knl(knl) if subgroup_size is None: - # 'guess' was not passed and either no target device found - # or get_simd_group_size returned None - raise ValueError("No sub-group size passed and no target device found. " - "Either (1) pass integer value for subgroup_size, " - "(2) ensure that kernel.target is PyOpenClTarget " - "and kernel.target.device is set, or (3) pass " - "subgroup_size='guess' and hope for the best.") + if subgroup_size_guess is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed and no target device found. 
" + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") + else: + subgroup_size = subgroup_size_guess + + elif subgroup_size == 'guess': + if subgroup_size_guess is None: + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", + "get_mem_access_map: 'guess' sub-group size passed, " + "no target device found, wildly guessing that " + "sub-group size is %d." + % (subgroup_size)) + else: + subgroup_size = subgroup_size_guess + else: + raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " + "must be integer, 'guess', or, if you're feeling " + "lucky, None." % (subgroup_size)) class CacheHolder(object): pass -- GitLab From 509692c616a68797ab0eef9be3df604ccade5e22 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 19:36:53 -0600 Subject: [PATCH 56/59] fixing flake8 issues --- loopy/statistics.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2f87734b0..f9f068fd3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1419,7 +1419,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if subgroup_size_guess is None: # 'guess' was not passed and either no target device found # or get_simd_group_size returned None - raise ValueError("No sub-group size passed and no target device found. " + raise ValueError("No sub-group size passed, no target device found. 
" "Either (1) pass integer value for subgroup_size, " "(2) ensure that kernel.target is PyOpenClTarget " "and kernel.target.device is set, or (3) pass " @@ -1432,10 +1432,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # unable to get subgroup_size from device, so guess subgroup_size = 32 warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size passed, " - "no target device found, wildly guessing that " - "sub-group size is %d." - % (subgroup_size)) + "get_mem_access_map: 'guess' sub-group size " + "passed, no target device found, wildly guessing " + "that sub-group size is %d." % (subgroup_size)) else: subgroup_size = subgroup_size_guess else: -- GitLab From 309775a7bbdc6f250d54f27ec157e5e3721af9a8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 20:07:33 -0600 Subject: [PATCH 57/59] updated subgroup_size explanation slightly --- loopy/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f9f068fd3..3d44826bb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1351,10 +1351,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to None an attempt to find the sub-group size using the device will be - made. A :class:`string` 'guess' may also be passed as the - subgroup_size, in which case get_mem_access_map will attempt to find - the sub-group sizeusing the device and, if unsuccessful, will make a - wild guess. + made, if this fails an error will be raised. A :class:`string` 'guess' + may also be passed as the subgroup_size, in which case + get_mem_access_map will attempt to find the sub-group size using the + device and, if unsuccessful, will make a wild guess. 
:return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. -- GitLab From d00bfdeeae0b0ffc17bc5dd61efd68370da16984 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 23 Feb 2018 20:33:47 -0600 Subject: [PATCH 58/59] improved docs --- loopy/statistics.py | 97 ++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3d44826bb..31cc94e73 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -362,7 +362,7 @@ class ToCountMap(object): def to_bytes(self): """Convert counts to bytes using data type in map key. - :return: A :class:`ToCountMap` mapping each original key to a + :return: A :class:`ToCountMap` mapping each original key to an :class:`islpy.PwQPolynomial` with counts in bytes rather than instances. @@ -404,7 +404,7 @@ class ToCountMap(object): def sum(self): """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the + :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of counts. """ @@ -462,17 +462,17 @@ class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. - .. attribute :: WORKITEM + .. attribute:: WORKITEM A :class:`str` that specifies that an operation should be counted once per *work-item*. - .. attribute :: SUBGROUP + .. attribute:: SUBGROUP A :class:`str` that specifies that an operation should be counted once per *sub-group*. - .. attribute :: WORKGROUP + .. attribute:: WORKGROUP A :class:`str` that specifies that an operation should be counted once per *work-group*. @@ -503,11 +503,13 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *work-group*. 
A work-item is a - single instance of computation executing on a single processor (think - 'thread'), a collection of which may be grouped together into a - work-group. Each work-group executes on a single compute unit with all - work-items within the work-group sharing local memory. A sub-group is an + once per *work-item*, *sub-group*, or *work-group*. The granularities + allowed can be found in :class:`CountGranularity`, and may be accessed, + e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance + of computation executing on a single processor (think 'thread'), a + collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. @@ -569,11 +571,13 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *work-group*. A work-item is a - single instance of computation executing on a single processor (think - 'thread'), a collection of which may be grouped together into a - work-group. Each work-group executes on a single compute unit with all - work-items within the work-group sharing local memory. A sub-group is an + once per *work-item*, *sub-group*, or *work-group*. The granularities + allowed can be found in :class:`CountGranularity`, and may be accessed, + e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance + of computation executing on a single processor (think 'thread'), a + collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -1238,12 +1242,17 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: (currently unused) A :class:`int` that specifies the - sub-group size. An OpenCL sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA - warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` - whose count_granularity specifies that it should only be counted once - per sub-group. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within + a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + *None* an attempt to find the sub-group size using the device will be + made, if this fails an error will be raised. If a :class:`string` + ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1345,16 +1354,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. An - OpenCL sub-group is an implementation-dependent grouping of work-items - within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is - used, e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. 
If set to - None an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. A :class:`string` 'guess' - may also be passed as the subgroup_size, in which case - get_mem_access_map will attempt to find the sub-group size using the - device and, if unsuccessful, will make a wild guess. + :arg subgroup_size: An :class:`int`, :class:`string` ``'guess'``, or + *None* that specifies the sub-group size. An OpenCL sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None* an attempt + to find the sub-group size using the device will be made, if this fails + an error will be raised. If a :class:`string` ``'guess'`` is passed as + the subgroup_size, get_mem_access_map will attempt to find the + sub-group size using the device and, if unsuccessful, will make a wild + guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1574,15 +1584,21 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :arg subgroup_size: (currently unused) A :class:`int` that specifies the - sub-group size. An OpenCL sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA - warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` - whose count_granularity specifies that it should only be counted once - per sub-group. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within + a work-group, analagous to an NVIDIA CUDA warp. 
subgroup_size is used, + e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + *None* an attempt to find the sub-group size using the device will be + made, if this fails an error will be raised. If a :class:`string` + ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. - :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of events per work-item. + :return: A dictionary mapping each type of synchronization event to an + :class:`islpy.PwQPolynomial` holding the number of events per + work-item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1794,7 +1810,8 @@ def get_synchronization_poly(knl): """Count the number of synchronization events each work-item encounters in a loopy kernel. - get_synchronization_poly is deprecated. Use get_synchronization_map instead. + get_synchronization_poly is deprecated. Use get_synchronization_map + instead. """ warn_with_kernel(knl, "deprecated_get_synchronization_poly", -- GitLab From 68a108ba3057c26db083cc10748fd613765e1271 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 24 Feb 2018 01:01:19 -0600 Subject: [PATCH 59/59] string->str in docs --- loopy/statistics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 31cc94e73..17c5bd355 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1242,14 +1242,14 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) 
- :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. If a :class:`string` + made, if this fails an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1354,14 +1354,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: An :class:`int`, :class:`string` ``'guess'``, or + :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`string` ``'guess'`` is passed as + an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
@@ -1584,14 +1584,14 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. If a :class:`string` + made, if this fails an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. -- GitLab