From 561dbbbb590bbb731bb93ac0b00f4c0d2a6fdef6 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 2 Apr 2018 14:16:04 -0500
Subject: [PATCH 1/3] collecting global id strides in MemAccessCounter, updated
 tests and tutorial

---
 doc/tutorial.rst        |  88 +++++++-------
 loopy/statistics.py     | 103 +++++++++++-----
 test/test_statistics.py | 256 +++++++++++++++++++++++++---------------
 3 files changed, 283 insertions(+), 164 deletions(-)
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index af8c8281c..5d4e972ee 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
     <BLANKLINE>
 
 Each line of output will look roughly like::
 
 
-    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1661,7 +1661,7 @@ Each line of output will look roughly like::
 - dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
   data type accessed.
 
-- stride: A :class:`dict` of **{** :class:`int` **:**
+- lid_strides: A :class:`dict` of **{** :class:`int` **:**
   :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies
   local strides for each local id in the memory access index. Local ids not
   found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
@@ -1669,6 +1669,11 @@ Each line of output will look roughly like::
   ``lid_strides[0]=0``, but may also occur when no local id 0 is found, in
   which case the 0 key will not be present in lid_strides.
 
+- gid_strides: A :class:`dict` of **{** :class:`int` **:**
+  :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies
+  global strides for each global id in the memory access index. Global ids not
+  found will not be present in ``gid_strides.keys()``.
+
 - direction: A :class:`str` that specifies the direction of memory access as
   **load** or **store**.
 
@@ -1679,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, 'load', 'g', CG.SUBGROUP)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, 'store', 'e', CG.SUBGROUP)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, 'load', 'a', CG.SUBGROUP)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, 'store', 'c', CG.SUBGROUP)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1703,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None, None) : ...
-    MemAccess(None, None, None, store, None, None) : ...
+    MemAccess(None, None, None, None, load, None, None) : ...
+    MemAccess(None, None, None, None, store, None, None) : ...
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1721,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
 The lines of output above might look like::
 
-    MemAccess(global, np:dtype('float32'), {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
 
 One can see how these functions might be useful in computing, for example,
 achieved memory bandwidth in byte/sec or performance in FLOP/sec.
@@ -1735,9 +1740,10 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec.
 
 Since we have not tagged any of the inames or parallelized the kernel across
 work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map`
-finds no local id strides, leaving ``lid_strides`` empty for each memory access.
-Now we'll parallelize the kernel and count the array accesses again. The
-resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
+finds no local or global id strides, leaving ``lid_strides`` and ``gid_strides``
+empty for each memory access. Now we'll parallelize the kernel and count the array
+accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
+this time.
 
 .. doctest::
 
@@ -1745,12 +1751,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access consecutive array
@@ -1760,13 +1766,13 @@ array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1786,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access *nonconsecutive*
@@ -1800,13 +1806,13 @@ changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5e929b618..7cb70026d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -564,6 +564,13 @@ class MemAccess(Record):
        when no local id 0 is found, in which case the 0 key will not be
        present in lid_strides.
 
+    .. attribute:: gid_strides
+
+       A :class:`dict` of **{** :class:`int` **:**
+       :class:`pymbolic.primitives.Expression` or :class:`int` **}** that
+       specifies global strides for each global id in the memory access index.
+       Global ids not found will not be present in ``gid_strides.keys()``.
+
     .. attribute:: direction
 
        A :class:`str` that specifies the direction of memory access as
@@ -589,14 +596,19 @@ class MemAccess(Record):
 
     """
 
-    def __init__(self, mtype=None, dtype=None, lid_strides=None, direction=None,
-                 variable=None, count_granularity=None):
+    def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None,
+                 direction=None, variable=None, count_granularity=None):
 
         #TODO currently giving all lmem access lid_strides=None
         if mtype == 'local' and lid_strides is not None:
             raise NotImplementedError("MemAccess: lid_strides must be None when "
                                       "mtype is 'local'")
 
+        #TODO currently giving all lmem access gid_strides=None
+        if mtype == 'local' and gid_strides is not None:
+            raise NotImplementedError("MemAccess: gid_strides must be None when "
+                                      "mtype is 'local'")
+
         #TODO currently giving all lmem access variable=None
         if (mtype == 'local') and (variable is not None):
             raise NotImplementedError("MemAccess: variable must be None when "
@@ -609,25 +621,29 @@ class MemAccess(Record):
 
         if dtype is None:
             Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides,
-                            direction=direction, variable=variable,
-                            count_granularity=count_granularity)
+                            gid_strides=gid_strides, direction=direction,
+                            variable=variable, count_granularity=count_granularity)
         else:
             from loopy.types import to_loopy_type
             Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype),
-                            lid_strides=lid_strides, direction=direction,
-                            variable=variable, count_granularity=count_granularity)
+                            lid_strides=lid_strides, gid_strides=gid_strides,
+                            direction=direction, variable=variable,
+                            count_granularity=count_granularity)
 
     def __hash__(self):
-        # Note that this means lid_strides must be sorted in self.__repr__()
+        # Note that this means lid_strides and gid_strides must be sorted
+        # in self.__repr__()
         return hash(repr(self))
 
     def __repr__(self):
         # Record.__repr__ overridden for consistent ordering and conciseness
-        return "MemAccess(%s, %s, %s, %s, %s, %s)" % (
+        return "MemAccess(%s, %s, %s, %s, %s, %s, %s)" % (
             self.mtype,
             self.dtype,
             None if self.lid_strides is None else dict(
                 sorted(six.iteritems(self.lid_strides))),
+            None if self.gid_strides is None else dict(
+                sorted(six.iteritems(self.gid_strides))),
             self.direction,
             self.variable,
             self.count_granularity)
@@ -879,7 +895,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
 
         return ToCountMap({MemAccess(mtype='global',
                                      dtype=self.type_inf(expr), lid_strides={},
-                                     variable=name,
+                                     gid_strides={}, variable=name,
                                      count_granularity=CountGranularity.WORKITEM): 1}
                           ) + self.rec(expr.index)
 
@@ -901,34 +917,18 @@ class GlobalMemAccessCounter(MemAccessCounter):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
-        # find all local index tags and corresponding inames
+        # find all local and global index tags and corresponding inames
         lid_to_iname = {}
+        gid_to_iname = {}
         for iname in my_inames:
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 lid_to_iname[tag.axis] = iname
-
-        if not lid_to_iname:
-
-            # no local id found, count as uniform access
-            # Note, a few different cases may be considered uniform:
-            # lid_strides={} if no local ids were found,
-            # lid_strides={1:1, 2:32} if no local id 0 was found,
-            # lid_strides={0:0, ...} if a local id 0 is found and its stride is 0
-            warn_with_kernel(self.knl, "no_lid_found",
-                             "GlobalSubscriptCounter: No local id found, "
-                             "setting lid_strides to {}. Expression: %s"
-                             % (expr))
-
-            return ToCountMap({MemAccess(
-                                mtype='global',
-                                dtype=self.type_inf(expr), lid_strides={},
-                                variable=name,
-                                count_granularity=CountGranularity.SUBGROUP): 1}
-                              ) + self.rec(expr.index)
+            elif isinstance(tag, GroupIndexTag):
+                gid_to_iname[tag.axis] = iname
 
         # create lid_strides dict (strides are coefficents in flattened index)
         # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C
@@ -939,7 +939,6 @@ class GlobalMemAccessCounter(MemAccessCounter):
         from pymbolic.primitives import Variable
 
         lid_strides = {}
-
         for ltag, iname in six.iteritems(lid_to_iname):
             ltag_stride = 0
             # check coefficient of this lid for each axis
@@ -971,6 +970,42 @@ class GlobalMemAccessCounter(MemAccessCounter):
                 ltag_stride += stride*coeff_lid
             lid_strides[ltag] = ltag_stride
 
+        # create gid_strides dict (strides are coefficients in flattened index)
+        # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C
+        # come from flattened index [... + C*gid2 + B*gid1 + A*gid0]
+
+        gid_strides = {}
+        for gtag, iname in six.iteritems(gid_to_iname):
+            gtag_stride = 0
+            # check coefficient of this gid for each axis
+            for idx, axis_tag in zip(index, array.dim_tags):
+
+                from loopy.symbolic import simplify_using_aff
+                from loopy.diagnostic import ExpressionNotAffineError
+                try:
+                    coeffs = CoefficientCollector()(
+                              simplify_using_aff(self.knl, idx))
+                except ExpressionNotAffineError:
+                    gtag_stride = None
+                    break
+
+                # check if idx contains this gid
+                try:
+                    coeff_gid = coeffs[Variable(gid_to_iname[gtag])]
+                except KeyError:
+                    # idx does not contain this gid
+                    continue
+
+                # found coefficient of this gid
+                # now determine stride
+                if isinstance(axis_tag, FixedStrideArrayDimTag):
+                    stride = axis_tag.stride
+                else:
+                    continue
+
+                gtag_stride += stride*coeff_gid
+            gid_strides[gtag] = gtag_stride
+
         count_granularity = CountGranularity.WORKITEM if (
                                 0 in lid_strides and lid_strides[0] != 0
                                 ) else CountGranularity.SUBGROUP
@@ -979,6 +1014,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
                             mtype='global',
                             dtype=self.type_inf(expr),
                             lid_strides=dict(sorted(six.iteritems(lid_strides))),
+                            gid_strides=dict(sorted(six.iteritems(gid_strides))),
                             variable=name,
                             count_granularity=count_granularity
                             ): 1}
@@ -1390,6 +1426,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                                     mtype='global',
                                     dtype=np.float32,
                                     lid_strides={0: 1},
+                                    gid_strides={0: 256},
                                     direction='load',
                                     variable='a',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1398,6 +1435,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                                     mtype='global',
                                     dtype=np.float32,
                                     lid_strides={0: 1},
+                                    gid_strides={0: 256},
                                     direction='store',
                                     variable='a',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1406,6 +1444,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                                     mtype='local',
                                     dtype=np.float32,
                                     lid_strides={0: 1},
+                                    gid_strides={0: 256},
                                     direction='load',
                                     variable='x',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1414,6 +1453,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                                     mtype='local',
                                     dtype=np.float32,
                                     lid_strides={0: 1},
+                                    gid_strides={0: 256},
                                     direction='store',
                                     variable='x',
                                     count_granularity=CountGranularity.WORKITEM)
@@ -1562,6 +1602,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                             mtype=mem_access.mtype,
                             dtype=mem_access.dtype.numpy_dtype,
                             lid_strides=mem_access.lid_strides,
+                            gid_strides=mem_access.gid_strides,
                             direction=mem_access.direction,
                             variable=mem_access.variable,
                             count_granularity=mem_access.count_granularity),
diff --git a/test/test_statistics.py b/test/test_statistics.py
index e42c43f60..0f57c8f20 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -269,20 +269,24 @@ def test_mem_access_counter_basic():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='a',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='a',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='b',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='b',
+                        count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
-                         lid_strides={}, direction='load', variable='g',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='g',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
-                         lid_strides={}, direction='load', variable='h',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='h',
+                        count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -290,12 +294,14 @@ def test_mem_access_counter_basic():
     assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                         lid_strides={}, direction='store', variable='c',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='c',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         lid_strides={}, direction='store', variable='e',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='e',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -328,20 +334,23 @@ def test_mem_access_counter_reduction():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='a',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='a',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='b',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='b',
+                        count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
     assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                         lid_strides={}, direction='store', variable='c',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='c',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -430,20 +439,24 @@ def test_mem_access_counter_specialops():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='a',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='a',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='load', variable='b',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='b',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         lid_strides={}, direction='load', variable='g',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='g',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
-                         lid_strides={}, direction='load', variable='h',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='h',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -451,12 +464,14 @@ def test_mem_access_counter_specialops():
     assert f64 == (2*n*m)*n_workgroups*subgroups_per_group
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
-                         lid_strides={}, direction='store', variable='c',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='c',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
-                         lid_strides={}, direction='store', variable='e',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='e',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -502,32 +517,38 @@ def test_mem_access_counter_bitwise():
     subgroups_per_group = div_ceil(group_size, subgroup_size)
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
-                         lid_strides={}, direction='load', variable='a',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='a',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         lid_strides={}, direction='load', variable='b',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='b',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         lid_strides={}, direction='load', variable='g',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='g',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
-                         lid_strides={}, direction='load', variable='h',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='load', variable='h',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
     assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
-                         lid_strides={}, direction='store', variable='c',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='c',
+                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
-                         lid_strides={}, direction='store', variable='e',
-                         count_granularity=CG.SUBGROUP)
+                        lid_strides={}, gid_strides={},
+                        direction='store', variable='e',
+                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
     # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
@@ -567,24 +588,31 @@ def test_mem_access_counter_mixed():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=subgroup_size)
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={}, direction='load', variable='g',
+                                lid_strides={}, gid_strides={},
+                                direction='load', variable='g',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={}, direction='load', variable='h',
+                                lid_strides={}, gid_strides={},
+                                direction='load', variable='h',
                                 count_granularity=CG.SUBGROUP)
                           ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
-                                lid_strides={}, direction='load', variable='x',
+                                lid_strides={}, gid_strides={},
+                                direction='load', variable='x',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')}, direction='load',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*group_size_0},
+                                direction='load',
                                 variable='a',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')}, direction='load',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*group_size_0},
+                                direction='load',
                                 variable='b',
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
@@ -611,11 +639,14 @@ def test_mem_access_counter_mixed():
         assert f32nonconsec == 3*n*m*ell
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={}, direction='store', variable='e',
+                                lid_strides={}, gid_strides={},
+                                direction='store', variable='e',
                                 count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                lid_strides={0: Variable('m')}, direction='store',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*group_size_0},
+                                direction='store',
                                 variable='c',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
@@ -645,7 +676,8 @@ def test_mem_access_counter_nonconsec():
             name="nonconsec", assumptions="n,m,ell >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    knl = lp.split_iname(knl, "i", 16)
+    lsize0 = 16
+    knl = lp.split_iname(knl, "i", lsize0)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
@@ -655,37 +687,52 @@ def test_mem_access_counter_nonconsec():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')}, direction='load',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*lsize0},
+                                direction='load',
                                 variable='g',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
     f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')}, direction='load',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*lsize0},
+                                direction='load',
                                 variable='h',
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')*Variable('ell')},
-                                direction='load', variable='a',
-                                count_granularity=CG.WORKITEM)
+    f32nonconsec = mem_map[lp.MemAccess(
+                            'global', np.dtype(np.float32),
+                            lid_strides={0: Variable('m')*Variable('ell')},
+                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
+                            direction='load', variable='a',
+                            count_granularity=CG.WORKITEM
+                            )
                            ].eval_with_dict(params)
-    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                lid_strides={0: Variable('m')*Variable('ell')},
-                                direction='load', variable='b',
-                                count_granularity=CG.WORKITEM)
+    f32nonconsec += mem_map[lp.MemAccess(
+                            'global', np.dtype(np.float32),
+                            lid_strides={0: Variable('m')*Variable('ell')},
+                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
+                            direction='load', variable='b',
+                            count_granularity=CG.WORKITEM
+                            )
                             ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*ell
 
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
-                                lid_strides={0: Variable('m')}, direction='store',
+                                lid_strides={0: Variable('m')},
+                                gid_strides={0: Variable('m')*lsize0},
+                                direction='store',
                                 variable='e',
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                lid_strides={0: Variable('m')*Variable('ell')},
-                                direction='store', variable='c',
-                                count_granularity=CG.WORKITEM)
+    f32nonconsec = mem_map[lp.MemAccess(
+                            'global', np.float32,
+                            lid_strides={0: Variable('m')*Variable('ell')},
+                            gid_strides={0: Variable('m')*Variable('ell')*lsize0},
+                            direction='store', variable='c',
+                            count_granularity=CG.WORKITEM
+                            )
                            ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*ell
@@ -694,13 +741,17 @@ def test_mem_access_counter_nonconsec():
                                       subgroup_size=64)
     f64nonconsec = mem_map64[lp.MemAccess(
                     'global',
-                    np.float64, lid_strides={0: Variable('m')},
+                    np.float64,
+                    lid_strides={0: Variable('m')},
+                    gid_strides={0: Variable('m')*lsize0},
                     direction='load', variable='g',
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
     f64nonconsec += mem_map64[lp.MemAccess(
                     'global',
-                    np.float64, lid_strides={0: Variable('m')},
+                    np.float64,
+                    lid_strides={0: Variable('m')},
+                    gid_strides={0: Variable('m')*lsize0},
                     direction='load', variable='h',
                     count_granularity=CG.WORKITEM)
                     ].eval_with_dict(params)
@@ -708,6 +759,7 @@ def test_mem_access_counter_nonconsec():
                     'global',
                     np.dtype(np.float32),
                     lid_strides={0: Variable('m')*Variable('ell')},
+                    gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                     direction='load',
                     variable='a',
                     count_granularity=CG.WORKITEM)
@@ -716,6 +768,7 @@ def test_mem_access_counter_nonconsec():
                     'global',
                     np.dtype(np.float32),
                     lid_strides={0: Variable('m')*Variable('ell')},
+                    gid_strides={0: Variable('m')*Variable('ell')*lsize0},
                     direction='load',
                     variable='b',
                     count_granularity=CG.WORKITEM)
@@ -746,33 +799,48 @@ def test_mem_access_counter_consec():
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
 
-    f64consec = mem_map[lp.MemAccess('global', np.float64,
-                        lid_strides={0: 1}, direction='load', variable='g',
-                        count_granularity=CG.WORKITEM)
-                        ].eval_with_dict(params)
-    f64consec += mem_map[lp.MemAccess('global', np.float64,
-                        lid_strides={0: 1}, direction='load', variable='h',
-                        count_granularity=CG.WORKITEM)
-                         ].eval_with_dict(params)
-    f32consec = mem_map[lp.MemAccess('global', np.float32,
-                        lid_strides={0: 1}, direction='load', variable='a',
-                        count_granularity=CG.WORKITEM)
-                        ].eval_with_dict(params)
-    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                        lid_strides={0: 1}, direction='load', variable='b',
-                        count_granularity=CG.WORKITEM)
-                         ].eval_with_dict(params)
+    f64consec = mem_map[lp.MemAccess(
+                    'global', np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
+                    direction='load', variable='g',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
+    f64consec += mem_map[lp.MemAccess(
+                    'global', np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
+                    direction='load', variable='h',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess(
+                    'global', np.float32,
+                    lid_strides={0: 1},
+                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
+                    direction='load', variable='a',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
+    f32consec += mem_map[lp.MemAccess(
+                    'global', np.dtype(np.float32),
+                    lid_strides={0: 1},
+                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
+                    direction='load', variable='b',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
     assert f64consec == 2*n*m*ell
     assert f32consec == 3*n*m*ell
 
-    f64consec = mem_map[lp.MemAccess('global', np.float64,
-                        lid_strides={0: 1}, direction='store', variable='e',
-                        count_granularity=CG.WORKITEM)
-                        ].eval_with_dict(params)
-    f32consec = mem_map[lp.MemAccess('global', np.float32,
-                        lid_strides={0: 1}, direction='store', variable='c',
-                        count_granularity=CG.WORKITEM)
-                        ].eval_with_dict(params)
+    f64consec = mem_map[lp.MemAccess(
+                    'global', np.float64,
+                    lid_strides={0: 1}, gid_strides={0: Variable('m')},
+                    direction='store', variable='e',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
+    f32consec = mem_map[lp.MemAccess(
+                    'global', np.float32,
+                    lid_strides={0: 1},
+                    gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
+                    direction='store', variable='c',
+                    count_granularity=CG.WORKITEM)
+                    ].eval_with_dict(params)
     assert f64consec == n*m*ell
     assert f32consec == n*m*ell
 
@@ -898,11 +966,14 @@ def test_all_counters_parallel_matmul():
 
     f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={0: 1, 1: Variable('ell')},
+                             gid_strides={1: bsize},
                              direction='load', variable='b',
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
     f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
-                             lid_strides={0: 1, 1: Variable('m')}, direction='load',
+                             lid_strides={0: 1, 1: Variable('m')},
+                             gid_strides={0: Variable('m')*bsize},
+                             direction='load',
                              variable='a', count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
 
@@ -911,6 +982,7 @@ def test_all_counters_parallel_matmul():
 
     f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={0: 1, 1: Variable('ell')},
+                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
                              direction='store', variable='c',
                              count_granularity=CG.WORKITEM)
                              ].eval_with_dict(params)
-- 
GitLab


From 6d8f6f701f6703ee3315fbe1d467c9ab9761f4ca Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 2 Apr 2018 14:21:03 -0500
Subject: [PATCH 2/3] fixing flake8 issues

---
 loopy/statistics.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 7cb70026d..1fe55111c 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -937,6 +937,8 @@ class GlobalMemAccessCounter(MemAccessCounter):
         from loopy.symbolic import CoefficientCollector
         from loopy.kernel.array import FixedStrideArrayDimTag
         from pymbolic.primitives import Variable
+        from loopy.symbolic import simplify_using_aff
+        from loopy.diagnostic import ExpressionNotAffineError
 
         lid_strides = {}
         for ltag, iname in six.iteritems(lid_to_iname):
@@ -944,8 +946,6 @@ class GlobalMemAccessCounter(MemAccessCounter):
             # check coefficient of this lid for each axis
             for idx, axis_tag in zip(index, array.dim_tags):
 
-                from loopy.symbolic import simplify_using_aff
-                from loopy.diagnostic import ExpressionNotAffineError
                 try:
                     coeffs = CoefficientCollector()(
                               simplify_using_aff(self.knl, idx))
@@ -980,8 +980,6 @@ class GlobalMemAccessCounter(MemAccessCounter):
             # check coefficient of this gid for each axis
             for idx, axis_tag in zip(index, array.dim_tags):
 
-                from loopy.symbolic import simplify_using_aff
-                from loopy.diagnostic import ExpressionNotAffineError
                 try:
                     coeffs = CoefficientCollector()(
                               simplify_using_aff(self.knl, idx))
-- 
GitLab


From c6445465d65a7eedf41ecbd04ad5c92c090a6b94 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 2 Apr 2018 18:04:47 -0500
Subject: [PATCH 3/3] function get_iname_strides() eliminating repeated code

---
 loopy/statistics.py | 107 +++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 65 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 1fe55111c..c4f8c9e26 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -930,9 +930,13 @@ class GlobalMemAccessCounter(MemAccessCounter):
             elif isinstance(tag, GroupIndexTag):
                 gid_to_iname[tag.axis] = iname
 
-        # create lid_strides dict (strides are coefficents in flattened index)
-        # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C
-        # come from flattened index [... + C*lid2 + B*lid1 + A*lid0]
+        # create lid_strides and gid_strides dicts
+
+        # strides are coefficients in flattened index, i.e., we want
+        # lid_strides = {0:l0, 1:l1, 2:l2, ...} and
+        # gid_strides = {0:g0, 1:g1, 2:g2, ...},
+        # where l0, l1, l2, g0, g1, and g2 come from flattened index
+        # [... + g2*gid2 + g1*gid1 + g0*gid0 + ... + l2*lid2 + l1*lid1 + l0*lid0]
 
         from loopy.symbolic import CoefficientCollector
         from loopy.kernel.array import FixedStrideArrayDimTag
@@ -940,69 +944,42 @@ class GlobalMemAccessCounter(MemAccessCounter):
         from loopy.symbolic import simplify_using_aff
         from loopy.diagnostic import ExpressionNotAffineError
 
-        lid_strides = {}
-        for ltag, iname in six.iteritems(lid_to_iname):
-            ltag_stride = 0
-            # check coefficient of this lid for each axis
-            for idx, axis_tag in zip(index, array.dim_tags):
-
-                try:
-                    coeffs = CoefficientCollector()(
-                              simplify_using_aff(self.knl, idx))
-                except ExpressionNotAffineError:
-                    ltag_stride = None
-                    break
-
-                # check if idx contains this lid
-                try:
-                    coeff_lid = coeffs[Variable(lid_to_iname[ltag])]
-                except KeyError:
-                    # idx does not contain this lid
-                    continue
-
-                # found coefficient of this lid
-                # now determine stride
-                if isinstance(axis_tag, FixedStrideArrayDimTag):
-                    stride = axis_tag.stride
-                else:
-                    continue
-
-                ltag_stride += stride*coeff_lid
-            lid_strides[ltag] = ltag_stride
-
-        # create gid_strides dict (strides are coefficents in flattened index)
-        # i.e., we want {0:A, 1:B, 2:C, ...} where A, B, & C
-        # come from flattened index [... + C*gid2 + B*gid1 + A*gid0]
-
-        gid_strides = {}
-        for gtag, iname in six.iteritems(gid_to_iname):
-            gtag_stride = 0
-            # check coefficient of this gid for each axis
-            for idx, axis_tag in zip(index, array.dim_tags):
-
-                try:
-                    coeffs = CoefficientCollector()(
-                              simplify_using_aff(self.knl, idx))
-                except ExpressionNotAffineError:
-                    gtag_stride = None
-                    break
-
-                # check if idx contains this gid
-                try:
-                    coeff_gid = coeffs[Variable(gid_to_iname[gtag])]
-                except KeyError:
-                    # idx does not contain this gid
-                    continue
-
-                # found coefficient of this gid
-                # now determine stride
-                if isinstance(axis_tag, FixedStrideArrayDimTag):
-                    stride = axis_tag.stride
-                else:
-                    continue
+        def get_iname_strides(tag_to_iname_dict):
+            tag_to_stride_dict = {}
+            for tag, iname in six.iteritems(tag_to_iname_dict):
+                total_iname_stride = 0
+                # sum this iname's stride contributions over all array axes
+                for idx, axis_tag in zip(index, array.dim_tags):
+                    # collect index coefficients
+                    try:
+                        coeffs = CoefficientCollector()(
+                                  simplify_using_aff(self.knl, idx))
+                    except ExpressionNotAffineError:
+                        total_iname_stride = None
+                        break
+
+                    # check if idx contains this iname
+                    try:
+                        coeff = coeffs[Variable(tag_to_iname_dict[tag])]
+                    except KeyError:
+                        # idx does not contain this iname
+                        continue
+
+                    # found coefficient of this iname
+                    # now determine stride
+                    if isinstance(axis_tag, FixedStrideArrayDimTag):
+                        axis_tag_stride = axis_tag.stride
+                    else:
+                        continue
+
+                    total_iname_stride += axis_tag_stride*coeff
+
+                tag_to_stride_dict[tag] = total_iname_stride
+
+            return tag_to_stride_dict
 
-                gtag_stride += stride*coeff_gid
-            gid_strides[gtag] = gtag_stride
+        lid_strides = get_iname_strides(lid_to_iname)
+        gid_strides = get_iname_strides(gid_to_iname)
 
         count_granularity = CountGranularity.WORKITEM if (
                                 0 in lid_strides and lid_strides[0] != 0
-- 
GitLab