From 9dfeff33502b5585105615f860e6105fe3c70a78 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 7 Dec 2016 22:08:21 -0600
Subject: [PATCH 1/8] Remove trailing whitespace

---
 loopy/statistics.py     | 12 ++++-----
 test/test_statistics.py | 58 ++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index f363523b8..157eb70d5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -581,7 +581,7 @@ class ExpressionOpCounter(CombineMapper):
     map_bitwise_and = map_bitwise_or
 
     def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_ops", 
+        warn_with_kernel(self.knl, "summing_if_branches_ops",
                          "ExpressionOpCounter counting ops as sum of "
                          "if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
@@ -652,7 +652,7 @@ class LocalSubscriptCounter(CombineMapper):
             if array.is_local:
                 sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
         return sub_map + self.rec(expr.index)
-            
+
     def map_sum(self, expr):
         if expr.children:
             return sum(self.rec(child) for child in expr.children)
@@ -665,14 +665,14 @@ class LocalSubscriptCounter(CombineMapper):
         return self.rec(expr.left)+self.rec(expr.right)
 
     def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_lsubs", 
+        warn_with_kernel(self.knl, "summing_if_branches_lsubs",
                          "LocalSubscriptCounter counting LMEM accesses as sum "
                          "of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", 
+        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs",
                          "LocalSubscriptCounter counting LMEM accesses as sum "
                          "of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) \
@@ -818,14 +818,14 @@ class GlobalSubscriptCounter(CombineMapper):
     map_product = map_sum
 
     def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_gsubs", 
+        warn_with_kernel(self.knl, "summing_if_branches_gsubs",
                          "GlobalSubscriptCounter counting GMEM accesses as "
                          "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", 
+        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
                          "GlobalSubscriptCounter counting GMEM accesses as "
                          "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) \
diff --git a/test/test_statistics.py b/test/test_statistics.py
index fb502045c..ed592842d 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -61,7 +61,7 @@ def test_op_counter_basic():
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
-    
+
 
 def test_op_counter_reduction():
 
@@ -398,24 +398,24 @@ def test_mem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = mem_map[lp.MemAccess('global', np.int32, 
+    i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='load', variable='g')
                ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), 
+    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = mem_map[lp.MemAccess('global', np.int32, 
+    i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    i32 += mem_map[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
@@ -444,20 +444,20 @@ def test_mem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
+    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64uniform += mem_map[lp.MemAccess('global', np.float64, 
+    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                  stride=0, direction='load', variable='h')
                       ].eval_with_dict(params)
-    f32uniform = mem_map[lp.MemAccess('global', np.float32, 
+    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                 stride=0, direction='load', variable='x')
                      ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                   stride=Variable('m'), direction='load',
                                   variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                         ].eval_with_dict(params)
@@ -465,10 +465,10 @@ def test_mem_access_counter_mixed():
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
+    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                        ].eval_with_dict(params)
@@ -497,30 +497,30 @@ def test_mem_access_counter_nonconsec():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='load',
                                   variable='g')
                        ].eval_with_dict(params)
-    f64nonconsec += mem_map[lp.MemAccess('global', np.float64, 
+    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                                    stride=Variable('m'), direction='load',
                                    variable='h')
                         ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                   stride=Variable('m')*Variable('l'),
                                   direction='load', variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                    stride=Variable('m')*Variable('l'),
                                    direction='load', variable='b')
                         ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='store',
                                   variable='e')
                        ].eval_with_dict(params)
-    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                   stride=Variable('m')*Variable('l'),
                                   direction='store', variable='c')
                        ].eval_with_dict(params)
@@ -552,25 +552,25 @@ def test_mem_access_counter_consec():
     #for k in mem_map:
     #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k])
 
-    f64consec = mem_map[lp.MemAccess('global', np.float64, 
+    f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64consec += mem_map[lp.MemAccess('global', np.float64, 
+    f64consec += mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='h')
                      ].eval_with_dict(params)
-    f32consec = mem_map[lp.MemAccess('global', np.float32, 
+    f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
-    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
+    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = mem_map[lp.MemAccess('global', np.float64, 
+    f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = mem_map[lp.MemAccess('global', np.float32, 
+    f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
@@ -670,16 +670,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_mem_access_map(knl)
 
-    f32coal = op_map[lp.MemAccess('global', np.float32, 
+    f32coal = op_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='load', variable='b')
                             ].eval_with_dict(params)
-    f32coal += op_map[lp.MemAccess('global', np.float32, 
+    f32coal += op_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='load', variable='a')
                             ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
-    f32coal = op_map[lp.MemAccess('global', np.float32, 
+    f32coal = op_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
-- 
GitLab


From e5d2c3ad00460bef313f779a2453ce693e2dbfd0 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Wed, 7 Dec 2016 22:43:36 -0600
Subject: [PATCH 2/8] Fix flagged style problems in loopy/statistics.py

---
 loopy/statistics.py | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 157eb70d5..fde8643bf 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -25,8 +25,6 @@ THE SOFTWARE.
 import six
 
 import loopy as lp
-import numpy as np
-import warnings
 from islpy import dim_type
 import islpy as isl
 from pytools import memoize_in
@@ -319,7 +317,6 @@ class ToCountMap(object):
 
         return result
 
-
     def sum(self):
         """Add all counts in ToCountMap.
 
@@ -335,7 +332,6 @@ class ToCountMap(object):
             total += v
         return total
 
-
     def eval_and_sum(self, params):
         """Add all counts in :class:`ToCountMap` and evaluate with provided
         parameter dict.
@@ -443,7 +439,8 @@ class MemAccess(object):
 
     """
 
-    def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None):
+    def __init__(self, mtype=None, dtype=None, stride=None, direction=None,
+                 variable=None):
         self.mtype = mtype
         self.stride = stride
         self.direction = direction
@@ -501,8 +498,8 @@ class MemAccess(object):
             variable = 'None'
         else:
             variable = self.variable
-        return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \
-               +variable+")"
+        return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \
+               + direction + ", " + variable + ")"
 
 
 # {{{ ExpressionOpCounter
@@ -574,8 +571,8 @@ class ExpressionOpCounter(CombineMapper):
 
     def map_bitwise_or(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'):
-                           len(expr.children)-1}
-                         ) + sum(self.rec(child) for child in expr.children)
+                           len(expr.children)-1}) \
+                                + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
     map_bitwise_and = map_bitwise_or
@@ -596,8 +593,8 @@ class ExpressionOpCounter(CombineMapper):
 
     def map_min(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'):
-                           len(expr.children)-1}
-                         ) + sum(self.rec(child) for child in expr.children)
+                           len(expr.children)-1}) \
+               + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
 
@@ -739,7 +736,7 @@ class GlobalSubscriptCounter(CombineMapper):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+        from loopy.kernel.data import LocalIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
         # find min tag axis
@@ -758,7 +755,7 @@ class GlobalSubscriptCounter(CombineMapper):
             return ToCountMap({MemAccess(mtype='global',
                                          dtype=self.type_inf(expr), stride=0,
                                          variable=name): 1}
-                             ) + self.rec(expr.index)
+                              ) + self.rec(expr.index)
 
         if min_tag_axis != 0:
             warn_with_kernel(self.knl, "unknown_gmem_stride",
@@ -768,7 +765,7 @@ class GlobalSubscriptCounter(CombineMapper):
             return ToCountMap({MemAccess(mtype='global',
                                          dtype=self.type_inf(expr),
                                          stride=sys.maxsize, variable=name): 1}
-                             ) + self.rec(expr.index)
+                              ) + self.rec(expr.index)
 
         # get local_id associated with minimum tag axis
         min_lid = None
@@ -807,7 +804,7 @@ class GlobalSubscriptCounter(CombineMapper):
 
         return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr),
                                      stride=total_stride, variable=name): 1}
-                         ) + self.rec(expr.index)
+                          ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -1203,8 +1200,7 @@ def get_mem_access_map(knl, numpy_types=True):
         if uniform:
             from loopy.kernel.data import LocalIndexTag
             insn_inames = [iname for iname in insn_inames if not
-                           isinstance(
-                           knl.iname_to_tag.get(iname), LocalIndexTag)]
+                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(
                                 insn_inames, [dim_type.set]))
@@ -1227,7 +1223,7 @@ def get_mem_access_map(knl, numpy_types=True):
             subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
                                 stride=key.stride, direction='load',
                                 variable=key.variable)
-                     ] = subs_expr.pop(key)
+                      ] = subs_expr.pop(key)
 
         subs_assignee_g = subs_counter_g(insn.assignee)
         for key in subs_assignee_g.count_map:
@@ -1235,7 +1231,7 @@ def get_mem_access_map(knl, numpy_types=True):
                                       stride=key.stride,
                                       direction='store',
                                       variable=key.variable)
-                           ] = subs_assignee_g.pop(key)
+                            ] = subs_assignee_g.pop(key)
         # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
@@ -1243,7 +1239,9 @@ def get_mem_access_map(knl, numpy_types=True):
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.count_map:
             map = ToCountMap({key: subs_expr[key]})
-            if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
+            if (key.mtype == 'global' and
+                    isinstance(key.stride, int) and
+                    key.stride == 0):
                 subs_map = subs_map \
                             + map*get_insn_count(knl, insn_inames, True)
             else:
@@ -1264,8 +1262,8 @@ def get_mem_access_map(knl, numpy_types=True):
                                              dtype=mem_access.dtype.numpy_dtype,
                                              stride=mem_access.stride,
                                              direction=mem_access.direction,
-                                             variable=mem_access.variable)
-                                   , count)
+                                             variable=mem_access.variable),
+                                  count)
                       for mem_access, count in six.iteritems(subs_map.count_map))
 
     return subs_map
-- 
GitLab


From 9a19d4f8059e9676b37bc99aad1c2fa192a69b75 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Wed, 7 Dec 2016 22:52:06 -0600
Subject: [PATCH 3/8] Fix flagged style problems in test/test_statistics.py

---
 test/test_statistics.py | 128 ++++++++++++++++++++--------------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/test/test_statistics.py b/test/test_statistics.py
index ed592842d..13f0474e8 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -33,6 +33,7 @@ import numpy as np
 
 from pymbolic.primitives import Variable
 
+
 def test_op_counter_basic():
 
     knl = lp.make_kernel(
@@ -235,25 +236,25 @@ def test_mem_access_counter_basic():
     params = {'n': n, 'm': m, 'l': l}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
-               ].eval_with_dict(params)
+                    ].eval_with_dict(params)
     f64l = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     f64l += mem_map[lp.MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
-               ].eval_with_dict(params)
+                    ].eval_with_dict(params)
     assert f32l == 3*n*m*l
     assert f64l == 2*n*m
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     assert f32s == n*m*l
     assert f64s == n*m
 
@@ -275,21 +276,21 @@ def test_mem_access_counter_reduction():
     params = {'n': n, 'm': m, 'l': l}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
-               ].eval_with_dict(params)
+                    ].eval_with_dict(params)
     assert f32l == 2*n*m*l
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
-              ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     assert f32s == n*l
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
-                             ).to_bytes().eval_and_sum(params)
+                                 ).to_bytes().eval_and_sum(params)
     st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
-                             ).to_bytes().eval_and_sum(params)
+                                 ).to_bytes().eval_and_sum(params)
     assert ld_bytes == 4*f32l
     assert st_bytes == 4*f32s
 
@@ -316,13 +317,13 @@ def test_mem_access_counter_logic():
 
     f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                                        direction='load')
-                         ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                        direction='load')
-                         ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                        direction='store')
-                         ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     assert f32_g_l == 2*n*m
     assert f64_g_l == n*m
     assert f64_g_s == n*m
@@ -349,33 +350,34 @@ def test_mem_access_counter_specialops():
     params = {'n': n, 'm': m, 'l': l}
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     f32 += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='g')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                           stride=0, direction='load', variable='h')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     f64 = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
-    filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g'])
+    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'])
     #tot = lp.eval_and_sum_polys(filtered_map, params)
     tot = filtered_map.eval_and_sum(params)
     assert tot == n*m*l + n*m
 
+
 def test_mem_access_counter_bitwise():
 
     knl = lp.make_kernel(
@@ -400,24 +402,24 @@ def test_mem_access_counter_bitwise():
     params = {'n': n, 'm': m, 'l': l}
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='a')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='load', variable='b')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='load', variable='g')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                           stride=0, direction='load', variable='h')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='c')
-              ].eval_with_dict(params)
+                  ].eval_with_dict(params)
     i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='store', variable='e')
-               ].eval_with_dict(params)
+                   ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
 
@@ -446,32 +448,32 @@ def test_mem_access_counter_mixed():
     params = {'n': n, 'm': m, 'l': l}
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='g')
-                     ].eval_with_dict(params)
+                         ].eval_with_dict(params)
     f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                  stride=0, direction='load', variable='h')
-                      ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                 stride=0, direction='load', variable='x')
-                     ].eval_with_dict(params)
+                         ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                   stride=Variable('m'), direction='load',
                                   variable='a')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                    stride=Variable('m'), direction='load',
                                    variable='b')
-                        ].eval_with_dict(params)
+                            ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='store', variable='e')
-                     ].eval_with_dict(params)
+                         ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                   stride=Variable('m'), direction='store',
                                   variable='c')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
 
@@ -500,30 +502,30 @@ def test_mem_access_counter_nonconsec():
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='load',
                                   variable='g')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                                    stride=Variable('m'), direction='load',
                                    variable='h')
-                        ].eval_with_dict(params)
+                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                   stride=Variable('m')*Variable('l'),
                                   direction='load', variable='a')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                    stride=Variable('m')*Variable('l'),
                                    direction='load', variable='b')
-                        ].eval_with_dict(params)
+                            ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='store',
                                   variable='e')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                   stride=Variable('m')*Variable('l'),
                                   direction='store', variable='c')
-                       ].eval_with_dict(params)
+                           ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
 
@@ -549,30 +551,27 @@ def test_mem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    #for k in mem_map:
-    #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k])
-
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='g')
-                     ].eval_with_dict(params)
+                        ].eval_with_dict(params)
     f64consec += mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='h')
-                     ].eval_with_dict(params)
+                         ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='load', variable='a')
-                     ].eval_with_dict(params)
+                        ].eval_with_dict(params)
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
-                     ].eval_with_dict(params)
+                         ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='store', variable='e')
-                     ].eval_with_dict(params)
+                        ].eval_with_dict(params)
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
-                     ].eval_with_dict(params)
+                        ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
 
@@ -671,26 +670,27 @@ def test_all_counters_parallel_matmul():
     op_map = lp.get_mem_access_map(knl)
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
-                        stride=1, direction='load', variable='b')
-                            ].eval_with_dict(params)
+                     stride=1, direction='load', variable='b')
+                     ].eval_with_dict(params)
     f32coal += op_map[lp.MemAccess('global', np.float32,
-                        stride=1, direction='load', variable='a')
-                            ].eval_with_dict(params)
+                      stride=1, direction='load', variable='a')
+                      ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
-                        stride=1, direction='store', variable='c')
-                            ].eval_with_dict(params)
+                     stride=1, direction='store', variable='c')
+                     ].eval_with_dict(params)
 
     assert f32coal == n*l
 
     local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
-                                            direction='load')
-                                 ].eval_with_dict(params)
+                                             direction='load')
+                                ].eval_with_dict(params)
     assert local_mem_l == n*m*l*2
 
+
 def test_gather_access_footprint():
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i,j,k<n}",
@@ -744,25 +744,27 @@ def test_summations_and_filters():
 
     mem_map = lp.get_mem_access_map(knl)
 
-    loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params)
+    loads_a = mem_map.filter_by(direction=['load'], variable=['a']
+                                ).eval_and_sum(params)
     assert loads_a == 2*n*m*l
 
-    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params)
+    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']
+                                      ).eval_and_sum(params)
     assert global_stores == n*m*l + n*m
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
-                             ).to_bytes().eval_and_sum(params)
+                                 ).to_bytes().eval_and_sum(params)
     st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
-                             ).to_bytes().eval_and_sum(params)
+                                 ).to_bytes().eval_and_sum(params)
     assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
     assert st_bytes == 4*n*m*l + 8*n*m
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
     f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
-                         ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
-                         ].eval_with_dict(params)
+                          ].eval_with_dict(params)
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-- 
GitLab


From c0b1ec6cc15e8cb0e6957f8d1418ee9392916f30 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 10 Dec 2016 18:51:14 -0600
Subject: [PATCH 4/8] Add get() method to ToCountMap

---
 loopy/statistics.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index fde8643bf..990744248 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -112,6 +112,9 @@ class ToCountMap(object):
     def __len__(self):
         return len(self.count_map)
 
+    def get(self, key, default=None):
+        return self.count_map.get(key, default)
+
     def items(self):
         return self.count_map.items()
 
-- 
GitLab


From d6e4d1a7c837569dd58e99609b6dff933b8d107e Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 11 Dec 2016 02:16:47 -0600
Subject: [PATCH 5/8] added val_type and eval member function to ToCountMap

---
 loopy/statistics.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 990744248..e644886f3 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -70,6 +70,7 @@ class ToCountMap(object):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
+        self.val_type = isl.PwQPolynomial
 
     def __add__(self, other):
         result = self.count_map.copy()
@@ -101,7 +102,11 @@ class ToCountMap(object):
         try:
             return self.count_map[index]
         except KeyError:
-            return isl.PwQPolynomial('{ 0 }')
+            #TODO what is the best way to handle this?
+            if self.val_type is isl.PwQPolynomial:
+                return isl.PwQPolynomial('{ 0 }')
+            else:
+                return 0
 
     def __setitem__(self, index, value):
         self.count_map[index] = value
@@ -318,23 +323,36 @@ class ToCountMap(object):
             bytes_processed = int(key.dtype.itemsize) * val
             result[key] = bytes_processed
 
+        #TODO again, is this okay?
+        result.val_type = int
+
         return result
 
     def sum(self):
         """Add all counts in ToCountMap.
 
-        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
+        :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of
+                 counts.
 
         """
-        total = isl.PwQPolynomial('{ 0 }')
+
+        if self.val_type is isl.PwQPolynomial:
+            total = isl.PwQPolynomial('{ 0 }')
+        else:
+            total = 0
+
         for k, v in self.items():
-            if not isinstance(v, isl.PwQPolynomial):
-                raise ValueError("ToCountMap: sum() encountered type {0} but "
-                                 "may only be used on PwQPolynomials."
-                                 .format(type(v)))
             total += v
         return total
 
+    #TODO test and document
+    def eval(self, params):
+        result = self.copy()
+        for key, val in self.items():
+            result[key] = val.eval_with_dict(params)
+        result.val_type = int
+        return result
+
     def eval_and_sum(self, params):
         """Add all counts in :class:`ToCountMap` and evaluate with provided
         parameter dict.
-- 
GitLab


From 8d118e18d3769c369361654e4e94f44734df65a3 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 11 Dec 2016 03:33:39 -0600
Subject: [PATCH 6/8] set val_type in places where it was missing

---
 loopy/statistics.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index e644886f3..cb15eb554 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -66,17 +66,17 @@ class ToCountMap(object):
 
     """
 
-    def __init__(self, init_dict=None):
+    def __init__(self, init_dict=None, val_type=isl.PwQPolynomial):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
-        self.val_type = isl.PwQPolynomial
+        self.val_type = val_type
 
     def __add__(self, other):
         result = self.count_map.copy()
         for k, v in six.iteritems(other.count_map):
             result[k] = self.count_map.get(k, 0) + v
-        return ToCountMap(result)
+        return ToCountMap(result, self.val_type)
 
     def __radd__(self, other):
         if other != 0:
@@ -130,7 +130,7 @@ class ToCountMap(object):
         return self.count_map.pop(item)
 
     def copy(self):
-        return ToCountMap(dict(self.count_map))
+        return ToCountMap(dict(self.count_map), self.val_type)
 
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
@@ -157,7 +157,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         from loopy.types import to_loopy_type
         if 'dtype' in kwargs.keys():
@@ -205,7 +205,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # for each item in self.count_map, call func on the key
         for self_key, self_val in self.items():
@@ -260,7 +260,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # make sure all item keys have same type
         if self.count_map:
-- 
GitLab


From ee9811db42098c3c3433e1a203f0e7cfa772e3cc Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 2 May 2017 19:10:35 -0500
Subject: [PATCH 7/8] added assumptions to precompute check

---
 loopy/isl_helpers.py          |  4 ++++
 loopy/transform/precompute.py | 19 +++++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 0ebe90fbc..36fbb49f4 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -594,6 +594,10 @@ def get_simple_strides(bset, key_by="name"):
     """
     result = {}
 
+    comp_div_set_pieces = convexify(bset.compute_divs()).get_basic_sets()
+    assert len(comp_div_set_pieces) == 1
+    bset, = comp_div_set_pieces
+
     lspace = bset.get_local_space()
     for idiv in range(lspace.dim(dim_type.div)):
         div = lspace.get_div(idiv)
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index a19e06ecd..5b208d0a4 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -681,12 +681,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                 mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)
 
+        def add_assumptions(d):
+            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
+            assumptions, domain = isl.align_two(assumption_non_param, d)
+            return d & assumptions
+
         # {{{ check that we got the desired domain
 
-        check_domain = check_domain.project_out_except(
-                primed_non1_saxis_names, [isl.dim_type.set])
+        check_domain = add_assumptions(
+            check_domain.project_out_except(
+                primed_non1_saxis_names, [isl.dim_type.set]))
 
-        mod_check_domain = mod_domain
+        mod_check_domain = add_assumptions(mod_domain)
 
         # re-add the prime from the new variable
         var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)
@@ -716,10 +722,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         # project out the new names from the modified domain
         orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
-        mod_check_domain = mod_domain.project_out_except(
-                orig_domain_inames, [isl.dim_type.set])
+        mod_check_domain = add_assumptions(
+                mod_domain.project_out_except(
+                    orig_domain_inames, [isl.dim_type.set]))
 
-        check_domain = domch.domain
+        check_domain = add_assumptions(domch.domain)
 
         mod_check_domain, check_domain = isl.align_two(
                 mod_check_domain, check_domain)
-- 
GitLab


From 6bb1e04e9d5328d6e35dbbfc8c62c8be54b74286 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 11 May 2017 13:28:08 -0500
Subject: [PATCH 8/8] Fix precompute add_assumptions to intersect the aligned domain

---
 loopy/transform/precompute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 5b208d0a4..6077332c4 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -684,7 +684,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         def add_assumptions(d):
             assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
             assumptions, domain = isl.align_two(assumption_non_param, d)
-            return d & assumptions
+            return assumptions & domain
 
         # {{{ check that we got the desired domain
 
-- 
GitLab