From 0494c442a6b10dce04adc57439b83823253cbd7d Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 12 Mar 2016 18:09:05 -0600
Subject: [PATCH 01/55] replaced 'consecutive', 'uniform', and 'nonconsecutive'
 subscript type strings with DataAccess class

---
 loopy/statistics.py     | 78 +++++++++++++++++++++++------------------
 test/test_statistics.py | 70 +++++++++++++++++++-----------------
 2 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index c273edd54..5faeb12e3 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -105,6 +105,18 @@ def stringify_stats_mapping(m):
     return result
 
 
+class DataAccess:
+
+    def __init__(self, stride=0):
+        self.stride = stride
+
+    def __eq__(self, other):
+        return isinstance(other, DataAccess) and other.stride == self.stride  #TODO is this okay?
+
+    def __hash__(self):
+        return hash(self.stride)
+
+
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CombineMapper):
@@ -277,67 +289,63 @@ class GlobalSubscriptCounter(CombineMapper):
         from loopy.symbolic import get_dependencies
         from loopy.kernel.data import LocalIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
-        local_id0 = None
+
+        # find min tag axis
+        import sys
+        min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
-            # find local id0
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                if tag.axis == 0:
-                    local_id0 = iname
-                    break  # there will be only one local_id0
+                if tag.axis < min_tag_axis:
+                    min_tag_axis = tag.axis
 
         if not local_id_found:
             # count as uniform access
             return ToCountMap(
-                    {(self.type_inf(expr), 'uniform'): 1}
+                    {(self.type_inf(expr), DataAccess(stride=0)): 1}
                     ) + self.rec(expr.index)
 
-        if local_id0 is None:
-            # only non-zero local id(s) found, assume non-consecutive access
-            return ToCountMap(
-                    {(self.type_inf(expr), 'nonconsecutive'): 1}
-                    ) + self.rec(expr.index)
+        # get local_id associated with minimum tag axis
+        min_local_id = None
+        for iname in my_inames:
+            tag = self.knl.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                if tag.axis == min_tag_axis:
+                    min_local_id = iname
+                    break  # there will be only one min local_id
+
+        # found local_id associated with minimum tag axis
 
-        # check coefficient of local_id0 for each axis
+        total_stride = None
+        # check coefficient of min_local_id for each axis
         from loopy.symbolic import CoefficientCollector
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
 
             coeffs = CoefficientCollector()(idx)
-            # check if he contains the lid 0 guy
+            # check if he contains the min lid guy
             try:
-                coeff_id0 = coeffs[Variable(local_id0)]
+                coeff_min_lid = coeffs[Variable(min_local_id)]
             except KeyError:
-                # does not contain local_id0
+                # does not contain min_local_id
                 continue
 
-            if coeff_id0 != 1:
-                # non-consecutive access
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # coefficient is 1, now determine if stride is 1
+            # found coefficient of min_local_id
+            # now determine stride
             from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
                 continue
 
-            if stride != 1:
-                # non-consecutive
-                return ToCountMap(
-                        {(self.type_inf(expr), 'nonconsecutive'): 1}
-                        ) + self.rec(expr.index)
-
-            # else, stride == 1, continue since another idx could contain id0
+            total_stride = stride*coeff_min_lid
+            #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
 
-        # loop finished without returning, stride==1 for every instance of local_id0
-        return ToCountMap(
-                {(self.type_inf(expr), 'consecutive'): 1}
-                ) + self.rec(expr.index)
+        return ToCountMap({(self.type_inf(expr),
+                           DataAccess(stride=total_stride)): 1}
+                          ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -717,13 +725,13 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1] == "uniform":
+            if key[1].stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
         for key in subs_assignee.dict:
             poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1] == "uniform":
+            if key[1].stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 2cf537f5e..6e5b6270b 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,9 +28,15 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
-from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly
+from loopy.statistics import (
+        get_op_poly,
+        get_gmem_access_poly,
+        get_barrier_poly,
+        DataAccess)
+
 import numpy as np
 
+from pymbolic.primitives import Variable
 
 def test_op_counter_basic():
 
@@ -227,19 +233,19 @@ def test_gmem_access_counter_basic():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                    ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                    ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                    ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                    ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -261,12 +267,12 @@ def test_gmem_access_counter_reduction():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f32 == n*l
 
@@ -289,16 +295,16 @@ def test_gmem_access_counter_logic():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f64 == n*m
 
@@ -323,19 +329,19 @@ def test_gmem_access_counter_specialops():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
     f32 = poly[
-                    (np.dtype(np.float32), 'uniform', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     f64 = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -364,12 +370,12 @@ def test_gmem_access_counter_bitwise():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'load')
+                    (np.dtype(np.int32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
     i32 = poly[
-                    (np.dtype(np.int32), 'uniform', 'store')
+                    (np.dtype(np.int32), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
@@ -398,23 +404,23 @@ def test_gmem_access_counter_mixed():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f32uniform = poly[
-                    (np.dtype(np.float32), 'uniform', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
                     ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
     f64uniform = poly[
-                    (np.dtype(np.float64), 'uniform', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store')
                     ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
@@ -442,19 +448,19 @@ def test_gmem_access_counter_nonconsec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load')
                     ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
     f64nonconsec = poly[
-                    (np.dtype(np.float64), 'nonconsecutive', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'store')
                     ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), 'nonconsecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store')
                     ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
@@ -482,19 +488,19 @@ def test_gmem_access_counter_consec():
     params = {'n': n, 'm': m, 'l': l}
 
     f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'load')
+                    (np.dtype(np.float64), DataAccess(stride=1), 'load')
                     ].eval_with_dict(params)
     f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'load')
+                    (np.dtype(np.float32), DataAccess(stride=1), 'load')
                     ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
     f64consec = poly[
-                    (np.dtype(np.float64), 'consecutive', 'store')
+                    (np.dtype(np.float64), DataAccess(stride=1), 'store')
                     ].eval_with_dict(params)
     f32consec = poly[
-                    (np.dtype(np.float32), 'consecutive', 'store')
+                    (np.dtype(np.float32), DataAccess(stride=1), 'store')
                     ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
@@ -588,17 +594,17 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
-                        (np.dtype(np.float32), 'nonconsecutive', 'load')
+                        (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
                         ].eval_with_dict(params)
     f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'load')
+                        (np.dtype(np.float32), DataAccess(stride=1), 'load')
                         ].eval_with_dict(params)
 
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
 
     f32coal = subscript_map[
-                        (np.dtype(np.float32), 'consecutive', 'store')
+                        (np.dtype(np.float32), DataAccess(stride=1), 'store')
                         ].eval_with_dict(params)
 
     assert f32coal == n*l
-- 
GitLab


From 09420000fb7a565fd3fa64a6ca8e8a609eae8008 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 13 Mar 2016 13:23:59 -0500
Subject: [PATCH 02/55] subscript counter now only looks for lid0; if not
 found, stride is set to sys.maxsize

---
 loopy/statistics.py     | 35 ++++++++++++++++++++++++-----------
 test/test_statistics.py |  2 +-
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5faeb12e3..e10de8cb4 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -291,15 +291,19 @@ class GlobalSubscriptCounter(CombineMapper):
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
         # find min tag axis
-        import sys
-        min_tag_axis = sys.maxsize
+        #import sys
+        local_id0 = None
+        #min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                if tag.axis < min_tag_axis:
-                    min_tag_axis = tag.axis
+                #if tag.axis < min_tag_axis:
+                #    min_tag_axis = tag.axis
+                if tag.axis == 0:
+                    local_id0 = iname
+                    break
 
         if not local_id_found:
             # count as uniform access
@@ -307,6 +311,15 @@ class GlobalSubscriptCounter(CombineMapper):
                     {(self.type_inf(expr), DataAccess(stride=0)): 1}
                     ) + self.rec(expr.index)
 
+        if local_id0 is None:
+            # only non-zero local id(s) found, assume non-consecutive access
+            #TODO what to do here?
+            import sys
+            return ToCountMap(
+                    {(self.type_inf(expr), DataAccess(stride=sys.maxsize)): 1}
+                    ) + self.rec(expr.index)
+
+        '''            
         # get local_id associated with minimum tag axis
         min_local_id = None
         for iname in my_inames:
@@ -315,11 +328,11 @@ class GlobalSubscriptCounter(CombineMapper):
                 if tag.axis == min_tag_axis:
                     min_local_id = iname
                     break  # there will be only one min local_id
+        '''
 
-        # found local_id associated with minimum tag axis
-
+        # found local_id associated with axis 0
         total_stride = None
-        # check coefficient of min_local_id for each axis
+        # check coefficient of local_id0 for each axis
         from loopy.symbolic import CoefficientCollector
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
@@ -327,12 +340,12 @@ class GlobalSubscriptCounter(CombineMapper):
             coeffs = CoefficientCollector()(idx)
             # check if he contains the min lid guy
             try:
-                coeff_min_lid = coeffs[Variable(min_local_id)]
+                coeff_lid0 = coeffs[Variable(local_id0)]
             except KeyError:
-                # does not contain min_local_id
+                # does not contain local_id0
                 continue
 
-            # found coefficient of min_local_id
+            # found coefficient of local_id0
             # now determine stride
             from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
@@ -340,7 +353,7 @@ class GlobalSubscriptCounter(CombineMapper):
             else:
                 continue
 
-            total_stride = stride*coeff_min_lid
+            total_stride = stride*coeff_lid0
             #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
 
         return ToCountMap({(self.type_inf(expr),
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 6e5b6270b..6aec20444 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -594,7 +594,7 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
-                        (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
+                        (np.dtype(np.float32), DataAccess(stride=sys.maxsize), 'load')
                         ].eval_with_dict(params)
     f32coal = subscript_map[
                         (np.dtype(np.float32), DataAccess(stride=1), 'load')
-- 
GitLab


From 2dbeb3877549f4564d96aae3314ab6636d8c8a56 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 15 Mar 2016 18:55:39 -0500
Subject: [PATCH 03/55] now calculating strides greater than 1

---
 loopy/statistics.py     | 56 +++++++++++++++++++++--------------------
 test/test_statistics.py |  2 +-
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index e10de8cb4..eff571668 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -291,19 +291,19 @@ class GlobalSubscriptCounter(CombineMapper):
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
         # find min tag axis
-        #import sys
-        local_id0 = None
-        #min_tag_axis = sys.maxsize
+        import sys
+        #local_id0 = None
+        min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 local_id_found = True
-                #if tag.axis < min_tag_axis:
-                #    min_tag_axis = tag.axis
-                if tag.axis == 0:
-                    local_id0 = iname
-                    break
+                if tag.axis < min_tag_axis:
+                    min_tag_axis = tag.axis
+                #if tag.axis == 0:
+                #    local_id0 = iname
+                #    break
 
         if not local_id_found:
             # count as uniform access
@@ -311,49 +311,51 @@ class GlobalSubscriptCounter(CombineMapper):
                     {(self.type_inf(expr), DataAccess(stride=0)): 1}
                     ) + self.rec(expr.index)
 
-        if local_id0 is None:
-            # only non-zero local id(s) found, assume non-consecutive access
-            #TODO what to do here?
-            import sys
-            return ToCountMap(
-                    {(self.type_inf(expr), DataAccess(stride=sys.maxsize)): 1}
-                    ) + self.rec(expr.index)
-
-        '''            
         # get local_id associated with minimum tag axis
-        min_local_id = None
+        min_lid = None
         for iname in my_inames:
             tag = self.knl.iname_to_tag.get(iname)
             if isinstance(tag, LocalIndexTag):
                 if tag.axis == min_tag_axis:
-                    min_local_id = iname
+                    min_lid = iname
                     break  # there will be only one min local_id
-        '''
 
-        # found local_id associated with axis 0
+        # found local_id associated with minimum tag axis
+
         total_stride = None
         # check coefficient of local_id0 for each axis
         from loopy.symbolic import CoefficientCollector
         from pymbolic.primitives import Variable
+        #print("==========================================================================================")
+        #print("expr: ", expr)
+        #print("min_lid: ", min_lid)
+        #print("min_tag_axis: ", min_tag_axis)
+        #print("Var(min_lid): ", Variable(min_lid))
         for idx, axis_tag in zip(index, array.dim_tags):
-
+            #print("...........................................................................................")
+            #print("idx, axis_tag: ", idx, "\t",  axis_tag)
             coeffs = CoefficientCollector()(idx)
+            #print("coeffs: ", coeffs)
             # check if he contains the min lid guy
             try:
-                coeff_lid0 = coeffs[Variable(local_id0)]
+                coeff_min_lid = coeffs[Variable(min_lid)]
             except KeyError:
-                # does not contain local_id0
+                # does not contain min_lid
+                #print("key error")
                 continue
-
-            # found coefficient of local_id0
+            #print("coeff_min_lid: ", coeff_min_lid)
+            #print("axis_tag: ", axis_tag)
+            # found coefficient of min_lid
             # now determine stride
             from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
+                #print("continuing")
                 continue
+            #print("stride: ", stride)
 
-            total_stride = stride*coeff_lid0
+            total_stride = stride*coeff_min_lid
             #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
 
         return ToCountMap({(self.type_inf(expr),
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 6aec20444..6e5b6270b 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -594,7 +594,7 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = get_gmem_access_poly(knl)
     f32uncoal = subscript_map[
-                        (np.dtype(np.float32), DataAccess(stride=sys.maxsize), 'load')
+                        (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
                         ].eval_with_dict(params)
     f32coal = subscript_map[
                         (np.dtype(np.float32), DataAccess(stride=1), 'load')
-- 
GitLab


From 0573d5457aacb21ce73e754304f86e35671784fa Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 15 Mar 2016 22:43:17 -0500
Subject: [PATCH 04/55] added StridedGmemAccess class, now used as key in gmem
 access dicts

---
 loopy/statistics.py     |  58 +++++++--------
 test/test_statistics.py | 156 +++++++++++++++++-----------------------
 2 files changed, 91 insertions(+), 123 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index eff571668..96bf511b5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -105,16 +105,24 @@ def stringify_stats_mapping(m):
     return result
 
 
-class DataAccess:
+class StridedGmemAccess:
 
-    def __init__(self, stride=0):
+    def __init__(self, dtype, stride, direction=None):
+        self.dtype = dtype
         self.stride = stride
+        self.direction = direction
 
     def __eq__(self, other):
-        return isinstance(other, DataAccess) and other.stride == self.stride  #TODO is this okay?
+        return isinstance(other, StridedGmemAccess) and (
+                other.dtype == self.dtype and
+                other.stride == self.stride and
+                other.direction == self.direction )
 
     def __hash__(self):
-        return hash(self.stride)
+        if self.direction == None:
+            return hash(str(self.dtype)+str(self.stride)+"None")
+        else:
+            return hash(str(self.dtype)+str(self.stride)+self.direction)
 
 
 # {{{ ExpressionOpCounter
@@ -292,7 +300,6 @@ class GlobalSubscriptCounter(CombineMapper):
 
         # find min tag axis
         import sys
-        #local_id0 = None
         min_tag_axis = sys.maxsize
         local_id_found = False
         for iname in my_inames:
@@ -301,14 +308,11 @@ class GlobalSubscriptCounter(CombineMapper):
                 local_id_found = True
                 if tag.axis < min_tag_axis:
                     min_tag_axis = tag.axis
-                #if tag.axis == 0:
-                #    local_id0 = iname
-                #    break
 
         if not local_id_found:
             # count as uniform access
             return ToCountMap(
-                    {(self.type_inf(expr), DataAccess(stride=0)): 1}
+                    {StridedGmemAccess(self.type_inf(expr), 0): 1}
                     ) + self.rec(expr.index)
 
         # get local_id associated with minimum tag axis
@@ -326,41 +330,27 @@ class GlobalSubscriptCounter(CombineMapper):
         # check coefficient of local_id0 for each axis
         from loopy.symbolic import CoefficientCollector
         from pymbolic.primitives import Variable
-        #print("==========================================================================================")
-        #print("expr: ", expr)
-        #print("min_lid: ", min_lid)
-        #print("min_tag_axis: ", min_tag_axis)
-        #print("Var(min_lid): ", Variable(min_lid))
         for idx, axis_tag in zip(index, array.dim_tags):
-            #print("...........................................................................................")
-            #print("idx, axis_tag: ", idx, "\t",  axis_tag)
             coeffs = CoefficientCollector()(idx)
-            #print("coeffs: ", coeffs)
             # check if he contains the min lid guy
             try:
                 coeff_min_lid = coeffs[Variable(min_lid)]
             except KeyError:
                 # does not contain min_lid
-                #print("key error")
                 continue
-            #print("coeff_min_lid: ", coeff_min_lid)
-            #print("axis_tag: ", axis_tag)
             # found coefficient of min_lid
             # now determine stride
             from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
-                #print("continuing")
                 continue
-            #print("stride: ", stride)
 
             total_stride = stride*coeff_min_lid
             #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
 
-        return ToCountMap({(self.type_inf(expr),
-                           DataAccess(stride=total_stride)): 1}
-                          ) + self.rec(expr.index)
+        return ToCountMap({StridedGmemAccess(self.type_inf(expr),
+                           total_stride): 1}) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -727,26 +717,28 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
     for insn in knl.instructions:
         # count subscripts, distinguishing loads and stores
         subs_expr = subscript_counter(insn.expression)
-        subs_expr = ToCountMap(dict(
-            (key + ("load",), val)
-            for key, val in six.iteritems(subs_expr.dict)))
+        for key in subs_expr.dict:
+            subs_expr.dict[StridedGmemAccess(
+                           key.dtype, key.stride, 'load')
+                          ] = subs_expr.dict.pop(key)
         subs_assignee = subscript_counter(insn.assignee)
-        subs_assignee = ToCountMap(dict(
-            (key + ("store",), val)
-            for key, val in six.iteritems(subs_assignee.dict)))
+        for key in subs_assignee.dict:
+            subs_assignee.dict[StridedGmemAccess(
+                           key.dtype, key.stride, 'store')
+                          ] = subs_assignee.dict.pop(key)
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
-            if key[1].stride == 0:
+            if isinstance(key.stride, int) and key.stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
         for key in subs_assignee.dict:
             poly = ToCountMap({key: subs_assignee.dict[key]})
-            if key[1].stride == 0:
+            if isinstance(key.stride, int) and key.stride == 0:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 6e5b6270b..a4fc022d5 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -32,7 +32,7 @@ from loopy.statistics import (
         get_op_poly,
         get_gmem_access_poly,
         get_barrier_poly,
-        DataAccess)
+        StridedGmemAccess)
 
 import numpy as np
 
@@ -232,21 +232,17 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
-                   ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+               ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
-                   ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
-                   ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+               ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
@@ -266,14 +262,12 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+               ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+               ].eval_with_dict(params)
     assert f32 == n*l
 
 
@@ -294,18 +288,15 @@ def test_gmem_access_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+               ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+               ].eval_with_dict(params)
     assert f64 == n*m
 
 
@@ -328,21 +319,17 @@ def test_gmem_access_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+               ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
-    f64 = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+               ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
@@ -369,14 +356,12 @@ def test_gmem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[
-                    (np.dtype(np.int32), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
+    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'load')
+               ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[
-                    (np.dtype(np.int32), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
+    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'store')
+               ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
 
@@ -403,24 +388,21 @@ def test_gmem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
-    f32uniform = poly[
-                    (np.dtype(np.float32), DataAccess(stride=0), 'load')
-                    ].eval_with_dict(params)
+    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+                      ].eval_with_dict(params)
+    f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+                      ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
+                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'load')
                     ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[
-                    (np.dtype(np.float64), DataAccess(stride=0), 'store')
-                    ].eval_with_dict(params)
+    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+                      ].eval_with_dict(params)
     f32nonconsec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store')
+                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'store')
                     ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
@@ -447,21 +429,21 @@ def test_gmem_access_counter_nonconsec():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load')
-                    ].eval_with_dict(params)
+    f64nonconsec = poly[StridedGmemAccess(
+                        np.dtype(np.float64), Variable('m'), 'load')
+                        ].eval_with_dict(params)
+    f32nonconsec = poly[StridedGmemAccess(
+                        np.dtype(np.float32), Variable('m')*Variable('l'), 'load')
+                        ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[
-                    (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'store')
-                    ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store')
-                    ].eval_with_dict(params)
+    f64nonconsec = poly[StridedGmemAccess(
+                        np.dtype(np.float64), Variable('m'), 'store')
+                        ].eval_with_dict(params)
+    f32nonconsec = poly[StridedGmemAccess(
+                        np.dtype(np.float32), Variable('m')*Variable('l'), 'store')
+                        ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
 
@@ -487,21 +469,17 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[
-                    (np.dtype(np.float64), DataAccess(stride=1), 'load')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=1), 'load')
-                    ].eval_with_dict(params)
+    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'load')
+                     ].eval_with_dict(params)
+    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'load')
+                     ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[
-                    (np.dtype(np.float64), DataAccess(stride=1), 'store')
-                    ].eval_with_dict(params)
-    f32consec = poly[
-                    (np.dtype(np.float32), DataAccess(stride=1), 'store')
-                    ].eval_with_dict(params)
+    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'store')
+                     ].eval_with_dict(params)
+    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'store')
+                     ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
 
@@ -593,19 +571,17 @@ def test_all_counters_parallel_matmul():
     assert i32ops == n*m*l*4 + l*n*4
 
     subscript_map = get_gmem_access_poly(knl)
-    f32uncoal = subscript_map[
-                        (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load')
-                        ].eval_with_dict(params)
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), DataAccess(stride=1), 'load')
-                        ].eval_with_dict(params)
+    f32uncoal = subscript_map[StridedGmemAccess(
+                              np.dtype(np.float32), Variable('m'), 'load')
+                              ].eval_with_dict(params)
+    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'load')
+                            ].eval_with_dict(params)
 
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
 
-    f32coal = subscript_map[
-                        (np.dtype(np.float32), DataAccess(stride=1), 'store')
-                        ].eval_with_dict(params)
+    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'store')
+                            ].eval_with_dict(params)
 
     assert f32coal == n*l
 
-- 
GitLab


From c0dcf557e5996990e80fecc65ae46268728e876f Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 15 Mar 2016 23:39:45 -0500
Subject: [PATCH 05/55] replaced operation tuples with TypedOp class

---
 loopy/statistics.py     | 39 +++++++++++++++++--------
 test/test_statistics.py | 65 +++++++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 96bf511b5..5855f0852 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -105,6 +105,21 @@ def stringify_stats_mapping(m):
     return result
 
 
+class TypedOp:
+
+    def __init__(self, dtype, name):
+        self.dtype = dtype
+        self.name = name
+
+    def __eq__(self, other):
+        return isinstance(other, TypedOp) and (
+                other.dtype == self.dtype and
+                other.name == self.name )
+
+    def __hash__(self):
+        return hash(str(self.dtype)+self.name)
+
+
 class StridedGmemAccess:
 
     def __init__(self, dtype, stride, direction=None):
@@ -151,7 +166,7 @@ class ExpressionOpCounter(CombineMapper):
 
     def map_call(self, expr):
         return ToCountMap(
-                    {(self.type_inf(expr), 'func:'+str(expr.function)): 1}
+                    {TypedOp(self.type_inf(expr), 'func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
@@ -164,20 +179,20 @@ class ExpressionOpCounter(CombineMapper):
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
-                    {(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    {TypedOp(self.type_inf(expr), 'add'): len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
         assert expr.children
-        return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1})
+        return sum(ToCountMap({TypedOp(self.type_inf(expr), 'mul'): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
-                   ToCountMap({(self.type_inf(expr), 'mul'): -1})
+                   ToCountMap({TypedOp(self.type_inf(expr), 'mul'): -1})
 
     def map_quotient(self, expr, *args):
-        return ToCountMap({(self.type_inf(expr), 'div'): 1}) \
+        return ToCountMap({TypedOp(self.type_inf(expr), 'div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -185,24 +200,24 @@ class ExpressionOpCounter(CombineMapper):
     map_remainder = map_quotient
 
     def map_power(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \
+        return ToCountMap({TypedOp(self.type_inf(expr), 'pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \
+        return ToCountMap({TypedOp(self.type_inf(expr), 'shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
     def map_bitwise_not(self, expr):
-        return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \
+        return ToCountMap({TypedOp(self.type_inf(expr), 'bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap(
-                        {(self.type_inf(expr), 'bw'): len(expr.children)-1}
+                        {TypedOp(self.type_inf(expr), 'bw'): len(expr.children)-1}
                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
@@ -230,9 +245,9 @@ class ExpressionOpCounter(CombineMapper):
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
     def map_min(self, expr):
-        return ToCountMap(
-                        {(self.type_inf(expr), 'maxmin'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({TypedOp(
+                           self.type_inf(expr), 'maxmin'): len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index a4fc022d5..5d6fac573 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -32,7 +32,8 @@ from loopy.statistics import (
         get_op_poly,
         get_gmem_access_poly,
         get_barrier_poly,
-        StridedGmemAccess)
+        StridedGmemAccess,
+        TypedOp)
 
 import numpy as np
 
@@ -57,11 +58,11 @@ def test_op_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f64mul = poly[TypedOp(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
@@ -82,8 +83,8 @@ def test_op_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
 
@@ -104,10 +105,10 @@ def test_op_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    f64div = poly[TypedOp(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -133,18 +134,18 @@ def test_op_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
-    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f64pow = poly[TypedOp(np.dtype(np.float64), 'pow')].eval_with_dict(params)
+    f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = poly[TypedOp(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = poly[TypedOp(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
-    assert f64pow == i32add == f64rsqrt == f64sin == n*m
+    assert f64pow == i32add == f64rsq == f64sin == n*m
 
 
 def test_op_counter_bitwise():
@@ -169,12 +170,12 @@ def test_op_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
-    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    i32bw = poly[TypedOp(np.dtype(np.int32), 'bw')].eval_with_dict(params)
+    i64bw = poly[TypedOp(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = poly[TypedOp(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = poly[TypedOp(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = poly[TypedOp(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -203,7 +204,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
+    poly = get_op_poly(knl)[TypedOp(np.dtype(np.float64), 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -555,16 +556,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = get_op_poly(knl)
     f32mul = op_map[
-                        (np.dtype(np.float32), 'mul')
+                        TypedOp(np.dtype(np.float32), 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        (np.dtype(np.float32), 'add')
+                        TypedOp(np.dtype(np.float32), 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        (np.dtype(np.int32), 'add')
+                        TypedOp(np.dtype(np.int32), 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        (np.dtype(np.int32), 'mul')
+                        TypedOp(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
-- 
GitLab


From b3d9498ca3bc92b1b23428f859ee83f610bc7a16 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 1 Apr 2016 11:29:01 -0500
Subject: [PATCH 06/55] temporary fix for stride counting when min tag axis >0,
 and added variable name to StridedGmemAccess (still work to do)

---
 loopy/statistics.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5855f0852..3f2c3a4b5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -122,22 +122,30 @@ class TypedOp:
 
 class StridedGmemAccess:
 
-    def __init__(self, dtype, stride, direction=None):
+    #TODO "ANY_VAR" does not work yet
+
+    def __init__(self, dtype, stride, direction=None, variable='ANY_VAR'):
         self.dtype = dtype
         self.stride = stride
         self.direction = direction
+        self.variable = variable
 
     def __eq__(self, other):
         return isinstance(other, StridedGmemAccess) and (
                 other.dtype == self.dtype and
                 other.stride == self.stride and
-                other.direction == self.direction )
+                other.direction == self.direction and
+                ((self.variable == 'ANY_VAR' or other.variable == 'ANY_VAR') or
+                 self.variable == other.variable))
 
     def __hash__(self):
-        if self.direction == None:
-            return hash(str(self.dtype)+str(self.stride)+"None")
-        else:
-            return hash(str(self.dtype)+str(self.stride)+self.direction)
+        direction = self.direction
+        variable = self.variable
+        if direction == None:
+            direction = 'None'
+        if variable == None:
+            variable = 'ANY_VAR'
+        return hash(str(self.dtype)+str(self.stride)+direction+variable)
 
 
 # {{{ ExpressionOpCounter
@@ -310,7 +318,7 @@ class GlobalSubscriptCounter(CombineMapper):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
         # find min tag axis
@@ -327,7 +335,7 @@ class GlobalSubscriptCounter(CombineMapper):
         if not local_id_found:
             # count as uniform access
             return ToCountMap(
-                    {StridedGmemAccess(self.type_inf(expr), 0): 1}
+                    {StridedGmemAccess(self.type_inf(expr), 0, direction=None, variable=name): 1}
                     ) + self.rec(expr.index)
 
         # get local_id associated with minimum tag axis
@@ -342,8 +350,10 @@ class GlobalSubscriptCounter(CombineMapper):
         # found local_id associated with minimum tag axis
 
         total_stride = None
-        # check coefficient of local_id0 for each axis
+        extra_stride = 1
+        # check coefficient of min_lid for each axis
         from loopy.symbolic import CoefficientCollector
+        from loopy.kernel.array import FixedStrideArrayDimTag
         from pymbolic.primitives import Variable
         for idx, axis_tag in zip(index, array.dim_tags):
             coeffs = CoefficientCollector()(idx)
@@ -355,17 +365,22 @@ class GlobalSubscriptCounter(CombineMapper):
                 continue
             # found coefficient of min_lid
             # now determine stride
-            from loopy.kernel.array import FixedStrideArrayDimTag
             if isinstance(axis_tag, FixedStrideArrayDimTag):
                 stride = axis_tag.stride
             else:
                 continue
 
-            total_stride = stride*coeff_min_lid
+            total_stride = stride*coeff_min_lid*extra_stride
             #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
 
+        #TODO temporary fix that needs changing:
+        if min_tag_axis != 0:
+            print("...... min tag axis (%d) is not zero! ......" % (min_tag_axis))
+            return ToCountMap({StridedGmemAccess(self.type_inf(expr),
+                           sys.maxsize, direction=None, variable=name): 1}) + self.rec(expr.index)
+
         return ToCountMap({StridedGmemAccess(self.type_inf(expr),
-                           total_stride): 1}) + self.rec(expr.index)
+                           total_stride, direction=None, variable=name): 1}) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -734,12 +749,12 @@ def get_gmem_access_poly(knl):  # for now just counting subscripts
         subs_expr = subscript_counter(insn.expression)
         for key in subs_expr.dict:
             subs_expr.dict[StridedGmemAccess(
-                           key.dtype, key.stride, 'load')
+                           key.dtype, key.stride, direction='load', variable=key.variable)
                           ] = subs_expr.dict.pop(key)
         subs_assignee = subscript_counter(insn.assignee)
         for key in subs_assignee.dict:
             subs_assignee.dict[StridedGmemAccess(
-                           key.dtype, key.stride, 'store')
+                           key.dtype, key.stride, direction='store', variable=key.variable)
                           ] = subs_assignee.dict.pop(key)
 
         insn_inames = knl.insn_inames(insn)
-- 
GitLab


From 138c0b1e553a4f2a6edcdba15a6deaee9e1b28d0 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 1 Apr 2016 11:29:25 -0500
Subject: [PATCH 07/55] added variable name to StridedGmemAccess (still work to
 do)

---
 test/test_statistics.py | 107 ++++++++++++++++++++++++++++------------
 1 file changed, 76 insertions(+), 31 deletions(-)

diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5d6fac573..56b9f4003 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -233,16 +233,20 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
+               ].eval_with_dict(params)
+    f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -263,11 +267,13 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
+               ].eval_with_dict(params)
+    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
                ].eval_with_dict(params)
     assert f32 == n*l
 
@@ -289,14 +295,14 @@ def test_gmem_access_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='g')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert f64 == n*m
 
@@ -320,16 +326,20 @@ def test_gmem_access_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
+               ].eval_with_dict(params)
+    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+    f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store')
+    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
@@ -357,11 +367,19 @@ def test_gmem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'load')
+    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='a')
+               ].eval_with_dict(params)
+    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='b')
+               ].eval_with_dict(params)
+    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='g')
+               ].eval_with_dict(params)
+    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'store')
+    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='c')
+               ].eval_with_dict(params)
+    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
@@ -389,21 +407,26 @@ def test_gmem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load')
+    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
                       ].eval_with_dict(params)
-    f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load')
+    f64uniform += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
+                      ].eval_with_dict(params)
+    f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='x')
                       ].eval_with_dict(params)
     f32nonconsec = poly[
-                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'load')
+                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='a')
+                    ].eval_with_dict(params)
+    f32nonconsec += poly[
+                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='b')
                     ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store')
+    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
                       ].eval_with_dict(params)
     f32nonconsec = poly[
-                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'store')
+                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='store', variable='c')
                     ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
@@ -431,19 +454,25 @@ def test_gmem_access_counter_nonconsec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f64nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float64), Variable('m'), 'load')
+                        np.dtype(np.float64), Variable('m'), direction='load', variable='g')
+                        ].eval_with_dict(params)
+    f64nonconsec += poly[StridedGmemAccess(
+                        np.dtype(np.float64), Variable('m'), direction='load', variable='h')
                         ].eval_with_dict(params)
     f32nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float32), Variable('m')*Variable('l'), 'load')
+                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='a')
+                        ].eval_with_dict(params)
+    f32nonconsec += poly[StridedGmemAccess(
+                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='b')
                         ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
     f64nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float64), Variable('m'), 'store')
+                        np.dtype(np.float64), Variable('m'), direction='store', variable='e')
                         ].eval_with_dict(params)
     f32nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float32), Variable('m')*Variable('l'), 'store')
+                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='store', variable='c')
                         ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
@@ -470,16 +499,20 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'load')
+    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'load')
+    f64consec += poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='h')
+                     ].eval_with_dict(params)
+    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a')
+                     ].eval_with_dict(params)
+    f32consec += poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b')
                      ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'store')
+    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'store')
+    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
@@ -572,16 +605,28 @@ def test_all_counters_parallel_matmul():
     assert i32ops == n*m*l*4 + l*n*4
 
     subscript_map = get_gmem_access_poly(knl)
+    #f32uncoal = subscript_map[StridedGmemAccess(
+    #                          np.dtype(np.float32), Variable('m'), direction='load', variable='ANY_VAR')
+    #                          ].eval_with_dict(params)
+    #test = StridedGmemAccess(np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR')
+    #print("test key: ", test.dtype, test.stride, test.direction, test.variable)
+    #for key in subscript_map:
+    #    print(key.dtype, key.stride, key.direction, key.variable)
+    f32uncoal = subscript_map[StridedGmemAccess(
+                              np.dtype(np.float32), sys.maxsize, direction='load', variable='a')
+                              ].eval_with_dict(params)
+    '''
     f32uncoal = subscript_map[StridedGmemAccess(
-                              np.dtype(np.float32), Variable('m'), 'load')
+                              np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR')
                               ].eval_with_dict(params)
-    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'load')
+    '''
+    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b')
                             ].eval_with_dict(params)
 
     assert f32uncoal == n*m*l
     assert f32coal == n*m*l
 
-    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'store')
+    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
     assert f32coal == n*l
-- 
GitLab


From e16469d799a21f7cc5459a445c64bc05a21209cc Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 1 Apr 2016 17:35:36 -0500
Subject: [PATCH 08/55] adding local mem access counter

---
 loopy/statistics.py     | 188 ++++++++++++++++++++++++++++++++++++++++
 test/test_statistics.py |  35 ++++----
 2 files changed, 204 insertions(+), 19 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 3f2c3a4b5..419eb2868 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -120,6 +120,24 @@ class TypedOp:
         return hash(str(self.dtype)+self.name)
 
 
+class LmemAccess:
+
+    def __init__(self, dtype, direction=None):
+        self.dtype = dtype
+        self.direction = direction
+
+    def __eq__(self, other):
+        return isinstance(other, LmemAccess) and (
+                other.dtype == self.dtype and
+                other.direction == self.direction)
+
+    def __hash__(self):
+        direction = self.direction
+        if direction == None:
+            direction = 'None'
+        return hash(str(self.dtype)+direction)
+
+
 class StridedGmemAccess:
 
     #TODO "ANY_VAR" does not work yet
@@ -279,6 +297,116 @@ class ExpressionOpCounter(CombineMapper):
 # }}}
 
 
+# {{{ LocalSubscriptCounter
+
+class LocalSubscriptCounter(CombineMapper):
+
+    def __init__(self, knl):
+        self.knl = knl
+        from loopy.expression import TypeInferenceMapper
+        self.type_inf = TypeInferenceMapper(knl)
+
+    def combine(self, values):
+        return sum(values)
+
+    def map_constant(self, expr):
+        return ToCountMap()
+
+    map_tagged_variable = map_constant
+    map_variable = map_constant
+
+    def map_call(self, expr):
+        return self.rec(expr.parameters)
+
+    def map_subscript(self, expr):
+        name = expr.aggregate.name  # name of array
+
+        if name in self.knl.temporary_variables:
+            array = self.knl.temporary_variables[name]
+            #print("array: ", array)
+            #print("is local? ", array.is_local)
+            if array.is_local:
+                return ToCountMap(
+                        {LmemAccess(self.type_inf(expr), direction=None): 1}
+                        ) + self.rec(expr.index)
+
+        return self.rec(expr.index)
+            
+    def map_sum(self, expr):
+        if expr.children:
+            return sum(self.rec(child) for child in expr.children)
+        else:
+            return ToCountMap()
+
+    map_product = map_sum
+
+    def map_quotient(self, expr, *args):
+        return self.rec(expr.numerator) + self.rec(expr.denominator)
+
+    map_floor_div = map_quotient
+    map_remainder = map_quotient
+
+    def map_power(self, expr):
+        return self.rec(expr.base) + self.rec(expr.exponent)
+
+    def map_left_shift(self, expr):
+        return self.rec(expr.shiftee)+self.rec(expr.shift)
+
+    map_right_shift = map_left_shift
+
+    def map_bitwise_not(self, expr):
+        return self.rec(expr.child)
+
+    def map_bitwise_or(self, expr):
+        return sum(self.rec(child) for child in expr.children)
+
+    map_bitwise_xor = map_bitwise_or
+    map_bitwise_and = map_bitwise_or
+
+    def map_comparison(self, expr):
+        return self.rec(expr.left)+self.rec(expr.right)
+
+    map_logical_not = map_bitwise_not
+    map_logical_or = map_bitwise_or
+    map_logical_and = map_logical_or
+
+    def map_if(self, expr):
+        warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
+                      "sum of if-statement branches.")
+        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+
+    def map_if_positive(self, expr):
+        warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
+                      "sum of if_pos-statement branches.")
+        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+
+    map_min = map_bitwise_or
+    map_max = map_min
+
+    def map_common_subexpression(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "common_subexpression, "
+                                  "map_common_subexpression not implemented.")
+
+    def map_substitution(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "substitution, "
+                                  "map_substitution not implemented.")
+
+    def map_derivative(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered "
+                                  "derivative, "
+                                  "map_derivative not implemented.")
+
+    def map_slice(self, expr):
+        raise NotImplementedError("LocalSubscriptCounter encountered slice, "
+                                  "map_slice not implemented.")
+
+# }}}
+
+
+
+
 # {{{ GlobalSubscriptCounter
 
 class GlobalSubscriptCounter(CombineMapper):
@@ -674,6 +802,66 @@ def sum_ops_to_dtypes(op_poly_dict):
     return result
 
 
+def get_lmem_access_poly(knl):
+
+    """Count the number of local memory accesses in a loopy kernel.
+    """
+
+    from loopy.preprocess import preprocess_kernel, infer_unknown_types
+
+    class CacheHolder(object):
+        pass
+
+    cache_holder = CacheHolder()
+
+    @memoize_in(cache_holder, "insn_count")
+    def get_insn_count(knl, insn_inames):
+        inames_domain = knl.get_inames_domain(insn_inames)
+        domain = (inames_domain.project_out_except(
+                                insn_inames, [dim_type.set]))
+        return count(knl, domain)
+
+    knl = infer_unknown_types(knl, expect_completion=True)
+    knl = preprocess_kernel(knl)
+
+    subs_poly = ToCountMap()
+    subscript_counter = LocalSubscriptCounter(knl)
+    for insn in knl.instructions:
+        # count subscripts, distinguishing loads and stores
+        subs_expr = subscript_counter(insn.expression)
+        for key in subs_expr.dict:
+            subs_expr.dict[LmemAccess(
+                           key.dtype, direction='load')
+                          ] = subs_expr.dict.pop(key)
+        subs_assignee = subscript_counter(insn.assignee)
+        for key in subs_assignee.dict:
+            print(key.dtype, key.direction, subs_assignee.dict[key])
+
+        # for now, not counting stores in local mem
+        '''
+        for key in subs_assignee.dict:
+            subs_assignee.dict[LmemAccess(
+                               key.dtype, direction='store')
+                              ] = subs_assignee.dict.pop(key)
+        '''
+
+        insn_inames = knl.insn_inames(insn)
+
+        # use count excluding local index tags for uniform accesses
+        for key in subs_expr.dict:
+            poly = ToCountMap({key: subs_expr.dict[key]})
+            subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+
+        # for now, not counting stores in local mem
+        '''
+        for key in subs_assignee.dict:
+            poly = ToCountMap({key: subs_assignee.dict[key]})
+            subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+        '''
+
+    return subs_poly.dict
+
+
 # {{{ get_gmem_access_poly
 def get_gmem_access_poly(knl):  # for now just counting subscripts
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 56b9f4003..0353ac0d4 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -31,8 +31,10 @@ import loopy as lp
 from loopy.statistics import (
         get_op_poly,
         get_gmem_access_poly,
+        get_lmem_access_poly,
         get_barrier_poly,
         StridedGmemAccess,
+        LmemAccess,
         TypedOp)
 
 import numpy as np
@@ -578,6 +580,9 @@ def test_all_counters_parallel_matmul():
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", 16)
+    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
+    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])
 
     n = 512
     m = 256
@@ -585,7 +590,7 @@ def test_all_counters_parallel_matmul():
     params = {'n': n, 'm': m, 'l': l}
 
     barrier_count = get_barrier_poly(knl).eval_with_dict(params)
-    assert barrier_count == 0
+    assert barrier_count == 2*m/16
 
     op_map = get_op_poly(knl)
     f32mul = op_map[
@@ -602,35 +607,27 @@ def test_all_counters_parallel_matmul():
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
-    assert i32ops == n*m*l*4 + l*n*4
 
     subscript_map = get_gmem_access_poly(knl)
-    #f32uncoal = subscript_map[StridedGmemAccess(
-    #                          np.dtype(np.float32), Variable('m'), direction='load', variable='ANY_VAR')
-    #                          ].eval_with_dict(params)
-    #test = StridedGmemAccess(np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR')
-    #print("test key: ", test.dtype, test.stride, test.direction, test.variable)
-    #for key in subscript_map:
-    #    print(key.dtype, key.stride, key.direction, key.variable)
-    f32uncoal = subscript_map[StridedGmemAccess(
-                              np.dtype(np.float32), sys.maxsize, direction='load', variable='a')
-                              ].eval_with_dict(params)
-    '''
-    f32uncoal = subscript_map[StridedGmemAccess(
-                              np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR')
-                              ].eval_with_dict(params)
-    '''
+
     f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b')
                             ].eval_with_dict(params)
+    f32coal += subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a')
+                            ].eval_with_dict(params)
 
-    assert f32uncoal == n*m*l
-    assert f32coal == n*m*l
+    assert f32coal == n*m+m*l
 
     f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
     assert f32coal == n*l
 
+    local_subs_map = get_lmem_access_poly(knl)
+
+    local_subs_l = local_subs_map[LmemAccess(np.dtype(np.float32), direction='load')
+                                  ].eval_with_dict(params)
+
+    assert local_subs_l == n*m*l*2
 
 def test_gather_access_footprint():
     knl = lp.make_kernel(
-- 
GitLab


From 0d069a656e5661e9dc82df8b5d8097e2b538a98b Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 1 Sep 2016 12:30:23 -0500
Subject: [PATCH 09/55] commenting out debug print

---
 loopy/statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 419eb2868..8b57b782b 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -834,8 +834,8 @@ def get_lmem_access_poly(knl):
                            key.dtype, direction='load')
                           ] = subs_expr.dict.pop(key)
         subs_assignee = subscript_counter(insn.assignee)
-        for key in subs_assignee.dict:
-            print(key.dtype, key.direction, subs_assignee.dict[key])
+        #for key in subs_assignee.dict:
+        #    print(key.dtype, key.direction, subs_assignee.dict[key])
 
         # for now, not counting stores in local mem
         '''
-- 
GitLab


From 1a3e4259784c0b791cbed8f06c8da6c49d0ce272 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 15 Oct 2016 19:33:56 -0500
Subject: [PATCH 10/55] TypedOp -> Op

---
 loopy/statistics.py     | 26 ++++++++---------
 test/test_statistics.py | 62 ++++++++++++++++++++---------------------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 0257623ed..684a683c5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -108,14 +108,14 @@ def stringify_stats_mapping(m):
     return result
 
 
-class TypedOp:
+class Op:
 
     def __init__(self, dtype, name):
         self.dtype = dtype
         self.name = name
 
     def __eq__(self, other):
-        return isinstance(other, TypedOp) and (
+        return isinstance(other, Op) and (
                 other.dtype == self.dtype and
                 other.name == self.name )
 
@@ -195,7 +195,7 @@ class ExpressionOpCounter(CombineMapper):
 
     def map_call(self, expr):
         return ToCountMap(
-                    {TypedOp(self.type_inf(expr), 'func:'+str(expr.function)): 1}
+                    {Op(self.type_inf(expr), 'func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
@@ -208,20 +208,20 @@ class ExpressionOpCounter(CombineMapper):
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
-                    {TypedOp(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    {Op(self.type_inf(expr), 'add'): len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
         assert expr.children
-        return sum(ToCountMap({TypedOp(self.type_inf(expr), 'mul'): 1})
+        return sum(ToCountMap({Op(self.type_inf(expr), 'mul'): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
-                   ToCountMap({TypedOp(self.type_inf(expr), 'mul'): -1})
+                   ToCountMap({Op(self.type_inf(expr), 'mul'): -1})
 
     def map_quotient(self, expr, *args):
-        return ToCountMap({TypedOp(self.type_inf(expr), 'div'): 1}) \
+        return ToCountMap({Op(self.type_inf(expr), 'div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -229,24 +229,24 @@ class ExpressionOpCounter(CombineMapper):
     map_remainder = map_quotient
 
     def map_power(self, expr):
-        return ToCountMap({TypedOp(self.type_inf(expr), 'pow'): 1}) \
+        return ToCountMap({Op(self.type_inf(expr), 'pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
-        return ToCountMap({TypedOp(self.type_inf(expr), 'shift'): 1}) \
+        return ToCountMap({Op(self.type_inf(expr), 'shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
     def map_bitwise_not(self, expr):
-        return ToCountMap({TypedOp(self.type_inf(expr), 'bw'): 1}) \
+        return ToCountMap({Op(self.type_inf(expr), 'bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap(
-                        {TypedOp(self.type_inf(expr), 'bw'): len(expr.children)-1}
+                        {Op(self.type_inf(expr), 'bw'): len(expr.children)-1}
                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
@@ -274,7 +274,7 @@ class ExpressionOpCounter(CombineMapper):
         return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
 
     def map_min(self, expr):
-        return ToCountMap({TypedOp(
+        return ToCountMap({Op(
                            self.type_inf(expr), 'maxmin'): len(expr.children)-1}
                          ) + sum(self.rec(child) for child in expr.children)
 
@@ -807,7 +807,7 @@ def get_op_poly(knl, numpy_types=True):
 
     if numpy_types:
         result = dict(
-                (TypedOp(op.dtype.numpy_dtype, op.name), count)
+                (Op(op.dtype.numpy_dtype, op.name), count)
                 for op, count in six.iteritems(result))
 
     return result
diff --git a/test/test_statistics.py b/test/test_statistics.py
index c2d1d459c..4b4a344f4 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -35,7 +35,7 @@ from loopy.statistics import (
         get_synchronization_poly,
         StridedGmemAccess,
         LmemAccess,
-        TypedOp)
+        Op)
 import loopy as lp
 import numpy as np
 
@@ -60,11 +60,11 @@ def test_op_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f64mul = poly[TypedOp(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
@@ -85,8 +85,8 @@ def test_op_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
 
@@ -107,10 +107,10 @@ def test_op_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    f64div = poly[TypedOp(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -136,14 +136,14 @@ def test_op_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f64pow = poly[TypedOp(np.dtype(np.float64), 'pow')].eval_with_dict(params)
-    f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsq = poly[TypedOp(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[TypedOp(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f64pow = poly[Op(np.dtype(np.float64), 'pow')].eval_with_dict(params)
+    f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = poly[Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
@@ -172,12 +172,12 @@ def test_op_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    i32bw = poly[TypedOp(np.dtype(np.int32), 'bw')].eval_with_dict(params)
-    i64bw = poly[TypedOp(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[TypedOp(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[TypedOp(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[TypedOp(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    i32bw = poly[Op(np.dtype(np.int32), 'bw')].eval_with_dict(params)
+    i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = poly[Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -206,7 +206,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[TypedOp(np.dtype(np.float64), 'mul')]
+    poly = lp.get_op_poly(knl)[Op(np.dtype(np.float64), 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -600,16 +600,16 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_poly(knl)
     f32mul = op_map[
-                        TypedOp(np.dtype(np.float32), 'mul')
+                        Op(np.dtype(np.float32), 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        TypedOp(np.dtype(np.float32), 'add')
+                        Op(np.dtype(np.float32), 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        TypedOp(np.dtype(np.int32), 'add')
+                        Op(np.dtype(np.int32), 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        TypedOp(np.dtype(np.int32), 'mul')
+                        Op(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
-- 
GitLab


From fadc351819db1060fec85307bf3ed92ee461621c Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 15 Oct 2016 20:37:00 -0500
Subject: [PATCH 11/55] merged StridedGmemAccess and LmemAccess into MemAccess,
 also reformatted long lines of code

---
 loopy/statistics.py     | 131 ++++++++++++++---------
 test/test_statistics.py | 229 ++++++++++++++++++++++++----------------
 2 files changed, 219 insertions(+), 141 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 684a683c5..a08e36c50 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -122,7 +122,7 @@ class Op:
     def __hash__(self):
         return hash(str(self.dtype)+self.name)
 
-
+'''
 class LmemAccess:
 
     def __init__(self, dtype, direction=None):
@@ -139,20 +139,25 @@ class LmemAccess:
         if direction == None:
             direction = 'None'
         return hash(str(self.dtype)+direction)
+'''
 
 
-class StridedGmemAccess:
+#class StridedGmemAccess:
+class MemAccess:
 
     #TODO "ANY_VAR" does not work yet
-
-    def __init__(self, dtype, stride, direction=None, variable='ANY_VAR'):
+    #TODO currently counting all lmem access as stride-1
+    def __init__(self, mtype, dtype, stride=1, direction=None,
+                 variable='ANY_VAR'):
+        self.mtype = mtype
         self.dtype = dtype
         self.stride = stride
         self.direction = direction
         self.variable = variable
 
     def __eq__(self, other):
-        return isinstance(other, StridedGmemAccess) and (
+        return isinstance(other, MemAccess) and (
+                other.mtype == self.mtype and
                 other.dtype == self.dtype and
                 other.stride == self.stride and
                 other.direction == self.direction and
@@ -166,7 +171,9 @@ class StridedGmemAccess:
             direction = 'None'
         if variable == None:
             variable = 'ANY_VAR'
-        return hash(str(self.dtype)+str(self.stride)+direction+variable)
+        return hash(str(self.mtype)+str(self.dtype)+str(self.stride)
+                    +direction+variable)
+
 
 
 # {{{ ExpressionOpCounter
@@ -266,16 +273,18 @@ class ExpressionOpCounter(CombineMapper):
     def map_if(self, expr):
         warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
         warnings.warn("ExpressionOpCounter counting ops as "
                       "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_min(self, expr):
         return ToCountMap({Op(
-                           self.type_inf(expr), 'maxmin'): len(expr.children)-1}
+                          self.type_inf(expr), 'maxmin'): len(expr.children)-1}
                          ) + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
@@ -286,11 +295,13 @@ class ExpressionOpCounter(CombineMapper):
                                   "map_common_subexpression not implemented.")
 
     def map_substitution(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered substitution, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "substitution, "
                                   "map_substitution not implemented.")
 
     def map_derivative(self, expr):
-        raise NotImplementedError("ExpressionOpCounter encountered derivative, "
+        raise NotImplementedError("ExpressionOpCounter encountered "
+                                  "derivative, "
                                   "map_derivative not implemented.")
 
     def map_slice(self, expr):
@@ -330,7 +341,7 @@ class LocalSubscriptCounter(CombineMapper):
             #print("is local? ", array.is_local)
             if array.is_local:
                 return ToCountMap(
-                        {LmemAccess(self.type_inf(expr), direction=None): 1}
+                        {MemAccess('local', self.type_inf(expr)): 1}
                         ) + self.rec(expr.index)
 
         return self.rec(expr.index)
@@ -376,12 +387,14 @@ class LocalSubscriptCounter(CombineMapper):
     def map_if(self, expr):
         warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
                       "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
         warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
                       "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     map_min = map_bitwise_or
     map_max = map_min
@@ -465,9 +478,9 @@ class GlobalSubscriptCounter(CombineMapper):
 
         if not local_id_found:
             # count as uniform access
-            return ToCountMap(
-                    {StridedGmemAccess(self.type_inf(expr), 0, direction=None, variable=name): 1}
-                    ) + self.rec(expr.index)
+            return ToCountMap({MemAccess('global', self.type_inf(expr),
+                               stride=0, variable=name): 1}
+                             ) + self.rec(expr.index)
 
         # get local_id associated with minimum tag axis
         min_lid = None
@@ -504,16 +517,19 @@ class GlobalSubscriptCounter(CombineMapper):
                 continue
 
             total_stride = stride*coeff_min_lid*extra_stride
-            #TODO is there a case where this^ does not execute, or executes more than once for two different axes?
+            #TODO is there a case where this^ does not execute,
+            # or executes more than once for two different axes?
 
         #TODO temporary fix that needs changing:
         if min_tag_axis != 0:
-            print("...... min tag axis (%d) is not zero! ......" % (min_tag_axis))
-            return ToCountMap({StridedGmemAccess(self.type_inf(expr),
-                           sys.maxsize, direction=None, variable=name): 1}) + self.rec(expr.index)
+            print("... min tag axis (%d) is not zero! ..." % (min_tag_axis))
+            return ToCountMap({MemAccess('global', self.type_inf(expr),
+                               stride=sys.maxsize, variable=name): 1}
+                             ) + self.rec(expr.index)
 
-        return ToCountMap({StridedGmemAccess(self.type_inf(expr),
-                           total_stride, direction=None, variable=name): 1}) + self.rec(expr.index)
+        return ToCountMap({MemAccess('global', self.type_inf(expr),
+                           stride=total_stride, variable=name): 1}
+                         ) + self.rec(expr.index)
 
     def map_sum(self, expr):
         if expr.children:
@@ -556,12 +572,14 @@ class GlobalSubscriptCounter(CombineMapper):
     def map_if(self, expr):
         warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.condition) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
         warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
                       "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_)
+        return self.rec(expr.criterion) + self.rec(expr.then) \
+               + self.rec(expr.else_)
 
     map_min = map_bitwise_or
     map_max = map_min
@@ -696,7 +714,8 @@ def count(kernel, set):
 
             # {{{ rebuild check domain
 
-            zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space))
+            zero = isl.Aff.zero_on_domain(
+                        isl.LocalSpace.from_space(bset.space))
             iname = isl.PwAff.from_aff(
                     zero.set_coefficient_val(isl.dim_type.in_, i, 1))
             dmin_matched = dmin.insert_dims(
@@ -800,7 +819,8 @@ def get_op_poly(knl, numpy_types=True):
         # check domain size:
         insn_inames = knl.insn_inames(insn)
         inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(
+                                        insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
         op_poly = op_poly + ops*count(knl, domain)
     result = op_poly.dict
@@ -854,7 +874,7 @@ def get_lmem_access_poly(knl, numpy_types=True):
         # count subscripts, distinguishing loads and stores
         subs_expr = subscript_counter(insn.expression)
         for key in subs_expr.dict:
-            subs_expr.dict[LmemAccess(
+            subs_expr.dict[MemAccess('local', 
                            key.dtype, direction='load')
                           ] = subs_expr.dict.pop(key)
         subs_assignee = subscript_counter(insn.assignee)
@@ -864,7 +884,7 @@ def get_lmem_access_poly(knl, numpy_types=True):
         # for now, not counting stores in local mem
         '''
         for key in subs_assignee.dict:
-            subs_assignee.dict[LmemAccess(
+            subs_assignee.dict[MemAccess('local', 
                                key.dtype, direction='store')
                               ] = subs_assignee.dict.pop(key)
         '''
@@ -887,15 +907,15 @@ def get_lmem_access_poly(knl, numpy_types=True):
     result = subs_poly.dict
 
     if numpy_types:
-        result = dict(
-                (LmemAccess(mem_access.dtype.numpy_dtype, mem_access.direction), count)
-                for mem_access, count in six.iteritems(result))
+        result = dict((MemAccess('local', mem_access.dtype.numpy_dtype,
+                       direction=mem_access.direction), count)
+                       for mem_access, count in six.iteritems(result))
 
     return result
 
 
 # {{{ get_gmem_access_poly
-def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts
+def get_gmem_access_poly(knl, numpy_types=True):
 
     """Count the number of global memory accesses in a loopy kernel.
 
@@ -955,7 +975,8 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
         if uniform:
             from loopy.kernel.data import LocalIndexTag
             insn_inames = [iname for iname in insn_inames if not
-                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+                           isinstance(
+                           knl.iname_to_tag.get(iname), LocalIndexTag)]
         inames_domain = knl.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(
                                 insn_inames, [dim_type.set]))
@@ -970,14 +991,15 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
         # count subscripts, distinguishing loads and stores
         subs_expr = subscript_counter(insn.expression)
         for key in subs_expr.dict:
-            subs_expr.dict[StridedGmemAccess(
-                           key.dtype, key.stride, direction='load', variable=key.variable)
+            subs_expr.dict[MemAccess('global', key.dtype, stride=key.stride,
+                                     direction='load', variable=key.variable)
                           ] = subs_expr.dict.pop(key)
         subs_assignee = subscript_counter(insn.assignee)
         for key in subs_assignee.dict:
-            subs_assignee.dict[StridedGmemAccess(
-                           key.dtype, key.stride, direction='store', variable=key.variable)
-                          ] = subs_assignee.dict.pop(key)
+            subs_assignee.dict[MemAccess('global', key.dtype,
+                                         stride=key.stride, direction='store',
+                                         variable=key.variable)
+                              ] = subs_assignee.dict.pop(key)
 
         insn_inames = knl.insn_inames(insn)
 
@@ -985,31 +1007,36 @@ def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscr
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
             if isinstance(key.stride, int) and key.stride == 0:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+                subs_poly = subs_poly \
+                            + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
         for key in subs_assignee.dict:
             poly = ToCountMap({key: subs_assignee.dict[key]})
             if isinstance(key.stride, int) and key.stride == 0:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
+                subs_poly = subs_poly \
+                            + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
 
     result = subs_poly.dict
 
     if numpy_types:
-        result = dict(
-                (StridedGmemAccess(mem_access.dtype.numpy_dtype, mem_access.stride,
-                                   mem_access.direction, mem_access.variable), count)
-                for mem_access, count in six.iteritems(result))
+        result = dict((MemAccess('global', mem_access.dtype.numpy_dtype,
+                                 stride=mem_access.stride,
+                                 direction=mem_access.direction,
+                                 variable=mem_access.variable)
+                       , count)
+                      for mem_access, count in six.iteritems(result))
 
     return result
 
 
 def get_DRAM_access_poly(knl):
     from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead",
-            DeprecationWarning, stacklevel=2)
+    warn("get_DRAM_access_poly is deprecated. "
+         "Use get_gmem_access_poly instead",
+         DeprecationWarning, stacklevel=2)
     return get_gmem_access_poly(knl)
 
 # }}}
@@ -1100,8 +1127,8 @@ def get_synchronization_poly(knl):
                 iname_list.pop()
 
         elif isinstance(sched_item, Barrier):
-            result = result + ToCountMap(
-                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})
+            result = result + ToCountMap({"barrier_%s" % sched_item.kind:
+                                          get_count_poly(iname_list)})
 
         elif isinstance(sched_item, CallKernel):
             result = result + ToCountMap(
@@ -1151,7 +1178,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
         insn_inames = kernel.insn_inames(insn)
         inames_domain = kernel.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
+        domain = (inames_domain.project_out_except(insn_inames,
+                                                   [dim_type.set]))
 
         afg = AccessFootprintGatherer(kernel, domain,
                 ignore_uncountable=ignore_uncountable)
@@ -1193,7 +1221,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         kernel = preprocess_kernel(kernel)
 
     result = {}
-    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
+    fp = gather_access_footprints(kernel,
+                                  ignore_uncountable=ignore_uncountable)
 
     for key, var_fp in fp.items():
         vname, direction = key
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 4b4a344f4..54e3b69a8 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -33,8 +33,7 @@ from loopy.statistics import (
         get_gmem_access_poly,
         get_lmem_access_poly,
         get_synchronization_poly,
-        StridedGmemAccess,
-        LmemAccess,
+        MemAccess,
         Op)
 import loopy as lp
 import numpy as np
@@ -54,7 +53,8 @@ def test_op_counter_basic():
             name="basic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
     poly = lp.get_op_poly(knl)
     n = 512
     m = 256
@@ -130,7 +130,8 @@ def test_op_counter_specialops():
             name="specialops", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+                                  dict(a=np.float32, b=np.float32,
+                                       g=np.float64, h=np.float64))
     poly = lp.get_op_poly(knl)
     n = 512
     m = 256
@@ -235,21 +236,27 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
-               ].eval_with_dict(params)
-    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
-               ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32 += poly[MemAccess('global', np.dtype(np.float32),
+                          stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64 += poly[MemAccess('global', np.dtype(np.float64),
+                          stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
-               ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
-               ].eval_with_dict(params)
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
@@ -269,14 +276,17 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
-               ].eval_with_dict(params)
-    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32 += poly[MemAccess('global', np.dtype(np.float32),
+                          stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
-               ].eval_with_dict(params)
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
     assert f32 == n*l
 
 
@@ -297,15 +307,18 @@ def test_gmem_access_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='g')
-               ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
-               ].eval_with_dict(params)
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='load', variable='h')
+              ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
-               ].eval_with_dict(params)
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
     assert f64 == n*m
 
 
@@ -321,28 +334,34 @@ def test_gmem_access_counter_specialops():
             ],
             name="specialops", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
     poly = lp.get_gmem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a')
-               ].eval_with_dict(params)
-    f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b')
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    f32 += poly[MemAccess('global', np.dtype(np.float32),
+                          stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
-               ].eval_with_dict(params)
-    f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='load', variable='g')
+              ].eval_with_dict(params)
+    f64 += poly[MemAccess('global', np.dtype(np.float64),
+                          stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c')
-               ].eval_with_dict(params)
-    f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
-               ].eval_with_dict(params)
+    f32 = poly[MemAccess('global', np.dtype(np.float32),
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    f64 = poly[MemAccess('global', np.dtype(np.float64),
+                         stride=0, direction='store', variable='e')
+              ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
@@ -369,19 +388,25 @@ def test_gmem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='a')
-               ].eval_with_dict(params)
-    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='b')
+    i32 = poly[MemAccess('global', np.dtype(np.int32), 
+                         stride=0, direction='load', variable='a')
+              ].eval_with_dict(params)
+    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+                          stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='g')
+    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+                          stride=0, direction='load', variable='g')
                ].eval_with_dict(params)
-    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='h')
+    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+                          stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='c')
-               ].eval_with_dict(params)
-    i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='e')
+    i32 = poly[MemAccess('global', np.dtype(np.int32), 
+                         stride=0, direction='store', variable='c')
+              ].eval_with_dict(params)
+    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+                          stride=0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
 
@@ -409,27 +434,34 @@ def test_gmem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g')
-                      ].eval_with_dict(params)
-    f64uniform += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h')
-                      ].eval_with_dict(params)
-    f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='x')
+    f64uniform = poly[MemAccess('global', np.dtype(np.float64), 
+                                stride=0, direction='load', variable='g')
+                     ].eval_with_dict(params)
+    f64uniform += poly[MemAccess('global', np.dtype(np.float64), 
+                                 stride=0, direction='load', variable='h')
                       ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='a')
-                    ].eval_with_dict(params)
-    f32nonconsec += poly[
-                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='b')
-                    ].eval_with_dict(params)
+    f32uniform = poly[MemAccess('global', np.dtype(np.float32), 
+                                stride=0, direction='load', variable='x')
+                     ].eval_with_dict(params)
+    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m'), direction='load',
+                                  variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), 
+                                   stride=Variable('m'), direction='load',
+                                   variable='b')
+                        ].eval_with_dict(params)
     assert f64uniform == 2*n*m
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e')
-                      ].eval_with_dict(params)
-    f32nonconsec = poly[
-                    StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='store', variable='c')
-                    ].eval_with_dict(params)
+    f64uniform = poly[MemAccess('global', np.dtype(np.float64), 
+                                stride=0, direction='store', variable='e')
+                     ].eval_with_dict(params)
+    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m'), direction='store',
+                                  variable='c')
+                       ].eval_with_dict(params)
     assert f64uniform == n*m
     assert f32nonconsec == n*m*l
 
@@ -455,27 +487,33 @@ def test_gmem_access_counter_nonconsec():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float64), Variable('m'), direction='load', variable='g')
-                        ].eval_with_dict(params)
-    f64nonconsec += poly[StridedGmemAccess(
-                        np.dtype(np.float64), Variable('m'), direction='load', variable='h')
-                        ].eval_with_dict(params)
-    f32nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='a')
+    f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), 
+                                  stride=Variable('m'), direction='load',
+                                  variable='g')
+                       ].eval_with_dict(params)
+    f64nonconsec += poly[MemAccess('global', np.dtype(np.float64), 
+                                   stride=Variable('m'), direction='load',
+                                   variable='h')
                         ].eval_with_dict(params)
-    f32nonconsec += poly[StridedGmemAccess(
-                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='b')
+    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='load', variable='a')
+                       ].eval_with_dict(params)
+    f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), 
+                                   stride=Variable('m')*Variable('l'),
+                                   direction='load', variable='b')
                         ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float64), Variable('m'), direction='store', variable='e')
-                        ].eval_with_dict(params)
-    f32nonconsec = poly[StridedGmemAccess(
-                        np.dtype(np.float32), Variable('m')*Variable('l'), direction='store', variable='c')
-                        ].eval_with_dict(params)
+    f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), 
+                                  stride=Variable('m'), direction='store',
+                                  variable='e')
+                       ].eval_with_dict(params)
+    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+                                  stride=Variable('m')*Variable('l'),
+                                  direction='store', variable='c')
+                       ].eval_with_dict(params)
     assert f64nonconsec == n*m
     assert f32nonconsec == n*m*l
 
@@ -501,20 +539,26 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='g')
+    f64consec = poly[MemAccess('global', np.dtype(np.float64), 
+                        stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64consec += poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='h')
+    f64consec += poly[MemAccess('global', np.dtype(np.float64), 
+                        stride=1, direction='load', variable='h')
                      ].eval_with_dict(params)
-    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a')
+    f32consec = poly[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
-    f32consec += poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b')
+    f32consec += poly[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='store', variable='e')
+    f64consec = poly[MemAccess('global', np.dtype(np.float64), 
+                        stride=1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c')
+    f32consec = poly[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
     assert f32consec == n*m*l
@@ -532,8 +576,8 @@ def test_barrier_counter_nobarriers():
             ],
             name="basic", assumptions="n,m,l >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl,
-                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
+                                            g=np.float64, h=np.float64))
     sync_poly = lp.get_synchronization_poly(knl)
     n = 512
     m = 256
@@ -616,22 +660,27 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = lp.get_gmem_access_poly(knl)
 
-    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b')
+    f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='load', variable='b')
                             ].eval_with_dict(params)
-    f32coal += subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a')
+    f32coal += subscript_map[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='load', variable='a')
                             ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
-    f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c')
+    f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), 
+                        stride=1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
     assert f32coal == n*l
 
     local_subs_map = get_lmem_access_poly(knl)
 
-    local_subs_l = local_subs_map[LmemAccess(np.dtype(np.float32), direction='load')
-                                  ].eval_with_dict(params)
+    # TODO currently considering all local mem access stride-1
+    local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32),
+                                            direction='load')
+                                 ].eval_with_dict(params)
 
     assert local_subs_l == n*m*l*2
 
-- 
GitLab


From eeb027a66244e8c6908c6b4085d03fd301f94db6 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 16 Oct 2016 21:21:56 -0500
Subject: [PATCH 12/55] allowing for data types entered as np.float64 and
 converting to np.dtype(np.float64)

---
 loopy/statistics.py     |  35 +++++---------
 test/test_statistics.py | 102 ++++++++++++++++++++--------------------
 2 files changed, 64 insertions(+), 73 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index a08e36c50..df385aa6d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -111,8 +111,13 @@ def stringify_stats_mapping(m):
 class Op:
 
     def __init__(self, dtype, name):
-        self.dtype = dtype
         self.name = name
+        import numpy as np
+        if issubclass(type(dtype), type):
+            self.dtype = np.dtype(dtype)
+        else:
+            self.dtype = dtype
+        #TODO should this check be more robust?
 
     def __eq__(self, other):
         return isinstance(other, Op) and (
@@ -122,27 +127,7 @@ class Op:
     def __hash__(self):
         return hash(str(self.dtype)+self.name)
 
-'''
-class LmemAccess:
-
-    def __init__(self, dtype, direction=None):
-        self.dtype = dtype
-        self.direction = direction
-
-    def __eq__(self, other):
-        return isinstance(other, LmemAccess) and (
-                other.dtype == self.dtype and
-                other.direction == self.direction)
-
-    def __hash__(self):
-        direction = self.direction
-        if direction == None:
-            direction = 'None'
-        return hash(str(self.dtype)+direction)
-'''
 
-
-#class StridedGmemAccess:
 class MemAccess:
 
     #TODO "ANY_VAR" does not work yet
@@ -150,11 +135,17 @@ class MemAccess:
     def __init__(self, mtype, dtype, stride=1, direction=None,
                  variable='ANY_VAR'):
         self.mtype = mtype
-        self.dtype = dtype
         self.stride = stride
         self.direction = direction
         self.variable = variable
 
+        import numpy as np
+        if issubclass(type(dtype), type):
+            self.dtype = np.dtype(dtype)
+        else:
+            self.dtype = dtype
+        #TODO should this check be more robust?
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 other.mtype == self.mtype and
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 54e3b69a8..4a83092cd 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -60,9 +60,9 @@ def test_op_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params)
+    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = poly[Op(np.float32, 'div')].eval_with_dict(params)
     f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
     i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
@@ -85,7 +85,7 @@ def test_op_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
     f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
@@ -107,8 +107,8 @@ def test_op_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
+    f64add = poly[Op(np.float64, 'add')].eval_with_dict(params)
     f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
     i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
@@ -137,10 +137,10 @@ def test_op_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params)
-    f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
-    f64pow = poly[Op(np.dtype(np.float64), 'pow')].eval_with_dict(params)
+    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = poly[Op(np.float32, 'div')].eval_with_dict(params)
+    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
+    f64pow = poly[Op(np.float64, 'pow')].eval_with_dict(params)
     f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
     i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
@@ -173,8 +173,8 @@ def test_op_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    i32bw = poly[Op(np.dtype(np.int32), 'bw')].eval_with_dict(params)
+    i32add = poly[Op(np.int32, 'add')].eval_with_dict(params)
+    i32bw = poly[Op(np.int32, 'bw')].eval_with_dict(params)
     i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
     i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
     i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
@@ -207,7 +207,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[Op(np.dtype(np.float64), 'mul')]
+    poly = lp.get_op_poly(knl)[Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -236,16 +236,16 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.dtype(np.float32),
+    f32 += poly[MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 += poly[MemAccess('global', np.dtype(np.float64),
+    f64 += poly[MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 3*n*m*l
@@ -276,10 +276,10 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.dtype(np.float32),
+    f32 += poly[MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
@@ -307,16 +307,16 @@ def test_gmem_access_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[MemAccess('global', np.float32,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[MemAccess('global', np.float64,
                          stride=0, direction='load', variable='h')
               ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f64 == n*m
@@ -341,10 +341,10 @@ def test_gmem_access_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.dtype(np.float32),
+    f32 += poly[MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
     f64 = poly[MemAccess('global', np.dtype(np.float64),
@@ -356,10 +356,10 @@ def test_gmem_access_counter_specialops():
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f32 == n*m*l
@@ -388,13 +388,13 @@ def test_gmem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[MemAccess('global', np.dtype(np.int32), 
+    i32 = poly[MemAccess('global', np.int32, 
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+    i32 += poly[MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+    i32 += poly[MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='g')
                ].eval_with_dict(params)
     i32 += poly[MemAccess('global', np.dtype(np.int32), 
@@ -402,10 +402,10 @@ def test_gmem_access_counter_bitwise():
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[MemAccess('global', np.dtype(np.int32), 
+    i32 = poly[MemAccess('global', np.int32, 
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+    i32 += poly[MemAccess('global', np.int32, 
                           stride=0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
@@ -434,13 +434,13 @@ def test_gmem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[MemAccess('global', np.dtype(np.float64), 
+    f64uniform = poly[MemAccess('global', np.float64, 
                                 stride=0, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64uniform += poly[MemAccess('global', np.dtype(np.float64), 
+    f64uniform += poly[MemAccess('global', np.float64, 
                                  stride=0, direction='load', variable='h')
                       ].eval_with_dict(params)
-    f32uniform = poly[MemAccess('global', np.dtype(np.float32), 
+    f32uniform = poly[MemAccess('global', np.float32, 
                                 stride=0, direction='load', variable='x')
                      ].eval_with_dict(params)
     f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
@@ -455,10 +455,10 @@ def test_gmem_access_counter_mixed():
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[MemAccess('global', np.dtype(np.float64), 
+    f64uniform = poly[MemAccess('global', np.float64, 
                                 stride=0, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = poly[MemAccess('global', np.float32, 
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                        ].eval_with_dict(params)
@@ -487,11 +487,11 @@ def test_gmem_access_counter_nonconsec():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), 
+    f64nonconsec = poly[MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='load',
                                   variable='g')
                        ].eval_with_dict(params)
-    f64nonconsec += poly[MemAccess('global', np.dtype(np.float64), 
+    f64nonconsec += poly[MemAccess('global', np.float64, 
                                    stride=Variable('m'), direction='load',
                                    variable='h')
                         ].eval_with_dict(params)
@@ -506,11 +506,11 @@ def test_gmem_access_counter_nonconsec():
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), 
+    f64nonconsec = poly[MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='store',
                                   variable='e')
                        ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = poly[MemAccess('global', np.float32, 
                                   stride=Variable('m')*Variable('l'),
                                   direction='store', variable='c')
                        ].eval_with_dict(params)
@@ -539,13 +539,13 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[MemAccess('global', np.dtype(np.float64), 
+    f64consec = poly[MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64consec += poly[MemAccess('global', np.dtype(np.float64), 
+    f64consec += poly[MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='h')
                      ].eval_with_dict(params)
-    f32consec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32consec = poly[MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
     f32consec += poly[MemAccess('global', np.dtype(np.float32), 
@@ -554,10 +554,10 @@ def test_gmem_access_counter_consec():
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[MemAccess('global', np.dtype(np.float64), 
+    f64consec = poly[MemAccess('global', np.float64, 
                         stride=1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32consec = poly[MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
@@ -644,13 +644,13 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_poly(knl)
     f32mul = op_map[
-                        Op(np.dtype(np.float32), 'mul')
+                        Op(np.float32, 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        Op(np.dtype(np.float32), 'add')
+                        Op(np.float32, 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        Op(np.dtype(np.int32), 'add')
+                        Op(np.int32, 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
                         Op(np.dtype(np.int32), 'mul')
@@ -660,16 +660,16 @@ def test_all_counters_parallel_matmul():
 
     subscript_map = lp.get_gmem_access_poly(knl)
 
-    f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), 
+    f32coal = subscript_map[MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
                             ].eval_with_dict(params)
-    f32coal += subscript_map[MemAccess('global', np.dtype(np.float32), 
+    f32coal += subscript_map[MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                             ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
-    f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), 
+    f32coal = subscript_map[MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
-- 
GitLab


From 2db20d6763bcfa2d400e08283accd53f2a76caaa Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Mon, 17 Oct 2016 20:27:39 -0500
Subject: [PATCH 13/55] combined get_lmem_access_poly and get_gmem_access_poly
 into get_mem_access_poly

---
 loopy/__init__.py       |   4 +-
 loopy/statistics.py     | 145 ++++++++++++++--------------------------
 test/test_statistics.py |  23 +++----
 3 files changed, 65 insertions(+), 107 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 897567444..4cfa23fa4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -112,7 +112,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
-        get_gmem_access_poly,
+        get_mem_access_poly, get_gmem_access_poly,
         get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
         sum_mem_access_to_bytes,
         gather_access_footprints, gather_access_footprint_bytes)
@@ -218,7 +218,7 @@ __all__ = [
         "generate_code", "generate_code_v2", "generate_body",
 
         "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
-        "get_DRAM_access_poly",
+        "get_mem_access_poly", "get_DRAM_access_poly",
         "get_synchronization_poly", "stringify_stats_mapping",
         "sum_mem_access_to_bytes",
         "gather_access_footprints", "gather_access_footprint_bytes",
diff --git a/loopy/statistics.py b/loopy/statistics.py
index df385aa6d..24ba905a5 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -837,77 +837,19 @@ def sum_ops_to_dtypes(op_poly_dict):
     return result
 
 
-def get_lmem_access_poly(knl, numpy_types=True):
-
+def get_lmem_access_poly(knl):
     """Count the number of local memory accesses in a loopy kernel.
     """
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-
-    class CacheHolder(object):
-        pass
-
-    cache_holder = CacheHolder()
-
-    @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_inames):
-        inames_domain = knl.get_inames_domain(insn_inames)
-        domain = (inames_domain.project_out_except(
-                                insn_inames, [dim_type.set]))
-        return count(knl, domain)
-
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-
-    subs_poly = ToCountMap()
-    subscript_counter = LocalSubscriptCounter(knl)
-    for insn in knl.instructions:
-        # count subscripts, distinguishing loads and stores
-        subs_expr = subscript_counter(insn.expression)
-        for key in subs_expr.dict:
-            subs_expr.dict[MemAccess('local', 
-                           key.dtype, direction='load')
-                          ] = subs_expr.dict.pop(key)
-        subs_assignee = subscript_counter(insn.assignee)
-        #for key in subs_assignee.dict:
-        #    print(key.dtype, key.direction, subs_assignee.dict[key])
-
-        # for now, not counting stores in local mem
-        '''
-        for key in subs_assignee.dict:
-            subs_assignee.dict[MemAccess('local', 
-                               key.dtype, direction='store')
-                              ] = subs_assignee.dict.pop(key)
-        '''
-
-        insn_inames = knl.insn_inames(insn)
-
-        # use count excluding local index tags for uniform accesses
-        for key in subs_expr.dict:
-            poly = ToCountMap({key: subs_expr.dict[key]})
-            subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-
-        # for now, not counting stores in local mem
-        '''
-        for key in subs_assignee.dict:
-            poly = ToCountMap({key: subs_assignee.dict[key]})
-            subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-        '''
-
-    #return subs_poly.dict
-    result = subs_poly.dict
-
-    if numpy_types:
-        result = dict((MemAccess('local', mem_access.dtype.numpy_dtype,
-                       direction=mem_access.direction), count)
-                       for mem_access, count in six.iteritems(result))
-
-    return result
+    from warnings import warn
+    warn("get_lmem_access_poly is deprecated. "
+         "Use get_mem_access_poly with local option instead",
+         DeprecationWarning, stacklevel=2)
+    return get_mem_access_poly(knl, 'local')
 
 
 # {{{ get_gmem_access_poly
-def get_gmem_access_poly(knl, numpy_types=True):
 
+def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -953,7 +895,23 @@ def get_gmem_access_poly(knl, numpy_types=True):
         # (now use these counts to predict performance)
 
     """
+    from warnings import warn
+    warn("get_gmem_access_poly is deprecated. "
+         "Use get_mem_access_poly with global option instead",
+         DeprecationWarning, stacklevel=2)
+    return get_mem_access_poly(knl, 'global')
+
 
+def get_DRAM_access_poly(knl):
+    from warnings import warn
+    warn("get_DRAM_access_poly is deprecated. "
+         "Use get_mem_access_poly with global option instead",
+         DeprecationWarning, stacklevel=2)
+    return get_mem_access_poly(knl, 'global')
+
+# }}}
+
+def get_mem_access_poly(knl, mtype, numpy_types=True):
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
     class CacheHolder(object):
@@ -977,43 +935,55 @@ def get_gmem_access_poly(knl, numpy_types=True):
     knl = preprocess_kernel(knl)
 
     subs_poly = ToCountMap()
-    subscript_counter = GlobalSubscriptCounter(knl)
+    if mtype == 'global':
+        subscript_counter = GlobalSubscriptCounter(knl)
+    elif mtype == 'local':
+        subscript_counter = LocalSubscriptCounter(knl)
+    else:
+        raise ValueError("get_mem_access_poly: mtype must be "
+                         "'local' or 'global', received {0}"
+                         .format(mtype))
+
     for insn in knl.instructions:
         # count subscripts, distinguishing loads and stores
         subs_expr = subscript_counter(insn.expression)
         for key in subs_expr.dict:
-            subs_expr.dict[MemAccess('global', key.dtype, stride=key.stride,
+            subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride,
                                      direction='load', variable=key.variable)
                           ] = subs_expr.dict.pop(key)
-        subs_assignee = subscript_counter(insn.assignee)
-        for key in subs_assignee.dict:
-            subs_assignee.dict[MemAccess('global', key.dtype,
-                                         stride=key.stride, direction='store',
-                                         variable=key.variable)
-                              ] = subs_assignee.dict.pop(key)
+
+        if mtype == 'global':  # for now, don't count writes to local mem
+            subs_assignee = subscript_counter(insn.assignee)
+            for key in subs_assignee.dict:
+                subs_assignee.dict[MemAccess(key.mtype, key.dtype,
+                                             stride=key.stride, direction='store',
+                                             variable=key.variable)
+                                  ] = subs_assignee.dict.pop(key)
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
-            if isinstance(key.stride, int) and key.stride == 0:
-                subs_poly = subs_poly \
-                            + poly*get_insn_count(knl, insn_inames, True)
-            else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
-        for key in subs_assignee.dict:
-            poly = ToCountMap({key: subs_assignee.dict[key]})
-            if isinstance(key.stride, int) and key.stride == 0:
+            if mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
                 subs_poly = subs_poly \
                             + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
 
+        if mtype == 'global':  # for now, don't count writes to local mem
+            for key in subs_assignee.dict:
+                poly = ToCountMap({key: subs_assignee.dict[key]})
+                if isinstance(key.stride, int) and key.stride == 0:
+                    subs_poly = subs_poly \
+                                + poly*get_insn_count(knl, insn_inames, True)
+                else:
+                    subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+
     result = subs_poly.dict
 
     if numpy_types:
-        result = dict((MemAccess('global', mem_access.dtype.numpy_dtype,
+        result = dict((MemAccess(mem_access.mtype, mem_access.dtype.numpy_dtype,
                                  stride=mem_access.stride,
                                  direction=mem_access.direction,
                                  variable=mem_access.variable)
@@ -1022,17 +992,6 @@ def get_gmem_access_poly(knl, numpy_types=True):
 
     return result
 
-
-def get_DRAM_access_poly(knl):
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. "
-         "Use get_gmem_access_poly instead",
-         DeprecationWarning, stacklevel=2)
-    return get_gmem_access_poly(knl)
-
-# }}}
-
-
 # {{{ sum_mem_access_to_bytes
 
 def sum_mem_access_to_bytes(m):
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 4a83092cd..504f15403 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -30,8 +30,7 @@ from pyopencl.tools import (  # noqa
 #TODO why is this import required?
 from loopy.statistics import (
         get_op_poly,
-        get_gmem_access_poly,
-        get_lmem_access_poly,
+        get_mem_access_poly,
         get_synchronization_poly,
         MemAccess,
         Op)
@@ -231,7 +230,7 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -271,7 +270,7 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -302,7 +301,7 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -336,7 +335,7 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -383,7 +382,7 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -429,7 +428,7 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_mem_access_poly(knl, 'global')  # noqa
     n = 512
     m = 256
     l = 128
@@ -482,7 +481,7 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = lp.get_gmem_access_poly(knl)  # noqa
+    poly = lp.get_mem_access_poly(knl, 'global')  # noqa
     n = 512
     m = 256
     l = 128
@@ -533,7 +532,7 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = lp.get_gmem_access_poly(knl)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
@@ -658,7 +657,7 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    subscript_map = lp.get_gmem_access_poly(knl)
+    subscript_map = lp.get_mem_access_poly(knl, 'global')
 
     f32coal = subscript_map[MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
@@ -675,7 +674,7 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_subs_map = get_lmem_access_poly(knl)
+    local_subs_map = get_mem_access_poly(knl, 'local')
 
     # TODO currently considering all local mem access stride-1
     local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32),
-- 
GitLab


From b72b7ab3c2e421b67cc61a3467148149d7b263a8 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 22 Oct 2016 21:30:31 -0500
Subject: [PATCH 14/55] added new stats functions to __init__ file so no need
 for import, updated test

---
 loopy/__init__.py       |  16 ++--
 loopy/statistics.py     |  19 ++---
 test/test_statistics.py | 163 +++++++++++++++++++---------------------
 3 files changed, 93 insertions(+), 105 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 63268a214..a644fdf53 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -112,10 +112,10 @@ from loopy.transform.parameter import assume, fix_parameters
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
-        get_mem_access_poly, get_gmem_access_poly,
-        get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping,
-        sum_mem_access_to_bytes,
+from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
+        get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
+        get_gmem_access_poly, get_DRAM_access_poly, get_mem_access_poly,
+        sum_mem_access_to_bytes, get_synchronization_poly,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
@@ -219,10 +219,10 @@ __all__ = [
         "PreambleInfo",
         "generate_code", "generate_code_v2", "generate_body",
 
-        "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
-        "get_mem_access_poly", "get_DRAM_access_poly",
-        "get_synchronization_poly", "stringify_stats_mapping",
-        "sum_mem_access_to_bytes",
+        "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
+        "sum_ops_to_dtypes", "get_lmem_access_poly", "get_gmem_access_poly",
+        "get_DRAM_access_poly", "get_mem_access_poly",
+        "sum_mem_access_to_bytes", "get_synchronization_poly",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 24ba905a5..c349b34f4 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 import six
 
 import loopy as lp
+import numpy as np
 import warnings
 from islpy import dim_type
 import islpy as isl
@@ -112,12 +113,8 @@ class Op:
 
     def __init__(self, dtype, name):
         self.name = name
-        import numpy as np
-        if issubclass(type(dtype), type):
-            self.dtype = np.dtype(dtype)
-        else:
-            self.dtype = dtype
-        #TODO should this check be more robust?
+        from loopy.types import to_loopy_type
+        self.dtype = to_loopy_type(dtype)
 
     def __eq__(self, other):
         return isinstance(other, Op) and (
@@ -139,12 +136,8 @@ class MemAccess:
         self.direction = direction
         self.variable = variable
 
-        import numpy as np
-        if issubclass(type(dtype), type):
-            self.dtype = np.dtype(dtype)
-        else:
-            self.dtype = dtype
-        #TODO should this check be more robust?
+        from loopy.types import to_loopy_type
+        self.dtype = to_loopy_type(dtype)
 
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
@@ -827,6 +820,7 @@ def get_op_poly(knl, numpy_types=True):
 
 def sum_ops_to_dtypes(op_poly_dict):
     result = {}
+    #TODO fix this
     for (dtype, kind), v in op_poly_dict.items():
         new_key = dtype
         if new_key in result:
@@ -1004,6 +998,7 @@ def sum_mem_access_to_bytes(m):
     """
 
     result = {}
+    #TODO fix this and test
     for (dtype, kind, direction), v in m.items():
         new_key = (kind, direction)
         bytes_transferred = int(dtype.itemsize) * v
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 504f15403..a7f061c2a 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -27,13 +27,6 @@ import sys
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
-#TODO why is this import required?
-from loopy.statistics import (
-        get_op_poly,
-        get_mem_access_poly,
-        get_synchronization_poly,
-        MemAccess,
-        Op)
 import loopy as lp
 import numpy as np
 
@@ -59,11 +52,11 @@ def test_op_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
-    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
-    f32div = poly[Op(np.float32, 'div')].eval_with_dict(params)
-    f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f64mul = poly[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
@@ -84,8 +77,8 @@ def test_op_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
-    f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
 
@@ -106,10 +99,10 @@ def test_op_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
-    f64add = poly[Op(np.float64, 'add')].eval_with_dict(params)
-    f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f64add = poly[lp.Op(np.float64, 'add')].eval_with_dict(params)
+    f64div = poly[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -136,14 +129,14 @@ def test_op_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params)
-    f32div = poly[Op(np.float32, 'div')].eval_with_dict(params)
-    f32add = poly[Op(np.float32, 'add')].eval_with_dict(params)
-    f64pow = poly[Op(np.float64, 'pow')].eval_with_dict(params)
-    f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f64pow = poly[lp.Op(np.float64, 'pow')].eval_with_dict(params)
+    f64add = poly[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = poly[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = poly[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
@@ -172,12 +165,12 @@ def test_op_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[Op(np.int32, 'add')].eval_with_dict(params)
-    i32bw = poly[Op(np.int32, 'bw')].eval_with_dict(params)
-    i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = poly[lp.Op(np.int32, 'add')].eval_with_dict(params)
+    i32bw = poly[lp.Op(np.int32, 'bw')].eval_with_dict(params)
+    i64bw = poly[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = poly[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = poly[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = poly[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -206,7 +199,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[Op(np.float64, 'mul')]
+    poly = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = poly.eval_with_dict(value_dict)
 
@@ -235,25 +228,25 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.float32,
+    f32 = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.float32,
+    f32 += poly[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.float64,
+    f64 = poly[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 += poly[MemAccess('global', np.float64,
+    f64 += poly[lp.MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 3*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f32 == n*m*l
@@ -275,15 +268,15 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.float32,
+    f32 = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.float32,
+    f32 += poly[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
 
-    f32 = poly[MemAccess('global', np.dtype(np.float32),
+    f32 = poly[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
     assert f32 == n*l
@@ -306,16 +299,16 @@ def test_gmem_access_counter_logic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.float32,
+    f32 = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.float64,
+    f64 = poly[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='h')
               ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
-    f64 = poly[MemAccess('global', np.float64,
+    f64 = poly[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f64 == n*m
@@ -340,25 +333,25 @@ def test_gmem_access_counter_specialops():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[MemAccess('global', np.float32,
+    f32 = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[MemAccess('global', np.float32,
+    f32 += poly[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.dtype(np.float64),
+    f64 = poly[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 += poly[MemAccess('global', np.dtype(np.float64),
+    f64 += poly[lp.MemAccess('global', np.dtype(np.float64),
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[MemAccess('global', np.float32,
+    f32 = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64 = poly[MemAccess('global', np.float64,
+    f64 = poly[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f32 == n*m*l
@@ -387,24 +380,24 @@ def test_gmem_access_counter_bitwise():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[MemAccess('global', np.int32, 
+    i32 = poly[lp.MemAccess('global', np.int32, 
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.int32, 
+    i32 += poly[lp.MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.int32, 
+    i32 += poly[lp.MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='g')
                ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.dtype(np.int32), 
+    i32 += poly[lp.MemAccess('global', np.dtype(np.int32), 
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[MemAccess('global', np.int32, 
+    i32 = poly[lp.MemAccess('global', np.int32, 
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    i32 += poly[MemAccess('global', np.int32, 
+    i32 += poly[lp.MemAccess('global', np.int32, 
                           stride=0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
@@ -433,20 +426,20 @@ def test_gmem_access_counter_mixed():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[MemAccess('global', np.float64, 
+    f64uniform = poly[lp.MemAccess('global', np.float64, 
                                 stride=0, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64uniform += poly[MemAccess('global', np.float64, 
+    f64uniform += poly[lp.MemAccess('global', np.float64, 
                                  stride=0, direction='load', variable='h')
                       ].eval_with_dict(params)
-    f32uniform = poly[MemAccess('global', np.float32, 
+    f32uniform = poly[lp.MemAccess('global', np.float32, 
                                 stride=0, direction='load', variable='x')
                      ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), 
                                   stride=Variable('m'), direction='load',
                                   variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), 
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                         ].eval_with_dict(params)
@@ -454,10 +447,10 @@ def test_gmem_access_counter_mixed():
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[MemAccess('global', np.float64, 
+    f64uniform = poly[lp.MemAccess('global', np.float64, 
                                 stride=0, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.float32, 
+    f32nonconsec = poly[lp.MemAccess('global', np.float32, 
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                        ].eval_with_dict(params)
@@ -486,30 +479,30 @@ def test_gmem_access_counter_nonconsec():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[MemAccess('global', np.float64, 
+    f64nonconsec = poly[lp.MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='load',
                                   variable='g')
                        ].eval_with_dict(params)
-    f64nonconsec += poly[MemAccess('global', np.float64, 
+    f64nonconsec += poly[lp.MemAccess('global', np.float64, 
                                    stride=Variable('m'), direction='load',
                                    variable='h')
                         ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), 
                                   stride=Variable('m')*Variable('l'),
                                   direction='load', variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), 
                                    stride=Variable('m')*Variable('l'),
                                    direction='load', variable='b')
                         ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[MemAccess('global', np.float64, 
+    f64nonconsec = poly[lp.MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='store',
                                   variable='e')
                        ].eval_with_dict(params)
-    f32nonconsec = poly[MemAccess('global', np.float32, 
+    f32nonconsec = poly[lp.MemAccess('global', np.float32, 
                                   stride=Variable('m')*Variable('l'),
                                   direction='store', variable='c')
                        ].eval_with_dict(params)
@@ -538,25 +531,25 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    f64consec = poly[MemAccess('global', np.float64, 
+    f64consec = poly[lp.MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64consec += poly[MemAccess('global', np.float64, 
+    f64consec += poly[lp.MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='h')
                      ].eval_with_dict(params)
-    f32consec = poly[MemAccess('global', np.float32, 
+    f32consec = poly[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
-    f32consec += poly[MemAccess('global', np.dtype(np.float32), 
+    f32consec += poly[lp.MemAccess('global', np.dtype(np.float32), 
                         stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[MemAccess('global', np.float64, 
+    f64consec = poly[lp.MemAccess('global', np.float64, 
                         stride=1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = poly[MemAccess('global', np.float32, 
+    f32consec = poly[lp.MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
@@ -643,41 +636,41 @@ def test_all_counters_parallel_matmul():
 
     op_map = lp.get_op_poly(knl)
     f32mul = op_map[
-                        Op(np.float32, 'mul')
+                        lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
     f32add = op_map[
-                        Op(np.float32, 'add')
+                        lp.Op(np.float32, 'add')
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        Op(np.int32, 'add')
+                        lp.Op(np.int32, 'add')
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        Op(np.dtype(np.int32), 'mul')
+                        lp.Op(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
     assert f32mul+f32add == n*m*l*2
 
     subscript_map = lp.get_mem_access_poly(knl, 'global')
 
-    f32coal = subscript_map[MemAccess('global', np.float32, 
+    f32coal = subscript_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
                             ].eval_with_dict(params)
-    f32coal += subscript_map[MemAccess('global', np.float32, 
+    f32coal += subscript_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                             ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
-    f32coal = subscript_map[MemAccess('global', np.float32, 
+    f32coal = subscript_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
     assert f32coal == n*l
 
-    local_subs_map = get_mem_access_poly(knl, 'local')
+    local_subs_map = lp.get_mem_access_poly(knl, 'local')
 
     # TODO currently considering all local mem access stride-1
-    local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32),
+    local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                  ].eval_with_dict(params)
 
-- 
GitLab


From 1f079851255bf9b555d6d3446d1c4f015c69e1ff Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 23 Oct 2016 22:32:19 -0500
Subject: [PATCH 15/55] updated old versions of sum_xxx for new dict keys

---
 loopy/statistics.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index c349b34f4..7702b8995 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -820,9 +820,9 @@ def get_op_poly(knl, numpy_types=True):
 
 def sum_ops_to_dtypes(op_poly_dict):
     result = {}
-    #TODO fix this
-    for (dtype, kind), v in op_poly_dict.items():
-        new_key = dtype
+    #TODO test this
+    for op, v in op_poly_dict.items():
+        new_key = op.dtype
         if new_key in result:
             result[new_key] += v
         else:
@@ -998,10 +998,10 @@ def sum_mem_access_to_bytes(m):
     """
 
     result = {}
-    #TODO fix this and test
-    for (dtype, kind, direction), v in m.items():
-        new_key = (kind, direction)
-        bytes_transferred = int(dtype.itemsize) * v
+    #TODO test this
+    for mem_access, v in m.items():
+        new_key = (mem_access.stride, mem_access.direction)
+        bytes_transferred = int(mem_access.dtype.itemsize) * v
         if new_key in result:
             result[new_key] += bytes_transferred
         else:
-- 
GitLab


From d6b72e3b1df925af50642f74d7daab790fbc9028 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Mon, 24 Oct 2016 00:50:46 -0500
Subject: [PATCH 16/55] added tests for existing sum_xxx functions

---
 test/test_statistics.py | 55 ++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/test/test_statistics.py b/test/test_statistics.py
index a7f061c2a..eaac2081e 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -28,6 +28,7 @@ from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl
         as pytest_generate_tests)
 import loopy as lp
+from loopy.types import to_loopy_type
 import numpy as np
 
 from pymbolic.primitives import Variable
@@ -61,6 +62,14 @@ def test_op_counter_basic():
     assert f64mul == n*m
     assert i32add == n*m*2
 
+    poly_dtype = lp.sum_ops_to_dtypes(poly)
+    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
+    f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
+    i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
+    assert f32 == f32add + f32mul + f32div
+    assert f64 == f64mul
+    assert i32 == i32add
+    
 
 def test_op_counter_reduction():
 
@@ -81,6 +90,10 @@ def test_op_counter_reduction():
     f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
+    poly_dtype = lp.sum_ops_to_dtypes(poly)
+    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
+    assert f32 == f32add + f32mul
+
 
 def test_op_counter_logic():
 
@@ -228,29 +241,35 @@ def test_gmem_access_counter_basic():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[lp.MemAccess('global', np.float32,
+    f32l = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[lp.MemAccess('global', np.float32,
+    f32l += poly[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[lp.MemAccess('global', np.float64,
+    f64l = poly[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 += poly[lp.MemAccess('global', np.float64,
+    f64l += poly[lp.MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
-    assert f32 == 3*n*m*l
-    assert f64 == 2*n*m
+    assert f32l == 3*n*m*l
+    assert f64l == 2*n*m
 
-    f32 = poly[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = poly[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64 = poly[lp.MemAccess('global', np.dtype(np.float64),
+    f64s = poly[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
-    assert f32 == n*m*l
-    assert f64 == n*m
+    assert f32s == n*m*l
+    assert f64s == n*m
+
+    poly_b = lp.sum_mem_access_to_bytes(poly)
+    s0load = poly_b[(0, 'load')].eval_with_dict(params)
+    s0store = poly_b[(0, 'store')].eval_with_dict(params)
+    assert s0load == 4*f32l + 8*f64l
+    assert s0store == 4*f32s + 8*f64s
 
 
 def test_gmem_access_counter_reduction():
@@ -268,18 +287,24 @@ def test_gmem_access_counter_reduction():
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[lp.MemAccess('global', np.float32,
+    f32l = poly[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[lp.MemAccess('global', np.float32,
+    f32l += poly[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    assert f32 == 2*n*m*l
+    assert f32l == 2*n*m*l
 
-    f32 = poly[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = poly[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    assert f32 == n*l
+    assert f32s == n*l
+
+    poly_b = lp.sum_mem_access_to_bytes(poly)
+    s0load = poly_b[(0, 'load')].eval_with_dict(params)
+    s0store = poly_b[(0, 'store')].eval_with_dict(params)
+    assert s0load == 4*f32l
+    assert s0store == 4*f32s
 
 
 def test_gmem_access_counter_logic():
-- 
GitLab


From 8fc4268b4cf0699fc10a36b1d060fdf0b61e3061 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Mon, 24 Oct 2016 00:55:51 -0500
Subject: [PATCH 17/55] added mtype to keys in sum_mem_access_to_bytes

---
 loopy/statistics.py     | 4 +---
 test/test_statistics.py | 8 ++++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 7702b8995..f3346572e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -820,7 +820,6 @@ def get_op_poly(knl, numpy_types=True):
 
 def sum_ops_to_dtypes(op_poly_dict):
     result = {}
-    #TODO test this
     for op, v in op_poly_dict.items():
         new_key = op.dtype
         if new_key in result:
@@ -998,9 +997,8 @@ def sum_mem_access_to_bytes(m):
     """
 
     result = {}
-    #TODO test this
     for mem_access, v in m.items():
-        new_key = (mem_access.stride, mem_access.direction)
+        new_key = (mem_access.mtype, mem_access.stride, mem_access.direction)
         bytes_transferred = int(mem_access.dtype.itemsize) * v
         if new_key in result:
             result[new_key] += bytes_transferred
diff --git a/test/test_statistics.py b/test/test_statistics.py
index eaac2081e..feda05125 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -266,8 +266,8 @@ def test_gmem_access_counter_basic():
     assert f64s == n*m
 
     poly_b = lp.sum_mem_access_to_bytes(poly)
-    s0load = poly_b[(0, 'load')].eval_with_dict(params)
-    s0store = poly_b[(0, 'store')].eval_with_dict(params)
+    s0load = poly_b[('global', 0, 'load')].eval_with_dict(params)
+    s0store = poly_b[('global', 0, 'store')].eval_with_dict(params)
     assert s0load == 4*f32l + 8*f64l
     assert s0store == 4*f32s + 8*f64s
 
@@ -301,8 +301,8 @@ def test_gmem_access_counter_reduction():
     assert f32s == n*l
 
     poly_b = lp.sum_mem_access_to_bytes(poly)
-    s0load = poly_b[(0, 'load')].eval_with_dict(params)
-    s0store = poly_b[(0, 'store')].eval_with_dict(params)
+    s0load = poly_b[('global', 0, 'load')].eval_with_dict(params)
+    s0store = poly_b[('global', 0, 'store')].eval_with_dict(params)
     assert s0load == 4*f32l
     assert s0store == 4*f32s
 
-- 
GitLab


From 4552b861c2b387191efb710f74a498c48f490c2d Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 25 Oct 2016 20:54:46 -0500
Subject: [PATCH 18/55] updated old doc strings and added new doc strings

---
 loopy/__init__.py   |   6 +-
 loopy/statistics.py | 234 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 178 insertions(+), 62 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index a644fdf53..340aec051 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -114,7 +114,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
-        get_gmem_access_poly, get_DRAM_access_poly, get_mem_access_poly,
+        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
         sum_mem_access_to_bytes, get_synchronization_poly,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -220,8 +220,8 @@ __all__ = [
         "generate_code", "generate_code_v2", "generate_body",
 
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
-        "sum_ops_to_dtypes", "get_lmem_access_poly", "get_gmem_access_poly",
-        "get_DRAM_access_poly", "get_mem_access_poly",
+        "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
+        "get_gmem_access_poly", "get_mem_access_poly",
         "sum_mem_access_to_bytes", "get_synchronization_poly",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index f3346572e..fd9863eb2 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -42,7 +42,10 @@ __doc__ = """
 
 .. autofunction:: get_op_poly
 
+.. autofunction:: get_lmem_access_poly
+.. autofunction:: get_DRAM_access_poly
 .. autofunction:: get_gmem_access_poly
+.. autofunction:: get_mem_access_poly
 
 .. autofunction:: sum_mem_access_to_bytes
 
@@ -110,6 +113,19 @@ def stringify_stats_mapping(m):
 
 
 class Op:
+    """An arithmetic operation
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type operated on.
+
+    .. attribute:: name
+
+       A :class:`string` that specifies the kind of arithmetic operation as
+       *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+
+    """
 
     def __init__(self, dtype, name):
         self.name = name
@@ -126,6 +142,34 @@ class Op:
 
 
 class MemAccess:
+    """A memory access
+
+    .. attribute:: mtype
+
+       A :class:`string` that specifies the memory type accessed as **global**
+       or **local**
+
+    .. attribute:: dtype
+
+       A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+       data type accessed.
+
+    .. attribute:: stride
+
+       A :class:`int` specifies stride of the memory access. A stride of 0
+       indicates a uniform access (i.e. all threads access the same item).
+
+    .. attribute:: direction
+
+       A :class:`string` that specifies the direction of memory access as
+       **load** or **store**.
+
+    .. attribute:: variable
+
+       A :class:`string` that specifies the variable name of the data
+       accessed.
+
+    """
 
     #TODO "ANY_VAR" does not work yet
     #TODO currently counting all lmem access as stride-1
@@ -764,16 +808,14 @@ def get_op_poly(knl, numpy_types=True):
 
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **)**
-             **:** :class:`islpy.PwQPolynomial` **}**.
+    :parameter numpy_types: A :class:`boolean` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:'loopy.LoopyType`.
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
+    :return: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**.
 
-             - The string specifies the operation type as
-               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+             - The :class:`loopy.Op` specifies an arithmetic operation with
+               specific characteristics.
 
              - The :class:`islpy.PwQPolynomial` holds the number of operations of
                the kind specified in the key (in terms of the
@@ -785,8 +827,8 @@ def get_op_poly(knl, numpy_types=True):
 
         poly = get_op_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+        f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -819,6 +861,34 @@ def get_op_poly(knl, numpy_types=True):
 
 
 def sum_ops_to_dtypes(op_poly_dict):
+    """Sum the mapping returned by :func:`get_op_poly` to a mapping that ignores arithmetic op type
+
+    :parameter op_poly_dict: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**.
+
+    :return: A mapping of **{** :class:`loopy.LoopyType` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The :class:`loopy.LoopyType` specifies the data type operated on 
+
+             - The :class:`islpy.PwQPolynomial` holds the number of arithmetic
+               operations on the data type specified (in terms of the
+               :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        op_map = get_op_poly(knl)
+        op_map_by_dtype = sum_ops_to_dtypes(op_map)
+        params = {'n': 512, 'm': 256, 'l': 128}
+
+        f32ops = op_map_by_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
+        f64ops = op_map_by_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
+        i32ops = op_map_by_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
+
+        # (now use these counts to predict performance)
+
+    """
+
     result = {}
     for op, v in op_poly_dict.items():
         new_key = op.dtype
@@ -840,71 +910,86 @@ def get_lmem_access_poly(knl):
     return get_mem_access_poly(knl, 'local')
 
 
+def get_DRAM_access_poly(knl):
+    """Count the number of global memory accesses in a loopy kernel.
+    """
+    from warnings import warn
+    warn("get_DRAM_access_poly is deprecated. "
+         "Use get_mem_access_poly with global option instead",
+         DeprecationWarning, stacklevel=2)
+    return get_mem_access_poly(knl, 'global')
+
 # {{{ get_gmem_access_poly
 
 def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
+    """
+    from warnings import warn
+    warn("get_gmem_access_poly is deprecated. "
+         "Use get_mem_access_poly with global option instead",
+         DeprecationWarning, stacklevel=2)
+    return get_mem_access_poly(knl, 'global')
+
+
+# }}}
+
+def get_mem_access_poly(knl, mtype, numpy_types=True):
+    """Count the number of memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                     counted.
 
-    :return: A mapping of **{(** *type* **,** :class:`string` **,**
-             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.
+    :parameter mtype: A :class:`string` specifying the memory accesses as
+                      *global* or *local*.
 
-             - The *type* specifies the type of the data being
-               accessed. This can be a :class:`numpy.dtype` if
-               *numpy_types* is True, otherwise the internal
-               loopy type.
+    :parameter numpy_types: A :class:`boolean` specifying whether the types
+                            in the returned mapping should be numpy types
+                            instead of :class:'loopy.LoopyType`.
 
-             - The first string in the map key specifies the global memory
-               access type as
-               *consecutive*, *nonconsecutive*, or *uniform*.
+    :return: A mapping of **{** :class:`loopy.MemAccess` **:**
+             :class:`islpy.PwQPolynomial` **}**.
 
-             - The second string in the map key specifies the global memory
-               access type as a
-               *load*, or a *store*.
+             - The :class:`loopy.MemAccess` specifies the type of memory
+               access.
 
-             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
-               with the characteristics specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
+             - The :class:`islpy.PwQPolynomial` holds the number of memory
+               accesses with the characteristics specified in the key (in terms
+               of the :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        subscript_map = get_gmem_access_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-
-        f32_uncoalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'nonconsecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_load = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'load')
-                            ].eval_with_dict(params)
-        f32_coalesced_store = subscript_map.dict[
-                            (np.dtype(np.float32), 'consecutive', 'store')
-                            ].eval_with_dict(params)
+        gmem_access_map = get_mem_access_poly('global', knl)
+
+        f32_stride1_g_loads_a = gmem_access_map[MemAccess('global', np.float32,
+                                                          stride=1,
+                                                          direction='load',
+                                                          variable='a')
+                                               ].eval_with_dict(params)
+        f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32,
+                                                           stride=1,
+                                                           direction='stores')
+                                                           variable='a'
+                                                ].eval_with_dict(params)
+
+        lmem_access_map = get_mem_access_poly('local', knl)
+
+        f32_stride1_l_loads_x = lmem_access_map[MemAccess('local', np.float32,
+                                                          stride=1,
+                                                          direction='load',
+                                                          variable='x')
+                                               ].eval_with_dict(params)
+        f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32,
+                                                           stride=1,
+                                                           direction='stores',
+                                                           variable='x')
+                                                ].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
     """
-    from warnings import warn
-    warn("get_gmem_access_poly is deprecated. "
-         "Use get_mem_access_poly with global option instead",
-         DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl, 'global')
-
-
-def get_DRAM_access_poly(knl):
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. "
-         "Use get_mem_access_poly with global option instead",
-         DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl, 'global')
-
-# }}}
-
-def get_mem_access_poly(knl, mtype, numpy_types=True):
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
     class CacheHolder(object):
@@ -988,12 +1073,43 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
 # {{{ sum_mem_access_to_bytes
 
 def sum_mem_access_to_bytes(m):
-    """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping
+    """Convert counts returned by :func:`get_mem_access_poly` to bytes and sum across data types and variables
+
+    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+
+    :return: A mapping of **{(** :class:`string`**,** :class:`int` **,** :class:`string` **)**
+             **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The first string in the key specifies the memory type as *global* or *local*
+
+             - The integer in the key specifies the *stride*
+
+             - The second string in the key specifies the direction as *load* or *store*
+
+             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
+               size in bytes for memory accesses of all data types with the
+               characteristics specified in the key (in terms of the
+               :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        mem_access_map = get_mem_access_poly(knl, 'global')
+        byte_totals_map = sum_mem_access_to_bytes(mem_access_map)
+        params = {'n': 512, 'm': 256, 'l': 128}
+
+        stride1_global_bytes_loaded = byte_totals_map[('global', 1, 'load')
+                                                     ].eval_with_dict(params)
+        stride2_global_bytes_loaded = byte_totals_map[('global', 2, 'load')
+                                                     ].eval_with_dict(params)
+        stride1_global_bytes_stored = byte_totals_map[('global', 1, 'store')
+                                                     ].eval_with_dict(params)
+        stride2_global_bytes_stored = byte_totals_map[('global', 2, 'store')
+                                                     ].eval_with_dict(params)
 
-    **{(** :class:`string` **,** :class:`string` **)**
-    **:** :class:`islpy.PwQPolynomial` **}**
+        # (now use these counts to predict performance)
 
-    i.e., aggregate the transfer numbers for all types into a single byte count.
     """
 
     result = {}
@@ -1030,9 +1146,9 @@ def get_synchronization_poly(knl):
 
         # (first create loopy kernel and specify array data types)
 
-        barrier_poly = get_synchronization_poly(knl)
+        sync_poly = get_synchronization_poly(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = barrier_poly.eval_with_dict(params)
+        barrier_count = sync_poly['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
-- 
GitLab


From e42ad212a75e5fd6f6eb08eeb465c5565884ce5f Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 25 Oct 2016 21:43:42 -0500
Subject: [PATCH 19/55] added sum_mem_access_across_vars for convenience

---
 loopy/__init__.py       | 10 ++++---
 loopy/statistics.py     | 60 +++++++++++++++++++++++++++++++++++++++++
 test/test_statistics.py | 10 +++++++
 3 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 340aec051..6a482cc1c 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -115,8 +115,9 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
-        sum_mem_access_to_bytes, get_synchronization_poly,
-        gather_access_footprints, gather_access_footprint_bytes)
+        sum_mem_access_to_bytes, sum_mem_access_across_vars, 
+        get_synchronization_poly, gather_access_footprints,
+        gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
         generate_code, generate_code_v2, generate_body)
@@ -222,8 +223,9 @@ __all__ = [
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
-        "sum_mem_access_to_bytes", "get_synchronization_poly",
-        "gather_access_footprints", "gather_access_footprint_bytes",
+        "sum_mem_access_to_bytes", "sum_mem_access_across_vars",
+        "get_synchronization_poly", "gather_access_footprints",
+        "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index fd9863eb2..f27f26f3e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1125,6 +1125,66 @@ def sum_mem_access_to_bytes(m):
 
 # }}}
 
+# {{{ sum_mem_access_across_vars
+
+def sum_mem_access_across_vars(m):
+    """Remove variable name divisions in mapping returned by :func:`get_mem_access_poly`
+
+    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+
+    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The **variable** attribute in the keys of the returned mapping is set to 'ANY_VAR' 
+
+             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
+               size in bytes for memory accesses of all data types with the
+               characteristics specified in the key (in terms of the
+               :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        gmem_access_map = get_mem_access_poly(knl, 'global')
+        gmem_acrossvars = sum_mem_access_across_vars(gmem_access_map)
+
+        f32_stride1_g_loads = gmem_acrossvars[MemAccess('global', np.float32,
+                                                        stride=1,
+                                                        direction='load') # do not specify variable
+                                             ].eval_with_dict(params)
+        f32_stride1_g_stores = gmem_acrossvars[MemAccess('global', np.float32,
+                                                         stride=1,
+                                                         direction='store') # do not specify variable
+                                              ].eval_with_dict(params)
+
+        lmem_access_map = get_mem_access_poly(knl, 'local')
+        lmem_acrossvars = sum_mem_access_across_vars(lmem_access_map)
+
+        f32_stride1_l_loads = lmem_acrossvars[MemAccess('local', np.float32,
+                                                        stride=1,
+                                                        direction='load') # do not specify variable
+                                             ].eval_with_dict(params)
+        f32_stride1_l_stores = lmem_acrossvars[MemAccess('local', np.float32,
+                                                         stride=1,
+                                                         direction='store') # do not specify variable
+                                              ].eval_with_dict(params)
+
+        # (now use these counts to predict performance)
+
+    """
+
+    result = {}
+    for mem_access, v in m.items():
+        new_key = MemAccess(mem_access.mtype, mem_access.dtype, mem_access.stride, mem_access.direction)
+        if new_key in result:
+            result[new_key] += m[mem_access]
+        else:
+            result[new_key] = m[mem_access]
+
+    return result
+
+# }}}
 
 # {{{ get_synchronization_poly
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index feda05125..5629a0702 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -271,6 +271,16 @@ def test_gmem_access_counter_basic():
     assert s0load == 4*f32l + 8*f64l
     assert s0store == 4*f32s + 8*f64s
 
+    poly_c = lp.sum_mem_access_across_vars(poly)
+    f32lall = poly_c[lp.MemAccess('global', np.float32,
+                                stride=0, direction='load')
+                  ].eval_with_dict(params)
+    f64lall = poly_c[lp.MemAccess('global', np.float64,
+                                stride=0, direction='load')
+                  ].eval_with_dict(params)
+    assert f32lall == 3*n*m*l
+    assert f64lall == 2*n*m
+
 
 def test_gmem_access_counter_reduction():
 
-- 
GitLab


From 38f040cf4f244bf3848d0d8aa18d0c1f8ed9cb63 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 26 Oct 2016 03:16:07 -0500
Subject: [PATCH 20/55] added ignore_vars option to get_mem_access_poly

---
 loopy/statistics.py     | 26 +++++++++++++++-----------
 test/test_statistics.py |  8 ++++----
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index f27f26f3e..ab68c4253 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -171,10 +171,8 @@ class MemAccess:
 
     """
 
-    #TODO "ANY_VAR" does not work yet
     #TODO currently counting all lmem access as stride-1
-    def __init__(self, mtype, dtype, stride=1, direction=None,
-                 variable='ANY_VAR'):
+    def __init__(self, mtype, dtype, stride=1, direction=None, variable=None):
         self.mtype = mtype
         self.stride = stride
         self.direction = direction
@@ -189,7 +187,7 @@ class MemAccess:
                 other.dtype == self.dtype and
                 other.stride == self.stride and
                 other.direction == self.direction and
-                ((self.variable == 'ANY_VAR' or other.variable == 'ANY_VAR') or
+                ((self.variable == None or other.variable == None) or
                  self.variable == other.variable))
 
     def __hash__(self):
@@ -198,7 +196,7 @@ class MemAccess:
         if direction == None:
             direction = 'None'
         if variable == None:
-            variable = 'ANY_VAR'
+            variable = 'None'
         return hash(str(self.mtype)+str(self.dtype)+str(self.stride)
                     +direction+variable)
 
@@ -933,7 +931,7 @@ def get_gmem_access_poly(knl):
 
 # }}}
 
-def get_mem_access_poly(knl, mtype, numpy_types=True):
+def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False):
     """Count the number of memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -946,6 +944,9 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
                             in the returned mapping should be numpy types
                             instead of :class:'loopy.LoopyType`.
 
+    :parameter ignore_vars: A :class:`boolean` specifying whether to separate 
+                            memory accesses by variable name.
+
     :return: A mapping of **{** :class:`loopy.MemAccess` **:**
              :class:`islpy.PwQPolynomial` **}**.
 
@@ -970,8 +971,8 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
                                                ].eval_with_dict(params)
         f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32,
                                                            stride=1,
-                                                           direction='stores')
-                                                           variable='a'
+                                                           direction='store',
+                                                           variable='a')
                                                 ].eval_with_dict(params)
 
         lmem_access_map = get_mem_access_poly('local', knl)
@@ -983,7 +984,7 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
                                                ].eval_with_dict(params)
         f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32,
                                                            stride=1,
-                                                           direction='stores',
+                                                           direction='store',
                                                            variable='x')
                                                 ].eval_with_dict(params)
 
@@ -1068,7 +1069,10 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
                        , count)
                       for mem_access, count in six.iteritems(result))
 
-    return result
+    if ignore_vars:
+        return sum_mem_access_across_vars(result)
+    else:
+        return result
 
 # {{{ sum_mem_access_to_bytes
 
@@ -1134,7 +1138,7 @@ def sum_mem_access_across_vars(m):
 
     :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
 
-             - The **variable** attribute in the keys of the returned mapping is set to 'ANY_VAR' 
+             - The **variable** attribute in the keys of the returned mapping is set to None 
 
              - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
                size in bytes for memory accesses of all data types with the
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 5629a0702..dd651e7c7 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -329,22 +329,22 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl, 'global', ignore_vars=True)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
     f32 = poly[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load', variable='g')
+                         stride=0, direction='load')
               ].eval_with_dict(params)
     f64 = poly[lp.MemAccess('global', np.float64,
-                         stride=0, direction='load', variable='h')
+                         stride=0, direction='load')
               ].eval_with_dict(params)
     assert f32 == 2*n*m
     assert f64 == n*m
 
     f64 = poly[lp.MemAccess('global', np.float64,
-                         stride=0, direction='store', variable='e')
+                         stride=0, direction='store')
               ].eval_with_dict(params)
     assert f64 == n*m
 
-- 
GitLab


From 61d27603c27e0867aaf97030a20e05340d2bb837 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 26 Oct 2016 19:29:34 -0500
Subject: [PATCH 21/55] added reduce_mem_access_poly_fields function

---
 loopy/__init__.py       |  10 +--
 loopy/statistics.py     | 133 +++++++++++++++++++++++++++++++++-------
 test/test_statistics.py |  32 ++++++----
 3 files changed, 136 insertions(+), 39 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 6a482cc1c..e50e53fb7 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -115,9 +115,9 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
-        sum_mem_access_to_bytes, sum_mem_access_across_vars, 
-        get_synchronization_poly, gather_access_footprints,
-        gather_access_footprint_bytes)
+        sum_mem_access_to_bytes, sum_mem_access_across_vars,
+        reduce_mem_access_poly_fields, get_synchronization_poly,
+        gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
         generate_code, generate_code_v2, generate_body)
@@ -224,8 +224,8 @@ __all__ = [
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
         "sum_mem_access_to_bytes", "sum_mem_access_across_vars",
-        "get_synchronization_poly", "gather_access_footprints",
-        "gather_access_footprint_bytes",
+        "reduce_mem_access_poly_fields", "get_synchronization_poly",
+        "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index ab68c4253..adf0781c8 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -48,6 +48,8 @@ __doc__ = """
 .. autofunction:: get_mem_access_poly
 
 .. autofunction:: sum_mem_access_to_bytes
+.. autofunction:: sum_mem_access_across_vars
+.. autofunction:: reduce_mem_access_poly_fields
 
 .. autofunction:: get_synchronization_poly
 
@@ -172,33 +174,47 @@ class MemAccess:
     """
 
     #TODO currently counting all lmem access as stride-1
-    def __init__(self, mtype, dtype, stride=1, direction=None, variable=None):
+    def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None):
         self.mtype = mtype
         self.stride = stride
         self.direction = direction
         self.variable = variable
-
-        from loopy.types import to_loopy_type
-        self.dtype = to_loopy_type(dtype)
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
 
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
-                other.mtype == self.mtype and
-                other.dtype == self.dtype and
-                other.stride == self.stride and
-                other.direction == self.direction and
-                ((self.variable == None or other.variable == None) or
+                (self.mtype is None or other.mtype is None or
+                 self.mtype == other.mtype) and
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.stride is None or other.stride is None or
+                 self.stride == other.stride) and
+                (self.direction is None or other.direction is None or
+                 self.direction == other.direction) and
+                (self.variable is None or other.variable is None or
                  self.variable == other.variable))
 
     def __hash__(self):
+        mtype = self.mtype
+        dtype = self.dtype
+        stride = self.stride
         direction = self.direction
         variable = self.variable
-        if direction == None:
+        if mtype is None:
+            mtype = 'None'
+        if dtype is None:
+            dtype = 'None'
+        if stride is None:
+            stride = 'None'
+        if direction is None:
             direction = 'None'
-        if variable == None:
+        if variable is None:
             variable = 'None'
-        return hash(str(self.mtype)+str(self.dtype)+str(self.stride)
-                    +direction+variable)
+        return hash(mtype+str(dtype)+str(stride)+direction+variable)
 
 
@@ -931,7 +947,7 @@ def get_gmem_access_poly(knl):
 
 # }}}
 
-def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False):
+def get_mem_access_poly(knl, mtype, numpy_types=True):
     """Count the number of memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -944,9 +960,6 @@ def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False):
                             in the returned mapping should be numpy types
                             instead of :class:'loopy.LoopyType`.
 
-    :parameter ignore_vars: A :class:`boolean` specifying whether to separate 
-                            memory accesses by variable name.
-
     :return: A mapping of **{** :class:`loopy.MemAccess` **:**
              :class:`islpy.PwQPolynomial` **}**.
 
@@ -1069,10 +1082,7 @@ def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False):
                        , count)
                       for mem_access, count in six.iteritems(result))
 
-    if ignore_vars:
-        return sum_mem_access_across_vars(result)
-    else:
-        return result
+    return result
 
 # {{{ sum_mem_access_to_bytes
 
@@ -1190,6 +1200,87 @@ def sum_mem_access_across_vars(m):
 
 # }}}
 
+# {{{ reduce_mem_access_poly_fields
+
+def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
+                                  direction=True, variable=True):
+    """Take map returned from :func:`get_mem_access_poly`, remove specified MemAccess fields from keys, and combine counts
+
+    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:**
+                  :class:`islpy.PwQPolynomial` **}**.
+
+    :parameter mtype: A :class:`boolean` specifying whether keys in returned
+                      map will include the memory type.
+
+    :parameter dtype: A :class:`boolean` specifying whether keys in returned
+                      map will include the data type.
+
+    :parameter stride: A :class:`boolean` specifying whether keys in returned
+                       map will include the stride.
+
+    :parameter direction: A :class:`boolean` specifying whether keys in
+                          returned map will include the direction.
+
+    :parameter variable: A :class:`boolean` specifying whether keys in returned
+                         map will include the variable name.
+
+
+    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
+               size in bytes for memory accesses of all data types with the
+               characteristics specified in the key (in terms of the
+               :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        mem_map = get_mem_access_poly(knl)
+        reduced_mem_map = reduce_mem_access_poly_fields(mem_map, stride=False,
+                                                        variable=False)
+
+        all_f32_global_loads = reduced_mem_map[MemAccess('global', np.float32,
+                                                         direction='load')
+                                              ].eval_with_dict(params)
+        all_f32_global_stores = reduced_mem_map[MemAccess('global', np.float32,
+                                                          direction='store')
+                                               ].eval_with_dict(params)
+        all_f32_local_loads = reduced_mem_map[MemAccess('local', np.float32,
+                                                        direction='load')
+                                             ].eval_with_dict(params)
+        all_f32_local_stores = reduced_mem_map[MemAccess('local', np.float32,
+                                                         direction='store')
+                                              ].eval_with_dict(params)
+
+        # (now use these counts to predict performance)
+
+    """
+
+    result = {}
+    for k, v in m.items():
+        new_key = MemAccess()
+        if mtype == True:
+            new_key.mtype = k.mtype
+        if dtype == True:
+            new_key.dtype = k.dtype
+        if stride == True:
+            new_key.stride = k.stride
+        if direction == True:
+            new_key.direction = k.direction
+        if variable == True:
+            new_key.variable = k.variable
+
+        if new_key in result:
+            result[new_key] += m[k]
+        else:
+            result[new_key] = m[k]
+
+    return result
+
+# }}}
+
 # {{{ get_synchronization_poly
 
 def get_synchronization_poly(knl):
diff --git a/test/test_statistics.py b/test/test_statistics.py
index dd651e7c7..51368781d 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -329,24 +329,27 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_mem_access_poly(knl, 'global', ignore_vars=True)
+    poly = lp.get_mem_access_poly(knl, 'global')
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[lp.MemAccess('global', np.float32,
-                         stride=0, direction='load')
-              ].eval_with_dict(params)
-    f64 = poly[lp.MemAccess('global', np.float64,
-                         stride=0, direction='load')
-              ].eval_with_dict(params)
-    assert f32 == 2*n*m
-    assert f64 == n*m
 
-    f64 = poly[lp.MemAccess('global', np.float64,
-                         stride=0, direction='store')
-              ].eval_with_dict(params)
-    assert f64 == n*m
+    reduced_map = lp.reduce_mem_access_poly_fields(poly, stride=False,
+                                                    variable=False)
+
+    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='load')
+                         ].eval_with_dict(params)
+    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
+                                       direction='store')
+                         ].eval_with_dict(params)
+    assert f32_g_l == 2*n*m
+    assert f64_g_l == n*m
+    assert f64_g_s == n*m
 
 
 def test_gmem_access_counter_specialops():
@@ -566,6 +569,9 @@ def test_gmem_access_counter_consec():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
+    #for k in poly:
+    #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", poly[k])
+
     f64consec = poly[lp.MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-- 
GitLab


From 3b54c505f83ef0db4346d5edae035dd2ee1145fe Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 26 Oct 2016 22:50:39 -0500
Subject: [PATCH 22/55] added filter_mem_access_poly_fields, sum_polys, and
 eval_and_sum_polys

---
 loopy/__init__.py       |  6 ++-
 loopy/statistics.py     | 94 ++++++++++++++++++++++++++++++++++++++---
 test/test_statistics.py |  3 ++
 3 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index e50e53fb7..0f56d3d4a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -116,7 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
         sum_mem_access_to_bytes, sum_mem_access_across_vars,
-        reduce_mem_access_poly_fields, get_synchronization_poly,
+        reduce_mem_access_poly_fields, filter_mem_access_poly_fields,
+        sum_polys, eval_and_sum_polys, get_synchronization_poly,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
@@ -224,7 +225,8 @@ __all__ = [
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
         "sum_mem_access_to_bytes", "sum_mem_access_across_vars",
-        "reduce_mem_access_poly_fields", "get_synchronization_poly",
+        "reduce_mem_access_poly_fields", "filter_mem_access_poly_fields",
+        "sum_polys", "eval_and_sum_polys", "get_synchronization_poly",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
diff --git a/loopy/statistics.py b/loopy/statistics.py
index adf0781c8..921dd6893 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1150,9 +1150,8 @@ def sum_mem_access_across_vars(m):
 
              - The **variable** attribute in the keys of the returned mapping is set to None 
 
-             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
-               size in bytes for memory accesses of all data types with the
-               characteristics specified in the key (in terms of the
+             - The :class:`islpy.PwQPolynomial` holds the aggregate counts for
+               memory accesses across all variables (in terms of the
                :class:`loopy.LoopKernel` *inames*).
 
     Example usage::
@@ -1227,10 +1226,9 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
 
     :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
 
-             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
-               size in bytes for memory accesses of all data types with the
-               characteristics specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
+             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
+               the :class:`loopy.LoopKernel` *inames*) for memory accesses
+               categorized by the fields not set to False.
 
     Example usage::
 
@@ -1281,6 +1279,88 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
 
 # }}}
 
+# {{{ filter_mem_access_poly_fields
+
+def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None,
+                                  directions=None, variables=None):
+    """Take map returned from :func:`get_mem_access_poly` and remove items without specified MemAccess fields
+
+    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:**
+                  :class:`islpy.PwQPolynomial` **}**.
+
+    :parameter mtypes: A list of :class:`string` that specifies the memory type
+                      accessed as **global** or **local**
+
+    :parameter dtypes: A list of :class:`loopy.LoopyType` (or
+                      :class:`numpy.dtype`) that specifies the data type
+                      accessed.
+
+    :parameter strides: A list of :class:`int` specifies stride of the memory
+                       access. A stride of 0 indicates a uniform access (i.e.
+                       all threads access the same item).
+
+    :parameter directions: A list of :class:`string` that specifies the
+                          direction of memory access as **load** or **store**.
+
+    :parameter variables: A list of :class:`string` that specifies the variable
+                         name of the data accessed.
+
+
+    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
+               the :class:`loopy.LoopKernel` *inames*) for memory accesses
+               matching the fields passed as parameters.
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        mem_map = lp.get_mem_access_poly(knl)
+        filtered_map = lp.filter_mem_access_poly_fields(mem_map,
+                                                        directions=['load'],
+                                                        variables=['a','g'])
+        tot = lp.eval_and_sum_polys(filtered_map, params)
+
+        # (now use these counts to predict performance)
+
+    """
+
+    if dtypes is not None:
+        dtypes_lp = [to_loopy_type(d) for d in dtypes]
+
+    result = {}
+
+    for k, v in m.items():
+        if (mtypes is None or k.mtype in mtypes) and \
+           (dtypes is None or k.dtype in dtypes_lp) and \
+           (strides is None or k.stride in strides) and \
+           (directions is None or k.direction in directions) and \
+           (variables is None or k.variable in variables):
+
+            new_key = MemAccess(k.mtype, k.dtype, k.stride, k.direction, k.variable)
+
+            if new_key in result:
+                result[new_key] += m[k]
+            else:
+                result[new_key] = m[k]
+
+    return result
+
+# }}}
+
+def sum_polys(m):
+    total = isl.PwQPolynomial('{ 0 }')
+    for k, v in m.items():
+        total += v
+    return total
+
+
+def eval_and_sum_polys(m, params):
+    return sum_polys(m).eval_with_dict(params)
+
+
 # {{{ get_synchronization_poly
 
 def get_synchronization_poly(knl):
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 51368781d..305a6cb9d 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -395,6 +395,9 @@ def test_gmem_access_counter_specialops():
     assert f32 == n*m*l
     assert f64 == n*m
 
+    filtered_map = lp.filter_mem_access_poly_fields(poly, directions=['load'], variables=['a','g'])
+    tot = lp.eval_and_sum_polys(filtered_map, params)
+    assert tot == n*m*l + n*m
 
 def test_gmem_access_counter_bitwise():
 
-- 
GitLab


From a7d04f6884bb41a9c8dfac0c20c8d3b6587e0322 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 27 Oct 2016 03:26:50 -0500
Subject: [PATCH 23/55] removed sum_mem_access_across_vars (now redundant),
 removed mtype specifier from get_mem_access_poly, which now returns map with
 all mtypes

---
 loopy/__init__.py       |  16 ++--
 loopy/statistics.py     | 170 +++++++++++++---------------------------
 test/test_statistics.py |  91 ++++++++++++++-------
 3 files changed, 122 insertions(+), 155 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 0f56d3d4a..15fe458e6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -115,10 +115,10 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
-        sum_mem_access_to_bytes, sum_mem_access_across_vars,
-        reduce_mem_access_poly_fields, filter_mem_access_poly_fields,
-        sum_polys, eval_and_sum_polys, get_synchronization_poly,
-        gather_access_footprints, gather_access_footprint_bytes)
+        sum_mem_access_to_bytes, reduce_mem_access_poly_fields,
+        filter_mem_access_poly_fields, sum_polys, eval_and_sum_polys,
+        get_synchronization_poly, gather_access_footprints,
+        gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
         generate_code, generate_code_v2, generate_body)
@@ -224,10 +224,10 @@ __all__ = [
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
-        "sum_mem_access_to_bytes", "sum_mem_access_across_vars",
-        "reduce_mem_access_poly_fields", "filter_mem_access_poly_fields",
-        "sum_polys", "eval_and_sum_polys", "get_synchronization_poly",
-        "gather_access_footprints", "gather_access_footprint_bytes",
+        "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields",
+        "filter_mem_access_poly_fields", "sum_polys", "eval_and_sum_polys",
+        "get_synchronization_poly", "gather_access_footprints",
+        "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 921dd6893..7c5efb3d7 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -48,7 +48,6 @@ __doc__ = """
 .. autofunction:: get_mem_access_poly
 
 .. autofunction:: sum_mem_access_to_bytes
-.. autofunction:: sum_mem_access_across_vars
 .. autofunction:: reduce_mem_access_poly_fields
 
 .. autofunction:: get_synchronization_poly
@@ -919,9 +918,11 @@ def get_lmem_access_poly(knl):
     """
     from warnings import warn
     warn("get_lmem_access_poly is deprecated. "
-         "Use get_mem_access_poly with local option instead",
+         "Instead, use get_mem_access_poly and then pass the result to "
+         "filter_mem_access_poly_fields with mtypes=['local'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl, 'local')
+    return filter_mem_access_poly_fields(
+                get_mem_access_poly(knl), mtypes=['local'])
 
 
 def get_DRAM_access_poly(knl):
@@ -929,9 +930,11 @@ def get_DRAM_access_poly(knl):
     """
     from warnings import warn
     warn("get_DRAM_access_poly is deprecated. "
-         "Use get_mem_access_poly with global option instead",
+         "Instead, use get_mem_access_poly and then pass the result to "
+         "filter_mem_access_poly_fields with mtypes=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl, 'global')
+    return filter_mem_access_poly_fields(
+                get_mem_access_poly(knl), mtypes=['global'])
 
 # {{{ get_gmem_access_poly
 
@@ -940,22 +943,21 @@ def get_gmem_access_poly(knl):
     """
     from warnings import warn
     warn("get_gmem_access_poly is deprecated. "
-         "Use get_mem_access_poly with global option instead",
+         "Instead, use get_mem_access_poly and then pass the result to "
+         "filter_mem_access_poly_fields with mtypes=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl, 'global')
+    return filter_mem_access_poly_fields(
+                get_mem_access_poly(knl), mtypes=['global'])
 
 
 # }}}
 
-def get_mem_access_poly(knl, mtype, numpy_types=True):
+def get_mem_access_poly(knl, numpy_types=True):
     """Count the number of memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                     counted.
 
-    :parameter mtype: A :class:`string` specifying the memory accesses as
-                      *global* or *local*.
-
     :parameter numpy_types: A :class:`boolean` specifying whether the types
                             in the returned mapping should be numpy types
                             instead of :class:'loopy.LoopyType`.
@@ -975,31 +977,28 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
         # (first create loopy kernel and specify array data types)
 
         params = {'n': 512, 'm': 256, 'l': 128}
-        gmem_access_map = get_mem_access_poly('global', knl)
+        mem_access_map = get_mem_access_poly(knl)
 
-        f32_stride1_g_loads_a = gmem_access_map[MemAccess('global', np.float32,
+        f32_stride1_g_loads_a = mem_access_map[MemAccess('global', np.float32,
+                                                         stride=1,
+                                                         direction='load',
+                                                         variable='a')
+                                              ].eval_with_dict(params)
+        f32_stride1_g_stores_a = mem_access_map[MemAccess('global', np.float32,
                                                           stride=1,
-                                                          direction='load',
+                                                          direction='store',
                                                           variable='a')
                                                ].eval_with_dict(params)
-        f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32,
-                                                           stride=1,
-                                                           direction='store',
-                                                           variable='a')
-                                                ].eval_with_dict(params)
-
-        lmem_access_map = get_mem_access_poly('local', knl)
-
-        f32_stride1_l_loads_x = lmem_access_map[MemAccess('local', np.float32,
+        f32_stride1_l_loads_x = mem_access_map[MemAccess('local', np.float32,
+                                                         stride=1,
+                                                         direction='load',
+                                                         variable='x')
+                                              ].eval_with_dict(params)
+        f32_stride1_l_stores_x = mem_access_map[MemAccess('local', np.float32,
                                                           stride=1,
-                                                          direction='load',
+                                                          direction='store',
                                                           variable='x')
                                                ].eval_with_dict(params)
-        f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32,
-                                                           stride=1,
-                                                           direction='store',
-                                                           variable='x')
-                                                ].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -1027,50 +1026,48 @@ def get_mem_access_poly(knl, mtype, numpy_types=True):
     knl = preprocess_kernel(knl)
 
     subs_poly = ToCountMap()
-    if mtype == 'global':
-        subscript_counter = GlobalSubscriptCounter(knl)
-    elif mtype == 'local':
-        subscript_counter = LocalSubscriptCounter(knl)
-    else:
-        raise ValueError("get_mem_access_poly: mtype must be "
-                         "'local' or 'global', received {0}"
-                         .format(mtype))
+    subs_counter_g = GlobalSubscriptCounter(knl)
+    subs_counter_l = LocalSubscriptCounter(knl)
 
     for insn in knl.instructions:
-        # count subscripts, distinguishing loads and stores
-        subs_expr = subscript_counter(insn.expression)
+        # count subscripts
+        subs_expr = subs_counter_g(insn.expression) \
+                    + subs_counter_l(insn.expression)
+
+        # distinguish loads and stores
         for key in subs_expr.dict:
             subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride,
                                      direction='load', variable=key.variable)
                           ] = subs_expr.dict.pop(key)
 
-        if mtype == 'global':  # for now, don't count writes to local mem
-            subs_assignee = subscript_counter(insn.assignee)
-            for key in subs_assignee.dict:
-                subs_assignee.dict[MemAccess(key.mtype, key.dtype,
-                                             stride=key.stride, direction='store',
-                                             variable=key.variable)
-                                  ] = subs_assignee.dict.pop(key)
+        subs_assignee_g = subs_counter_g(insn.assignee)
+        for key in subs_assignee_g.dict:
+            subs_assignee_g.dict[MemAccess(key.mtype, key.dtype,
+                                           stride=key.stride, direction='store',
+                                           variable=key.variable)
+                                ] = subs_assignee_g.dict.pop(key)
+        # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
             poly = ToCountMap({key: subs_expr.dict[key]})
-            if mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
+            if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
                 subs_poly = subs_poly \
                             + poly*get_insn_count(knl, insn_inames, True)
             else:
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+                #currently not counting stride of local mem access
 
-        if mtype == 'global':  # for now, don't count writes to local mem
-            for key in subs_assignee.dict:
-                poly = ToCountMap({key: subs_assignee.dict[key]})
-                if isinstance(key.stride, int) and key.stride == 0:
-                    subs_poly = subs_poly \
-                                + poly*get_insn_count(knl, insn_inames, True)
-                else:
-                    subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+        for key in subs_assignee_g.dict:
+            poly = ToCountMap({key: subs_assignee_g.dict[key]})
+            if isinstance(key.stride, int) and key.stride == 0:
+                subs_poly = subs_poly \
+                            + poly*get_insn_count(knl, insn_inames, True)
+            else:
+                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+            # for now, don't count writes to local mem
 
     result = subs_poly.dict
 
@@ -1109,7 +1106,7 @@ def sum_mem_access_to_bytes(m):
 
         # (first create loopy kernel and specify array data types)
 
-        mem_access_map = get_mem_access_poly('global', knl)
+        mem_access_map = get_mem_access_poly(knl)
         byte_totals_map = sum_mem_access_to_bytes(mem_access_map)
         params = {'n': 512, 'm': 256, 'l': 128}
 
@@ -1138,67 +1135,6 @@ def sum_mem_access_to_bytes(m):
     return result
 
 # }}}
-
-# {{{ sum_mem_access_across_vars
-
-def sum_mem_access_across_vars(m):
-    """Remove variable name divisions in mapping returned by :func:`get_mem_access_poly`
-
-    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
-
-    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The **variable** attribute in the keys of the returned mapping is set to None 
-
-             - The :class:`islpy.PwQPolynomial` holds the aggregate counts for
-               memory accesses across all variables (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        gmem_access_map = get_mem_access_poly('global', knl)
-        gmem_acrossvars = sum_mem_access_across_vars(gmem_access_map)
-
-        f32_stride1_g_loads = gmem_acrossvars[MemAccess('global', np.float32,
-                                                        stride=1,
-                                                        direction='load') # do not specify variable
-                                             ].eval_with_dict(params)
-        f32_stride1_g_stores = gmem_acrossvars[MemAccess('global', np.float32,
-                                                         stride=1,
-                                                         direction='store') # do not specify variable
-                                              ].eval_with_dict(params)
-
-        lmem_access_map = get_mem_access_poly('local', knl)
-        lmem_acrossvars = sum_mem_access_across_vars(lmem_access_map)
-
-        f32_stride1_l_loads = lmem_acrossvars[MemAccess('local', np.float32,
-                                                        stride=1,
-                                                        direction='load') # do not specify variable
-                                             ].eval_with_dict(params)
-        f32_stride1_l_stores = lmem_acrossvars[MemAccess('local', np.float32,
-                                                         stride=1,
-                                                         direction='store') # do not specify variable
-                                              ].eval_with_dict(params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    result = {}
-    for mem_access, v in m.items():
-        new_key = MemAccess(mem_access.mtype, mem_access.dtype, mem_access.stride, mem_access.direction)
-        if new_key in result:
-            result[new_key] += m[mem_access]
-        else:
-            result[new_key] = m[mem_access]
-
-    return result
-
-# }}}
-
 # {{{ reduce_mem_access_poly_fields
 
 def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 305a6cb9d..7ad06f3eb 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -236,7 +236,7 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -265,22 +265,6 @@ def test_gmem_access_counter_basic():
     assert f32s == n*m*l
     assert f64s == n*m
 
-    poly_b = lp.sum_mem_access_to_bytes(poly)
-    s0load = poly_b[('global', 0, 'load')].eval_with_dict(params)
-    s0store = poly_b[('global', 0, 'store')].eval_with_dict(params)
-    assert s0load == 4*f32l + 8*f64l
-    assert s0store == 4*f32s + 8*f64s
-
-    poly_c = lp.sum_mem_access_across_vars(poly)
-    f32lall = poly_c[lp.MemAccess('global', np.float32,
-                                stride=0, direction='load')
-                  ].eval_with_dict(params)
-    f64lall = poly_c[lp.MemAccess('global', np.float64,
-                                stride=0, direction='load')
-                  ].eval_with_dict(params)
-    assert f32lall == 3*n*m*l
-    assert f64lall == 2*n*m
-
 
 def test_gmem_access_counter_reduction():
 
@@ -292,7 +276,7 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -329,7 +313,7 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -366,7 +350,7 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -416,7 +400,7 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -462,7 +446,7 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = lp.get_mem_access_poly(knl, 'global')  # noqa
+    poly = lp.get_mem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -515,7 +499,7 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = lp.get_mem_access_poly(knl, 'global')  # noqa
+    poly = lp.get_mem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -566,7 +550,7 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = lp.get_mem_access_poly(knl, 'global')
+    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -670,10 +654,7 @@ def test_all_counters_parallel_matmul():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    #barrier_count = get_barrier_poly(knl).eval_with_dict(params)
-    #assert barrier_count == 2*m/16
     sync_poly = lp.get_synchronization_poly(knl)
-    #assert len(sync_poly) == 1 #TODO why?
     assert len(sync_poly) == 2
     assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
     assert sync_poly["barrier_local"].eval_with_dict(params) == 2*m/16
@@ -694,7 +675,7 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    subscript_map = lp.get_mem_access_poly(knl, 'global')
+    subscript_map = lp.get_mem_access_poly(knl)
 
     f32coal = subscript_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
@@ -711,9 +692,8 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_subs_map = lp.get_mem_access_poly(knl, 'local')
+    local_subs_map = lp.get_mem_access_poly(knl)
 
-    # TODO currently considering all local mem access stride-1
     local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                  ].eval_with_dict(params)
@@ -752,6 +732,57 @@ def test_gather_access_footprint_2():
         print(key, count(knl, footprint))
 
 
+def test_summations_and_filters():
+
+    knl = lp.make_kernel(
+            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            [
+                """
+                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
+                e[i, k] = g[i,k]*h[i,k+1]
+                """
+            ],
+            name="basic", assumptions="n,m,l >= 1")
+
+    knl = lp.add_and_infer_dtypes(knl,
+                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    poly = lp.get_mem_access_poly(knl)
+    n = 512
+    m = 256
+    l = 128
+    params = {'n': n, 'm': m, 'l': l}
+
+    loads_a = lp.eval_and_sum_polys(
+                    lp.filter_mem_access_poly_fields(
+                        lp.get_mem_access_poly(knl),
+                        directions=['load'], variables=['a']),
+                    params)
+    assert loads_a == 2*n*m*l
+
+    global_stores = lp.eval_and_sum_polys(
+                        lp.filter_mem_access_poly_fields(
+                            lp.get_mem_access_poly(knl),
+                            mtypes=['global'], directions=['store']),
+                        params)
+    assert global_stores == n*m*l + n*m
+
+    bytes_map = lp.sum_mem_access_to_bytes(lp.get_mem_access_poly(knl))
+    s0load = bytes_map[('global', 0, 'load')].eval_with_dict(params)
+    s0store = bytes_map[('global', 0, 'store')].eval_with_dict(params)
+    assert s0load == 4*n*m*l*3 + 8*n*m*2
+    assert s0store == 4*n*m*l + 8*n*m
+
+    # ignore stride and variable names in this map
+    reduced_map = lp.reduce_mem_access_poly_fields(lp.get_mem_access_poly(knl),
+                                                  stride=False, variable=False)
+    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
+                         ].eval_with_dict(params)
+    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
+                         ].eval_with_dict(params)
+    assert f32lall == 3*n*m*l
+    assert f64lall == 2*n*m
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 1b1180c3ee46e7e701ddc97043f87f6dfcde15be Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 27 Oct 2016 04:23:37 -0500
Subject: [PATCH 24/55] added reduce_op_poly_fields and filter_op_poly_fields

---
 loopy/__init__.py       |   6 +-
 loopy/statistics.py     | 144 ++++++++++++++++++++++++++++++++++++++--
 test/test_statistics.py |  36 +++++++---
 3 files changed, 168 insertions(+), 18 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 15fe458e6..05ccddb54 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -116,7 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
         sum_mem_access_to_bytes, reduce_mem_access_poly_fields,
-        filter_mem_access_poly_fields, sum_polys, eval_and_sum_polys,
+        filter_mem_access_poly_fields, reduce_op_poly_fields,
+        filter_op_poly_fields, sum_polys, eval_and_sum_polys,
         get_synchronization_poly, gather_access_footprints,
         gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -225,7 +226,8 @@ __all__ = [
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
         "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields",
-        "filter_mem_access_poly_fields", "sum_polys", "eval_and_sum_polys",
+        "filter_mem_access_poly_fields", "reduce_op_poly_fields",
+        "filter_op_poly_fields", "sum_polys", "eval_and_sum_polys",
         "get_synchronization_poly", "gather_access_footprints",
         "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 7c5efb3d7..5de9b6b0e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -49,6 +49,10 @@ __doc__ = """
 
 .. autofunction:: sum_mem_access_to_bytes
 .. autofunction:: reduce_mem_access_poly_fields
+.. autofunction:: filter_mem_access_poly_fields
+
+.. autofunction:: reduce_op_poly_fields
+.. autofunction:: filter_op_poly_fields
 
 .. autofunction:: get_synchronization_poly
 
@@ -128,18 +132,29 @@ class Op:
 
     """
 
-    def __init__(self, dtype, name):
+    def __init__(self, dtype=None, name=None):
         self.name = name
-        from loopy.types import to_loopy_type
-        self.dtype = to_loopy_type(dtype)
+        if dtype is None:
+            self.dtype = dtype
+        else:
+            from loopy.types import to_loopy_type
+            self.dtype = to_loopy_type(dtype)
 
     def __eq__(self, other):
         return isinstance(other, Op) and (
-                other.dtype == self.dtype and
-                other.name == self.name )
+                (self.dtype is None or other.dtype is None or
+                 self.dtype == other.dtype) and
+                (self.name is None or other.name is None or
+                 self.name == other.name))
 
     def __hash__(self):
-        return hash(str(self.dtype)+self.name)
+        dtype = self.dtype
+        name = self.name
+        if dtype is None:
+            dtype = 'None'
+        if name is None:
+            name = 'None'
+        return hash(str(dtype)+name)
 
 
 class MemAccess:
@@ -172,7 +187,7 @@ class MemAccess:
 
     """
 
-    #TODO currently counting all lmem access as stride-1
+    #TODO currently counting all lmem access as stride None
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None):
         self.mtype = mtype
         self.stride = stride
@@ -1135,6 +1150,7 @@ def sum_mem_access_to_bytes(m):
     return result
 
 # }}}
+
 # {{{ reduce_mem_access_poly_fields
 
 def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
@@ -1263,6 +1279,7 @@ def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None,
 
     """
 
+    from loopy.types import to_loopy_type
     if dtypes is not None:
         dtypes_lp = [to_loopy_type(d) for d in dtypes]
 
@@ -1286,6 +1303,119 @@ def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None,
 
 # }}}
 
+# {{{ reduce_op_poly_fields
+
+def reduce_op_poly_fields(m, dtype=True, name=True):
+    """Take map returned from :func:`get_op_poly`, remove specified Op fields from keys, and combine counts
+
+    :parameter m: A mapping of **{** :class:`loopy.Op` **:**
+                  :class:`islpy.PwQPolynomial` **}**.
+
+    :parameter dtype: A :class:`boolean` specifying whether keys in returned
+                      map will include the data type.
+
+    :parameter name: A :class:`boolean` specifying whether keys in returned
+                     map will include the name of the operation.
+
+    :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
+               the :class:`loopy.LoopKernel` *inames*) for arithmetic ops
+               categorized by the fields not set to False.
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        op_map = get_op_poly(knl)
+        reduced_op_map = reduce_op_poly_fields(op_map, name=False)
+
+        all_f32_ops = reduced_op_map[Op(dtype=np.float32)].eval_with_dict(params)
+        all_f64_ops = reduced_op_map[Op(dtype=np.float64)].eval_with_dict(params)
+
+        reduced_op_map = reduce_op_poly_fields(op_map, dtype=False)
+
+        all_add_ops = reduced_op_map[Op(name='add')].eval_with_dict(params)
+        all_mul_ops = reduced_op_map[Op(name='mul')].eval_with_dict(params)
+
+        # (now use these counts to predict performance)
+
+    """
+
+    result = {}
+    for k, v in m.items():
+        new_key = Op()
+        if dtype == True:
+            new_key.dtype = k.dtype
+        if name == True:
+            new_key.name = k.name
+
+        if new_key in result:
+            result[new_key] += m[k]
+        else:
+            result[new_key] = m[k]
+
+    return result
+
+# }}}
+
+# {{{ filter_op_poly_fields
+
+def filter_op_poly_fields(m, dtypes=None, names=None):
+    """Take map returned from :func:`get_op_poly` and remove items without specified Op fields
+
+    :parameter m: A mapping of **{** :class:`loopy.Op` **:**
+                  :class:`islpy.PwQPolynomial` **}**.
+
+    :parameter dtypes: A list of :class:`loopy.LoopyType` (or
+                      :class:`numpy.dtype`) that specifies the data type
+                      operated on.
+
+    :parameter names: A list of :class:`string` that specifies the kind of
+                      arithmetic operation as *add*, *sub*, *mul*, *div*,
+                      *pow*, *shift*, *bw* (bitwise), etc.
+
+    :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**
+
+             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
+               the :class:`loopy.LoopKernel` *inames*) for arithmetic ops
+               matching the fields passed as parameters.
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        op_map = lp.get_op_poly(knl)
+        filtered_map = lp.filter_op_poly_fields(op_map, names=['add', 'sub'])
+        tot_addsub = lp.eval_and_sum_polys(filtered_map, params)
+
+        # (now use these counts to predict performance)
+
+    """
+
+    from loopy.types import to_loopy_type
+    if dtypes is not None:
+        dtypes_lp = [to_loopy_type(d) for d in dtypes]
+
+    result = {}
+
+    for k, v in m.items():
+        if (dtypes is None or k.dtype in dtypes_lp) and \
+           (names is None or k.name in names):
+
+            new_key = Op(k.dtype, k.name)
+
+            if new_key in result:
+                result[new_key] += m[k]
+            else:
+                result[new_key] = m[k]
+
+    return result
+
+# }}}
+
 def sum_polys(m):
     total = isl.PwQPolynomial('{ 0 }')
     for k, v in m.items():
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 7ad06f3eb..a853e8c30 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -61,14 +61,6 @@ def test_op_counter_basic():
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
-
-    poly_dtype = lp.sum_ops_to_dtypes(poly)
-    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
-    f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
-    i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
-    assert f32 == f32add + f32mul + f32div
-    assert f64 == f64mul
-    assert i32 == i32add
     
 
 def test_op_counter_reduction():
@@ -739,7 +731,7 @@ def test_summations_and_filters():
             [
                 """
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
-                e[i, k] = g[i,k]*h[i,k+1]
+                e[i, k+1] = -g[i,k]*h[i,k+1]
                 """
             ],
             name="basic", assumptions="n,m,l >= 1")
@@ -782,6 +774,32 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
+    poly_dtype = lp.sum_ops_to_dtypes(lp.get_op_poly(knl))
+    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
+    f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
+    i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
+    assert f32 == n*m*l*3
+    assert f64 == n*m
+    assert i32 == n*m*2
+
+    addsub_all = lp.eval_and_sum_polys(
+                        lp.filter_op_poly_fields(
+                            lp.get_op_poly(knl), names=['add', 'sub']),
+                        params)
+    f32ops_all = lp.eval_and_sum_polys(
+                        lp.filter_op_poly_fields(
+                            lp.get_op_poly(knl), dtypes=[np.float32]),
+                        params)
+    assert addsub_all == n*m*l + n*m*2
+    assert f32ops_all == n*m*l*3
+
+    ops_nodtype = lp.reduce_op_poly_fields(lp.get_op_poly(knl), dtype=False)
+    ops_noname = lp.reduce_op_poly_fields(lp.get_op_poly(knl), name=False)
+    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
+    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    assert mul_all == n*m*l + n*m
+    assert f64ops_all == n*m
+
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-- 
GitLab


From c892d77f8c818fcbe2335e56106260462320d282 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sun, 30 Oct 2016 22:32:05 -0500
Subject: [PATCH 25/55] changed mem_access and op filter functions to single
 ToCountMap member function

---
 loopy/__init__.py       |   8 +-
 loopy/statistics.py     | 219 +++++++++++++---------------------------
 test/test_statistics.py |  42 ++++----
 3 files changed, 94 insertions(+), 175 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 05ccddb54..80c266ba4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -116,8 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
         sum_mem_access_to_bytes, reduce_mem_access_poly_fields,
-        filter_mem_access_poly_fields, reduce_op_poly_fields,
-        filter_op_poly_fields, sum_polys, eval_and_sum_polys,
+        reduce_op_poly_fields,
+        sum_polys, eval_and_sum_polys,
         get_synchronization_poly, gather_access_footprints,
         gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -226,8 +226,8 @@ __all__ = [
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
         "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields",
-        "filter_mem_access_poly_fields", "reduce_op_poly_fields",
-        "filter_op_poly_fields", "sum_polys", "eval_and_sum_polys",
+        "reduce_op_poly_fields",
+        "sum_polys", "eval_and_sum_polys",
         "get_synchronization_poly", "gather_access_footprints",
         "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5de9b6b0e..b5e37d2d0 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -36,10 +36,13 @@ from loopy.kernel.data import MultiAssignmentBase
 from loopy.diagnostic import warn, LoopyError
 
 
+#TODO does this work for class functions?
 __doc__ = """
 
 .. currentmodule:: loopy
 
+.. autofunction:: filter
+
 .. autofunction:: get_op_poly
 
 .. autofunction:: get_lmem_access_poly
@@ -49,10 +52,8 @@ __doc__ = """
 
 .. autofunction:: sum_mem_access_to_bytes
 .. autofunction:: reduce_mem_access_poly_fields
-.. autofunction:: filter_mem_access_poly_fields
 
 .. autofunction:: reduce_op_poly_fields
-.. autofunction:: filter_op_poly_fields
 
 .. autofunction:: get_synchronization_poly
 
@@ -107,6 +108,59 @@ class ToCountMap:
     def __repr__(self):
         return repr(self.dict)
 
+    def items(self):
+        return self.dict.items()
+
+    def filter(self, **kwargs):
+        """Remove items without specified key fields
+
+        :parameter **kwargs: Keyword arguments matching fields in the keys of
+                             the :class:`ToCountMap`, each given a list of
+                             allowable values for that key field.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` that match the field values
+                 passed
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_poly(knl)
+            filtered_map = mem_map.filter(direction=['load'],
+                                          variable=['a','g'])
+            tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        new_map = ToCountMap()
+
+        from loopy.types import to_loopy_type
+        if 'dtype' in kwargs.keys():
+            kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
+
+        # for each item in self.dict
+        for self_key, self_val in self.dict.items():
+            try:
+                # check to see if key attribute values match all filters
+                for arg_field, allowable_vals in kwargs.items():
+                    attr_val = getattr(self_key, arg_field)
+                    # see if the value is in the filter list
+                    if attr_val not in allowable_vals:
+                        print("DEBUG: "+str(attr_val)+" not in ", allowable_vals, ", removing.")
+                        break
+                else:  # loop terminated without break or error
+                    new_map.dict[self_key] = self_val
+            except(AttributeError):
+                # the field passed is not a field of this key
+                print("DEBUG: "+arg_field+" not in ", self_key, ", removing.") 
+                continue
+
+        return new_map
+
 # }}}
 
 
@@ -884,7 +938,7 @@ def get_op_poly(knl, numpy_types=True):
                 (Op(op.dtype.numpy_dtype, op.name), count)
                 for op, count in six.iteritems(result))
 
-    return result
+    return ToCountMap(result)
 # }}}
 
 
@@ -927,29 +981,25 @@ def sum_ops_to_dtypes(op_poly_dict):
 
     return result
 
-
+#TODO test deprecated functions?
 def get_lmem_access_poly(knl):
     """Count the number of local memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_lmem_access_poly is deprecated. "
-         "Instead, use get_mem_access_poly and then pass the result to "
-         "filter_mem_access_poly_fields with mtypes=['local'] option.",
+    warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and "
+         "filter the result with the mtype=['local'] option.",
          DeprecationWarning, stacklevel=2)
-    return filter_mem_access_poly_fields(
-                get_mem_access_poly(knl), mtypes=['local'])
+    return get_mem_access_poly(knl).filter(mtype=['local'])
 
 
 def get_DRAM_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. "
-         "Instead, use get_mem_access_poly and then pass the result to "
-         "filter_mem_access_poly_fields with mtypes=['global'] option.",
+    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
+         "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return filter_mem_access_poly_fields(
-                get_mem_access_poly(knl), mtypes=['global'])
+    return get_mem_access_poly(knl).filter(mtype=['global'])
 
 # {{{ get_gmem_access_poly
 
@@ -957,13 +1007,10 @@ def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_gmem_access_poly is deprecated. "
-         "Instead, use get_mem_access_poly and then pass the result to "
-         "filter_mem_access_poly_fields with mtypes=['global'] option.",
+    warn("get_gmem_access_poly is deprecated. Use get_mem_access_poly and "
+         "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return filter_mem_access_poly_fields(
-                get_mem_access_poly(knl), mtypes=['global'])
-
+    return get_mem_access_poly(knl).filter(mtype=['global'])
 
 # }}}
 
@@ -1091,10 +1138,10 @@ def get_mem_access_poly(knl, numpy_types=True):
                                  stride=mem_access.stride,
                                  direction=mem_access.direction,
                                  variable=mem_access.variable)
-                       , count)
+                                 , count)
                       for mem_access, count in six.iteritems(result))
 
-    return result
+    return ToCountMap(result)
 
 # {{{ sum_mem_access_to_bytes
 
@@ -1231,78 +1278,6 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
 
 # }}}
 
-# {{{ filter_mem_access_poly_fields
-
-def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None,
-                                  directions=None, variables=None):
-    """Take map returned from :func:`get_mem_access_poly` and remove items without specified MemAccess fields
-
-    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:**
-                  :class:`islpy.PwQPolynomial` **}**.
-
-    :parameter mtypes: A list of :class:`string` that specifies the memory type
-                      accessed as **global** or **local**
-
-    :parameter dtypes: A list of :class:`loopy.LoopyType` (or
-                      :class:`numpy.dtype`) that specifies the data type
-                      accessed.
-
-    :parameter strides: A list of :class:`int` specifies stride of the memory
-                       access. A stride of 0 indicates a uniform access (i.e.
-                       all threads access the same item).
-
-    :parameter directions: A list of :class:`string` that specifies the
-                          direction of memory access as **load** or **store**.
-
-    :parameter variables: A list of :class:`string` that specifies the variable
-                         name of the data accessed.
-
-
-    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
-               the :class:`loopy.LoopKernel` *inames*) for memory accesses
-               matching the fields passed as parameters.
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        mem_map = lp.get_mem_access_poly(knl)
-        filtered_map = lp.filter_mem_access_poly_fields(mem_map,
-                                                        directions=['load'],
-                                                        variables=['a','g'])
-        tot = lp.eval_and_sum_polys(filtered_map, params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    from loopy.types import to_loopy_type
-    if dtypes is not None:
-        dtypes_lp = [to_loopy_type(d) for d in dtypes]
-
-    result = {}
-
-    for k, v in m.items():
-        if (mtypes is None or k.mtype in mtypes) and \
-           (dtypes is None or k.dtype in dtypes_lp) and \
-           (strides is None or k.stride in strides) and \
-           (directions is None or k.direction in directions) and \
-           (variables is None or k.variable in variables):
-
-            new_key = MemAccess(k.mtype, k.dtype, k.stride, k.direction, k.variable)
-
-            if new_key in result:
-                result[new_key] += m[k]
-            else:
-                result[new_key] = m[k]
-
-    return result
-
-# }}}
-
 # {{{ reduce_op_poly_fields
 
 def reduce_op_poly_fields(m, dtype=True, name=True):
@@ -1360,62 +1335,6 @@ def reduce_op_poly_fields(m, dtype=True, name=True):
 
 # }}}
 
-# {{{ filter_op_poly_fields
-
-def filter_op_poly_fields(m, dtypes=None, names=None):
-    """Take map returned from :func:`get_op_poly` and remove items without specified Op fields
-
-    :parameter m: A mapping of **{** :class:`loopy.Op` **:**
-                  :class:`islpy.PwQPolynomial` **}**.
-
-    :parameter dtypes: A list of :class:`loopy.LoopyType` (or
-                      :class:`numpy.dtype`) that specifies the data type
-                      operated on.
-
-    :parameter names: A list of :class:`string` that specifies the kind of
-                      arithmetic operation as *add*, *sub*, *mul*, *div*,
-                      *pow*, *shift*, *bw* (bitwise), etc.
-
-    :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
-               the :class:`loopy.LoopKernel` *inames*) for arithmetic ops
-               matching the fields passed as parameters.
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        op_map = lp.get_op_poly(knl)
-        filtered_map = lp.filter_op_poly_fields(op_map, names=['add', 'sub'])
-        tot_addsub = lp.eval_and_sum_polys(filtered_map, params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    from loopy.types import to_loopy_type
-    if dtypes is not None:
-        dtypes_lp = [to_loopy_type(d) for d in dtypes]
-
-    result = {}
-
-    for k, v in m.items():
-        if (dtypes is None or k.dtype in dtypes_lp) and \
-           (names is None or k.name in names):
-
-            new_key = Op(k.dtype, k.name)
-
-            if new_key in result:
-                result[new_key] += m[k]
-            else:
-                result[new_key] = m[k]
-
-    return result
-
-# }}}
-
 def sum_polys(m):
     total = isl.PwQPolynomial('{ 0 }')
     for k, v in m.items():
diff --git a/test/test_statistics.py b/test/test_statistics.py
index a853e8c30..eb51fbfd0 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -371,7 +371,7 @@ def test_gmem_access_counter_specialops():
     assert f32 == n*m*l
     assert f64 == n*m
 
-    filtered_map = lp.filter_mem_access_poly_fields(poly, directions=['load'], variables=['a','g'])
+    filtered_map = poly.filter(direction=['load'], variable=['a','g'])
     tot = lp.eval_and_sum_polys(filtered_map, params)
     assert tot == n*m*l + n*m
 
@@ -744,29 +744,27 @@ def test_summations_and_filters():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
+    mem_map = lp.get_mem_access_poly(knl)
+
     loads_a = lp.eval_and_sum_polys(
-                    lp.filter_mem_access_poly_fields(
-                        lp.get_mem_access_poly(knl),
-                        directions=['load'], variables=['a']),
+                    mem_map.filter(direction=['load'], variable=['a']),
                     params)
     assert loads_a == 2*n*m*l
 
     global_stores = lp.eval_and_sum_polys(
-                        lp.filter_mem_access_poly_fields(
-                            lp.get_mem_access_poly(knl),
-                            mtypes=['global'], directions=['store']),
+                        mem_map.filter(mtype=['global'], direction=['store']),
                         params)
     assert global_stores == n*m*l + n*m
 
-    bytes_map = lp.sum_mem_access_to_bytes(lp.get_mem_access_poly(knl))
+    bytes_map = lp.sum_mem_access_to_bytes(mem_map)
     s0load = bytes_map[('global', 0, 'load')].eval_with_dict(params)
     s0store = bytes_map[('global', 0, 'store')].eval_with_dict(params)
     assert s0load == 4*n*m*l*3 + 8*n*m*2
     assert s0store == 4*n*m*l + 8*n*m
 
     # ignore stride and variable names in this map
-    reduced_map = lp.reduce_mem_access_poly_fields(lp.get_mem_access_poly(knl),
-                                                  stride=False, variable=False)
+    reduced_map = lp.reduce_mem_access_poly_fields(mem_map, stride=False,
+                                                   variable=False)
     f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
@@ -774,7 +772,9 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-    poly_dtype = lp.sum_ops_to_dtypes(lp.get_op_poly(knl))
+    op_map = lp.get_op_poly(knl)
+
+    poly_dtype = lp.sum_ops_to_dtypes(op_map)
     f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
     f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
     i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
@@ -782,25 +782,25 @@ def test_summations_and_filters():
     assert f64 == n*m
     assert i32 == n*m*2
 
-    addsub_all = lp.eval_and_sum_polys(
-                        lp.filter_op_poly_fields(
-                            lp.get_op_poly(knl), names=['add', 'sub']),
-                        params)
-    f32ops_all = lp.eval_and_sum_polys(
-                        lp.filter_op_poly_fields(
-                            lp.get_op_poly(knl), dtypes=[np.float32]),
-                        params)
+    addsub_all = lp.eval_and_sum_polys(op_map.filter(name=['add', 'sub']),
+                                       params)
+    f32ops_all = lp.eval_and_sum_polys(op_map.filter(dtype=[np.float32]),
+                                       params)
     assert addsub_all == n*m*l + n*m*2
     assert f32ops_all == n*m*l*3
 
-    ops_nodtype = lp.reduce_op_poly_fields(lp.get_op_poly(knl), dtype=False)
-    ops_noname = lp.reduce_op_poly_fields(lp.get_op_poly(knl), name=False)
+    non_field = lp.eval_and_sum_polys(op_map.filter(xxx=[np.float32]), params)
+    assert non_field == 0
+
+    ops_nodtype = lp.reduce_op_poly_fields(op_map, dtype=False)
+    ops_noname = lp.reduce_op_poly_fields(op_map, name=False)
     mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
     f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
     assert mul_all == n*m*l + n*m
     assert f64ops_all == n*m
 
 
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From a699f3232afd915b59f63a70bbeb982c43bc511c Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Mon, 31 Oct 2016 00:38:47 -0500
Subject: [PATCH 26/55] replaced reduce_mem_access and reduce_op functions with
 ToCountMap group_by member function

---
 loopy/__init__.py       |   8 +-
 loopy/statistics.py     | 238 +++++++++++++++-------------------------
 test/test_statistics.py |  22 ++--
 3 files changed, 99 insertions(+), 169 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 80c266ba4..8b10edf19 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -115,9 +115,7 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
         get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
         get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
-        sum_mem_access_to_bytes, reduce_mem_access_poly_fields,
-        reduce_op_poly_fields,
-        sum_polys, eval_and_sum_polys,
+        sum_mem_access_to_bytes, sum_polys, eval_and_sum_polys,
         get_synchronization_poly, gather_access_footprints,
         gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -225,9 +223,7 @@ __all__ = [
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
         "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
-        "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields",
-        "reduce_op_poly_fields",
-        "sum_polys", "eval_and_sum_polys",
+        "sum_mem_access_to_bytes", "sum_polys", "eval_and_sum_polys",
         "get_synchronization_poly", "gather_access_footprints",
         "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index b5e37d2d0..d251e249b 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -41,7 +41,8 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
-.. autofunction:: filter
+.. autofunction:: filter_by
+.. autofunction:: group_by
 
 .. autofunction:: get_op_poly
 
@@ -91,7 +92,7 @@ class ToCountMap:
         if isinstance(other, isl.PwQPolynomial):
             return ToCountMap(dict(
                 (index, self.dict[index]*other)
-                for index in self.dict.keys()))
+                for index in self.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
                                 "ToCountMap by {0} {1}."
@@ -105,13 +106,19 @@ class ToCountMap:
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
+    def __setitem__(self, index, value):
+        self.dict[index] = value
+
     def __repr__(self):
         return repr(self.dict)
 
     def items(self):
         return self.dict.items()
 
-    def filter(self, **kwargs):
+    def keys(self):
+        return self.dict.keys()
+
+    def filter_by(self, **kwargs):
         """Remove items without specified key fields
 
         :parameter **kwargs: Keyword arguments matching fields in the keys of
@@ -128,7 +135,7 @@ class ToCountMap:
 
             params = {'n': 512, 'm': 256, 'l': 128}
             mem_map = lp.get_mem_access_poly(knl)
-            filtered_map = mem_map.filter(directions=['load'],
+            filtered_map = mem_map.filter_by(directions=['load'],
                                           variables=['a','g'])
             tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params)
 
@@ -136,30 +143,92 @@ class ToCountMap:
 
         """
 
-        new_map = ToCountMap()
+        result_map = ToCountMap()
 
         from loopy.types import to_loopy_type
         if 'dtype' in kwargs.keys():
             kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
 
         # for each item in self.dict
-        for self_key, self_val in self.dict.items():
+        for self_key, self_val in self.items():
             try:
                 # check to see if key attribute values match all filters
                 for arg_field, allowable_vals in kwargs.items():
                     attr_val = getattr(self_key, arg_field)
                     # see if the value is in the filter list
                     if attr_val not in allowable_vals:
-                        print("DEBUG: "+str(attr_val)+" not in ", allowable_vals, ", removing.")
                         break
                 else:  # loop terminated without break or error
-                    new_map.dict[self_key] = self_val
+                    result_map.dict[self_key] = self_val
             except(AttributeError):
                 # the field passed is not a field of this key
-                print("DEBUG: "+arg_field+" not in ", self_key, ", removing.") 
                 continue
 
-        return new_map
+        return result_map
+
+    def group_by(self, *args):
+        """Group map items together, distinguishing by only the key fields passed in args
+
+        :parameter args: Zero or more :class:`string` fields of map keys
+
+        :return: A :class:`ToCountMap` containing the same total counts
+                 grouped together by new keys that only contain the fields
+                 specified in the arguments passed.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = get_mem_access_poly(knl)
+            grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+            all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global',
+                                                             dtype=np.float32,
+                                                             direction='load')
+                                                  ].eval_with_dict(params)
+            all_f32_global_stores = grouped_mem_map[MemAccess(mtype='global',
+                                                              dtype=np.float32,
+                                                              direction='store')
+                                                   ].eval_with_dict(params)
+            all_f32_local_loads = grouped_mem_map[MemAccess(mtype='local',
+                                                            dtype=np.float32,
+                                                            direction='load')
+                                                 ].eval_with_dict(params)
+            all_f32_local_stores = grouped_mem_map[MemAccess(mtype='local',
+                                                             dtype=np.float32,
+                                                             direction='store')
+                                                  ].eval_with_dict(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # make sure all item keys have same type
+        if self.dict:
+            key_type = type(list(self.keys())[0])
+            if not all(isinstance(x, key_type) for x in self.keys()):
+                raise ValueError("ToCountMap: group_by() function may only "
+                                 "be used on ToCountMaps with uniform keys")
+        else:
+            return result_map
+
+        # for each item in self.dict
+        for self_key, self_val in self.items():
+            new_key = key_type()
+
+            # set all specified fields
+            for field in args:
+                setattr(new_key, field, getattr(self_key, field))
+
+            if new_key in result_map.keys():
+                result_map[new_key] += self_val
+            else:
+                result_map[new_key] = self_val
+
+        return result_map
 
 # }}}
 
@@ -241,7 +310,6 @@ class MemAccess:
 
     """
 
-    #TODO currently counting all lmem access as stride None
     def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None):
         self.mtype = mtype
         self.stride = stride
@@ -253,6 +321,11 @@ class MemAccess:
             from loopy.types import to_loopy_type
             self.dtype = to_loopy_type(dtype)
 
+        #TODO currently counting all lmem access as stride None
+        if (mtype == 'local') and (stride is not None):
+            raise NotImplementedError("MemAccess: stride must be None when "
+                                      "mtype is 'local'")
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 (self.mtype is None or other.mtype is None or
@@ -989,7 +1062,7 @@ def get_lmem_access_poly(knl):
     warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['local'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter(mtypes=['local'])
+    return get_mem_access_poly(knl).filter_by(mtypes=['local'])
 
 
 def get_DRAM_access_poly(knl):
@@ -999,7 +1072,7 @@ def get_DRAM_access_poly(knl):
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter(mtypes=['global'])
+    return get_mem_access_poly(knl).filter_by(mtypes=['global'])
 
 # {{{ get_gmem_access_poly
 
@@ -1010,7 +1083,7 @@ def get_gmem_access_poly(knl):
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter(mtypes=['global'])
+    return get_mem_access_poly(knl).filter_by(mtypes=['global'])
 
 # }}}
 
@@ -1198,143 +1271,6 @@ def sum_mem_access_to_bytes(m):
 
 # }}}
 
-# {{{ reduce_mem_access_poly_fields
-
-def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True,
-                                  direction=True, variable=True):
-    """Take map returned from :func:`get_mem_access_poly`, remove specified MemAccess fields from keys, and combine counts
-
-    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:**
-                  :class:`islpy.PwQPolynomial` **}**.
-
-    :parameter mtype: A :class:`boolean` specifying whether keys in returned
-                      map will include the memory type.
-
-    :parameter dtype: A :class:`boolean` specifying whether keys in returned
-                      map will include the data type.
-
-    :parameter stride: A :class:`boolean` specifying whether keys in returned
-                       map will include the stride.
-
-    :parameter direction: A :class:`boolean` specifying whether keys in
-                          returned map will include the direction.
-
-    :parameter variable: A :class:`boolean` specifying whether keys in returned
-                         map will include the variable name.
-
-
-    :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
-               the :class:`loopy.LoopKernel` *inames*) for memory accesses
-               categorized by the fields not set to False.
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        mem_map = get_mem_access_poly(knl)
-        reduced_mem_map = reduce_mem_access_poly_fields(mem_map, stride=False,
-                                                        variable=False)
-
-        all_f32_global_loads = reduced_mem_map[MemAccess('global', np.float32,
-                                                         direction='load')
-                                              ].eval_with_dict(params)
-        all_f32_global_stores = reduced_mem_map[MemAccess('global', np.float32,
-                                                          direction='store')
-                                               ].eval_with_dict(params)
-        all_f32_local_loads = reduced_mem_map[MemAccess('local', np.float32,
-                                                        direction='load')
-                                             ].eval_with_dict(params)
-        all_f32_local_stores = reduced_mem_map[MemAccess('local', np.float32,
-                                                         direction='store')
-                                              ].eval_with_dict(params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    result = {}
-    for k, v in m.items():
-        new_key = MemAccess()
-        if mtype == True:
-            new_key.mtype = k.mtype
-        if dtype == True:
-            new_key.dtype = k.dtype
-        if stride == True:
-            new_key.stride = k.stride
-        if direction == True:
-            new_key.direction = k.direction
-        if variable == True:
-            new_key.variable = k.variable
-
-        if new_key in result:
-            result[new_key] += m[k]
-        else:
-            result[new_key] = m[k]
-
-    return result
-
-# }}}
-
-# {{{ reduce_op_poly_fields
-
-def reduce_op_poly_fields(m, dtype=True, name=True):
-    """Take map returned from :func:`get_op_poly`, remove specified Op fields from keys, and combine counts
-
-    :parameter m: A mapping of **{** :class:`loopy.Op` **:**
-                  :class:`islpy.PwQPolynomial` **}**.
-
-    :parameter dtype: A :class:`boolean` specifying whether keys in returned
-                      map will include the data type.
-
-    :parameter name: A :class:`boolean` specifying whether keys in returned
-                     map will include the name of the operation.
-
-    :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The :class:`islpy.PwQPolynomial` holds the counts (in terms of
-               the :class:`loopy.LoopKernel` *inames*) for arithmetic ops
-               categorized by the fields not set to False.
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        op_map = get_op_poly(knl)
-        reduced_op_map = reduce_op_fields(op_map, name=False)
-
-        all_f32_ops = reduced_op_map[Op(dtype=np.float32)].eval_with_dict(params)
-        all_f64_ops = reduced_op_map[Op(dtype=np.float64)].eval_with_dict(params)
-
-        reduced_op_map = reduce_op_fields(op_map, dtype=False)
-
-        all_add_ops = reduced_op_map[Op(name='add')].eval_with_dict(params)
-        all_mul_ops = reduced_op_map[Op(name='mul')].eval_with_dict(params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    result = {}
-    for k, v in m.items():
-        new_key = Op()
-        if dtype == True:
-            new_key.dtype = k.dtype
-        if name == True:
-            new_key.name = k.name
-
-        if new_key in result:
-            result[new_key] += m[k]
-        else:
-            result[new_key] = m[k]
-
-    return result
-
-# }}}
-
 def sum_polys(m):
     total = isl.PwQPolynomial('{ 0 }')
     for k, v in m.items():
diff --git a/test/test_statistics.py b/test/test_statistics.py
index eb51fbfd0..3657721e4 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -311,8 +311,7 @@ def test_gmem_access_counter_logic():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    reduced_map = lp.reduce_mem_access_poly_fields(poly, stride=False,
-                                                    variable=False)
+    reduced_map = poly.group_by('mtype', 'dtype', 'direction')
 
     f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                                        direction='load')
@@ -371,7 +370,7 @@ def test_gmem_access_counter_specialops():
     assert f32 == n*m*l
     assert f64 == n*m
 
-    filtered_map = poly.filter(direction=['load'], variable=['a','g'])
+    filtered_map = poly.filter_by(direction=['load'], variable=['a','g'])
     tot = lp.eval_and_sum_polys(filtered_map, params)
     assert tot == n*m*l + n*m
 
@@ -747,12 +746,12 @@ def test_summations_and_filters():
     mem_map = lp.get_mem_access_poly(knl)
 
     loads_a = lp.eval_and_sum_polys(
-                    mem_map.filter(direction=['load'], variable=['a']),
+                    mem_map.filter_by(direction=['load'], variable=['a']),
                     params)
     assert loads_a == 2*n*m*l
 
     global_stores = lp.eval_and_sum_polys(
-                        mem_map.filter(mtype=['global'], direction=['store']),
+                        mem_map.filter_by(mtype=['global'], direction=['store']),
                         params)
     assert global_stores == n*m*l + n*m
 
@@ -763,8 +762,7 @@ def test_summations_and_filters():
     assert s0store == 4*n*m*l + 8*n*m
 
     # ignore stride and variable names in this map
-    reduced_map = lp.reduce_mem_access_poly_fields(mem_map, stride=False,
-                                                   variable=False)
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
     f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
@@ -782,18 +780,18 @@ def test_summations_and_filters():
     assert f64 == n*m
     assert i32 == n*m*2
 
-    addsub_all = lp.eval_and_sum_polys(op_map.filter(name=['add', 'sub']),
+    addsub_all = lp.eval_and_sum_polys(op_map.filter_by(name=['add', 'sub']),
                                        params)
-    f32ops_all = lp.eval_and_sum_polys(op_map.filter(dtype=[np.float32]),
+    f32ops_all = lp.eval_and_sum_polys(op_map.filter_by(dtype=[np.float32]),
                                        params)
     assert addsub_all == n*m*l + n*m*2
     assert f32ops_all == n*m*l*3
 
-    non_field = lp.eval_and_sum_polys(op_map.filter(xxx=[np.float32]), params)
+    non_field = lp.eval_and_sum_polys(op_map.filter_by(xxx=[np.float32]), params)
     assert non_field == 0
 
-    ops_nodtype = lp.reduce_op_poly_fields(op_map, dtype=False)
-    ops_noname = lp.reduce_op_poly_fields(op_map, name=False)
+    ops_nodtype = op_map.group_by('name')
+    ops_noname = op_map.group_by('dtype')
     mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
     f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
     assert mul_all == n*m*l + n*m
-- 
GitLab


From 8c7194c113c364343e9ec4de698b6ab1ca104196 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 1 Nov 2016 00:27:32 -0500
Subject: [PATCH 27/55] added to_bytes, sum, and eval_and_sum member functions
 to ToCountMap, removed previous/redundant functions

---
 loopy/__init__.py       |  11 +-
 loopy/statistics.py     | 314 ++++++++++++++++++++--------------------
 test/test_statistics.py |  55 ++++---
 3 files changed, 187 insertions(+), 193 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8b10edf19..f505759ac 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -113,11 +113,9 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
-        get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly,
-        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly,
-        sum_mem_access_to_bytes, sum_polys, eval_and_sum_polys,
-        get_synchronization_poly, gather_access_footprints,
-        gather_access_footprint_bytes)
+        get_op_poly, get_lmem_access_poly, get_DRAM_access_poly,
+        get_gmem_access_poly, get_mem_access_poly, get_synchronization_poly,
+        gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
         generate_code, generate_code_v2, generate_body)
@@ -221,9 +219,8 @@ __all__ = [
         "generate_code", "generate_code_v2", "generate_body",
 
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
-        "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly",
+        "get_lmem_access_poly", "get_DRAM_access_poly",
         "get_gmem_access_poly", "get_mem_access_poly",
-        "sum_mem_access_to_bytes", "sum_polys", "eval_and_sum_polys",
         "get_synchronization_poly", "gather_access_footprints",
         "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index d251e249b..6efb61da1 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -43,6 +43,9 @@ __doc__ = """
 
 .. autofunction:: filter_by
 .. autofunction:: group_by
+.. autofunction:: to_bytes
+.. autofunction:: sum
+.. autofunction:: eval_and_sum
 
 .. autofunction:: get_op_poly
 
@@ -51,11 +54,6 @@ __doc__ = """
 .. autofunction:: get_gmem_access_poly
 .. autofunction:: get_mem_access_poly
 
-.. autofunction:: sum_mem_access_to_bytes
-.. autofunction:: reduce_mem_access_poly_fields
-
-.. autofunction:: reduce_op_poly_fields
-
 .. autofunction:: get_synchronization_poly
 
 .. autofunction:: gather_access_footprints
@@ -112,12 +110,18 @@ class ToCountMap:
     def __repr__(self):
         return repr(self.dict)
 
+    def __len__(self):
+        return len(self.dict)
+
     def items(self):
         return self.dict.items()
 
     def keys(self):
         return self.dict.keys()
 
+    def copy(self):
+        return ToCountMap(dict(self.dict))
+
     def filter_by(self, **kwargs):
         """Remove items without specified key fields
 
@@ -135,9 +139,9 @@ class ToCountMap:
 
             params = {'n': 512, 'm': 256, 'l': 128}
             mem_map = lp.get_mem_access_poly(knl)
-            filtered_map = mem_map.filter_by(directions=['load'],
-                                          variables=['a','g'])
-            tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
 
             # (now use these counts to predict performance)
 
@@ -200,6 +204,13 @@ class ToCountMap:
                                                              direction='store')
                                                   ].eval_with_dict(params)
 
+            op_map = get_op_poly(knl)
+            ops_by_dtype = op_map.group_by('dtype')
+
+            f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params)
+            f64ops = ops_by_dtype[Op(dtype=np.float64)].eval_with_dict(params)
+            i32ops = ops_by_dtype[Op(dtype=np.int32)].eval_with_dict(params)
+
             # (now use these counts to predict performance)
 
         """
@@ -230,6 +241,83 @@ class ToCountMap:
 
         return result_map
 
+    def to_bytes(self):
+        """Convert counts to bytes using data type in map key
+
+        :return: A :class:`ToCountMap` mapping each original key to a
+                 :class:`islpy.PwQPolynomial` with counts in bytes rather than
+                 instances.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            bytes_map = get_mem_access_poly(knl).to_bytes()
+            params = {'n': 512, 'm': 256, 'l': 128}
+
+            s1_global_ld_byt = bytes_map.filter_by(
+                                    mtype=['global'], stride=[1],
+                                    direction=['load']).eval_and_sum(params)
+            s2_global_ld_byt = bytes_map.filter_by(
+                                    mtype=['global'], stride=[2],
+                                    direction=['load']).eval_and_sum(params)
+            s1_global_st_byt = bytes_map.filter_by(
+                                    mtype=['global'], stride=[1],
+                                    direction=['store']).eval_and_sum(params)
+            s2_global_st_byt = bytes_map.filter_by(
+                                    mtype=['global'], stride=[2],
+                                    direction=['store']).eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result = self.copy()
+
+        for key, val in self.items():
+            bytes_processed = int(key.dtype.itemsize) * val
+            result[key] = bytes_processed
+
+        return result
+
+
+    def sum(self):
+        """Add all counts in ToCountMap
+
+        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts
+
+        """
+        total = isl.PwQPolynomial('{ 0 }')
+        for k, v in self.items():
+            if not isinstance(v, isl.PwQPolynomial):
+                raise ValueError("ToCountMap: sum() encountered type {0} but "
+                                 "may only be used on PwQPolynomials."
+                                 .format(type(v)))
+            total += v
+        return total
+
+
+    def eval_and_sum(self, params):
+        """Add all counts in ToCountMap and evaluate with provided parameters
+
+        :return: An :class:`integer` containing the sum of all counts in the
+                 :class:`ToCountMap` evaluated with the parameters provided
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_poly(knl)
+            filtered_map = mem_map.filter_by(direction=['load'],
+                                             variable=['a','g'])
+            tot_loads_a_g = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+        return self.sum().eval_with_dict(params)
+
 # }}}
 
 
@@ -358,7 +446,6 @@ class MemAccess:
         return hash(mtype+str(dtype)+str(stride)+direction+variable)
 
 
-
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CombineMapper):
@@ -385,7 +472,8 @@ class ExpressionOpCounter(CombineMapper):
 
     def map_call(self, expr):
         return ToCountMap(
-                    {Op(self.type_inf(expr), 'func:'+str(expr.function)): 1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
     # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
@@ -398,20 +486,21 @@ class ExpressionOpCounter(CombineMapper):
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
-                    {Op(self.type_inf(expr), 'add'): len(expr.children)-1}
+                    {Op(dtype=self.type_inf(expr),
+                        name='add'): len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
     def map_product(self, expr):
         from pymbolic.primitives import is_zero
         assert expr.children
-        return sum(ToCountMap({Op(self.type_inf(expr), 'mul'): 1})
+        return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
-                   ToCountMap({Op(self.type_inf(expr), 'mul'): -1})
+                   ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1})
 
     def map_quotient(self, expr, *args):
-        return ToCountMap({Op(self.type_inf(expr), 'div'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -419,25 +508,25 @@ class ExpressionOpCounter(CombineMapper):
     map_remainder = map_quotient
 
     def map_power(self, expr):
-        return ToCountMap({Op(self.type_inf(expr), 'pow'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
-        return ToCountMap({Op(self.type_inf(expr), 'shift'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
     map_right_shift = map_left_shift
 
     def map_bitwise_not(self, expr):
-        return ToCountMap({Op(self.type_inf(expr), 'bw'): 1}) \
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
-        return ToCountMap(
-                        {Op(self.type_inf(expr), 'bw'): len(expr.children)-1}
-                        ) + sum(self.rec(child) for child in expr.children)
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'):
+                           len(expr.children)-1}
+                         ) + sum(self.rec(child) for child in expr.children)
 
     map_bitwise_xor = map_bitwise_or
     map_bitwise_and = map_bitwise_or
@@ -466,8 +555,8 @@ class ExpressionOpCounter(CombineMapper):
                + self.rec(expr.else_)
 
     def map_min(self, expr):
-        return ToCountMap({Op(
-                          self.type_inf(expr), 'maxmin'): len(expr.children)-1}
+        return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'):
+                           len(expr.children)-1}
                          ) + sum(self.rec(child) for child in expr.children)
 
     map_max = map_min
@@ -524,7 +613,8 @@ class LocalSubscriptCounter(CombineMapper):
             #print("is local? ", array.is_local)
             if array.is_local:
                 return ToCountMap(
-                        {MemAccess('local', self.type_inf(expr)): 1}
+                        {MemAccess(mtype='local',
+                                   dtype=self.type_inf(expr)): 1}
                         ) + self.rec(expr.index)
 
         return self.rec(expr.index)
@@ -661,8 +751,9 @@ class GlobalSubscriptCounter(CombineMapper):
 
         if not local_id_found:
             # count as uniform access
-            return ToCountMap({MemAccess('global', self.type_inf(expr),
-                               stride=0, variable=name): 1}
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr), stride=0,
+                                         variable=name): 1}
                              ) + self.rec(expr.index)
 
         # get local_id associated with minimum tag axis
@@ -706,12 +797,13 @@ class GlobalSubscriptCounter(CombineMapper):
         #TODO temporary fix that needs changing:
         if min_tag_axis != 0:
             print("... min tag axis (%d) is not zero! ..." % (min_tag_axis))
-            return ToCountMap({MemAccess('global', self.type_inf(expr),
-                               stride=sys.maxsize, variable=name): 1}
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr),
+                                         stride=sys.maxsize, variable=name): 1}
                              ) + self.rec(expr.index)
 
-        return ToCountMap({MemAccess('global', self.type_inf(expr),
-                           stride=total_stride, variable=name): 1}
+        return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr),
+                                     stride=total_stride, variable=name): 1}
                          ) + self.rec(expr.index)
 
     def map_sum(self, expr):
@@ -1004,55 +1096,16 @@ def get_op_poly(knl, numpy_types=True):
                                         insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
         op_poly = op_poly + ops*count(knl, domain)
-    result = op_poly.dict
 
     if numpy_types:
-        result = dict(
-                (Op(op.dtype.numpy_dtype, op.name), count)
-                for op, count in six.iteritems(result))
-
-    return ToCountMap(result)
-# }}}
-
-
-def sum_ops_to_dtypes(op_poly_dict):
-    """Sum the mapping returned by :func:`get_op_poly` to a mapping that ignores arithmetic op type
-
-    :parameter op_poly_dict: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**.
+        op_poly.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+                             count)
+                for op, count in six.iteritems(op_poly.dict))
 
-    :return: A mapping of **{** :class:`loopy.LoopyType` **:** :class:`islpy.PwQPolynomial` **}**
+    return op_poly
 
-             - The :class:`loopy.LoopyType` specifies the data type operated on 
-
-             - The :class:`islpy.PwQPolynomial` holds the number of arithmetic
-               operations on the data type specified (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        op_map = get_op_poly(knl)
-        op_map_by_dtype = sum_ops_to_dtypes(op_map)
-        params = {'n': 512, 'm': 256, 'l': 128}
-
-        f32ops = op_map_by_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
-        f64ops = op_map_by_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
-        i32ops = op_map_by_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
-
-        # (now use these counts to predict performance)
-
-    """
-
-    result = {}
-    for op, v in op_poly_dict.items():
-        new_key = op.dtype
-        if new_key in result:
-            result[new_key] += v
-        else:
-            result[new_key] = v
+# }}}
 
-    return result
 
 #TODO test depricated functions?
 def get_lmem_access_poly(knl):
@@ -1062,7 +1115,7 @@ def get_lmem_access_poly(knl):
     warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['local'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtypes=['local'])
+    return get_mem_access_poly(knl).filter_by(mtype=['local'])
 
 
 def get_DRAM_access_poly(knl):
@@ -1072,7 +1125,8 @@ def get_DRAM_access_poly(knl):
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtypes=['global'])
+    return get_mem_access_poly(knl).filter_by(mtype=['global'])
+
 
 # {{{ get_gmem_access_poly
 
@@ -1083,10 +1137,11 @@ def get_gmem_access_poly(knl):
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtypes=['global'])
+    return get_mem_access_poly(knl).filter_by(mtype=['global'])
 
 # }}}
 
+
 def get_mem_access_poly(knl, numpy_types=True):
     """Count the number of memory accesses in a loopy kernel.
 
@@ -1114,22 +1169,26 @@ def get_mem_access_poly(knl, numpy_types=True):
         params = {'n': 512, 'm': 256, 'l': 128}
         mem_access_map = get_mem_access_poly(knl)
 
-        f32_stride1_g_loads_a = mem_access_map[MemAccess('global', np.float32,
+        f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global',
+                                                         dtype=np.float32,
                                                          stride=1,
                                                          direction='load',
                                                          variable='a')
                                               ].eval_with_dict(params)
-        f32_stride1_g_stores_a = mem_access_map[MemAccess('global', np.float32,
+        f32_stride1_g_stores_a = mem_access_map[MemAccess(mtype='global',
+                                                          dtype=np.float32,
                                                           stride=1,
                                                           direction='store',
                                                           variable='a')
                                                ].eval_with_dict(params)
-        f32_stride1_l_loads_x = mem_access_map[MemAccess('local', np.float32,
+        f32_stride1_l_loads_x = mem_access_map[MemAccess(mtype='local',
+                                                         dtype=np.float32,
                                                          stride=1,
                                                          direction='load',
                                                          variable='x')
                                               ].eval_with_dict(params)
-        f32_stride1_l_stores_x = mem_access_map[MemAccess('local', np.float32,
+        f32_stride1_l_stores_x = mem_access_map[MemAccess(mtype='local',
+                                                          dtype=np.float32,
                                                           stride=1,
                                                           direction='store',
                                                           variable='x')
@@ -1171,14 +1230,16 @@ def get_mem_access_poly(knl, numpy_types=True):
 
         # distinguish loads and stores
         for key in subs_expr.dict:
-            subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride,
-                                     direction='load', variable=key.variable)
+            subs_expr.dict[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                     stride=key.stride, direction='load',
+                                     variable=key.variable)
                           ] = subs_expr.dict.pop(key)
 
         subs_assignee_g = subs_counter_g(insn.assignee)
         for key in subs_assignee_g.dict:
-            subs_assignee_g.dict[MemAccess(key.mtype, key.dtype,
-                                           stride=key.stride, direction='store',
+            subs_assignee_g.dict[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                           stride=key.stride,
+                                           direction='store',
                                            variable=key.variable)
                                 ] = subs_assignee_g.dict.pop(key)
         # for now, don't count writes to local mem
@@ -1204,82 +1265,18 @@ def get_mem_access_poly(knl, numpy_types=True):
                 subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
             # for now, don't count writes to local mem
 
-    result = subs_poly.dict
+    #result = subs_poly.dict
 
     if numpy_types:
-        result = dict((MemAccess(mem_access.mtype, mem_access.dtype.numpy_dtype,
-                                 stride=mem_access.stride,
-                                 direction=mem_access.direction,
-                                 variable=mem_access.variable)
-                                 , count)
-                      for mem_access, count in six.iteritems(result))
-
-    return ToCountMap(result)
-
-# {{{ sum_mem_access_to_bytes
-
-def sum_mem_access_to_bytes(m):
-    """Convert counts returned by :func:`get_mem_access_poly` to bytes and sum across data types and variables
-
-    :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
-
-    :return: A mapping of **{(** :class:`string`**,** :class:`int` **,** :class:`string` **)**
-             **:** :class:`islpy.PwQPolynomial` **}**
-
-             - The first string in the key specifies the memory type as *global* or *local*
+        subs_poly.dict = dict((MemAccess(mtype=mem_access.mtype,
+                                         dtype=mem_access.dtype.numpy_dtype,
+                                         stride=mem_access.stride,
+                                         direction=mem_access.direction,
+                                         variable=mem_access.variable)
+                               , count)
+                      for mem_access, count in six.iteritems(subs_poly.dict))
 
-             - The integer in the key specifies the *stride*
-
-             - The second string in the key specifies the direction as *load* or *store*
-
-             - The :class:`islpy.PwQPolynomial` holds the aggregate transfer
-               size in bytes for memory accesses of all data types with the
-               characteristics specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *inames*).
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        mem_access_map = get_mem_access_poly(knl)
-        byte_totals_map = sum_mem_access_to_bytes(mem_access_map)
-        params = {'n': 512, 'm': 256, 'l': 128}
-
-        stride1_global_bytes_loaded = byte_totals_map[('global', 1, 'load')
-                                                     ].eval_with_dict(params)
-        stride2_global_bytes_loaded = byte_totals_map[('global', 2, 'load')
-                                                     ].eval_with_dict(params)
-        stride1_global_bytes_stored = byte_totals_map[('global', 1, 'store')
-                                                     ].eval_with_dict(params)
-        stride2_global_bytes_stored = byte_totals_map[('global', 2, 'store')
-                                                     ].eval_with_dict(params)
-
-        # (now use thess counts to predict performance)
-
-    """
-
-    result = {}
-    for mem_access, v in m.items():
-        new_key = (mem_access.mtype, mem_access.stride, mem_access.direction)
-        bytes_transferred = int(mem_access.dtype.itemsize) * v
-        if new_key in result:
-            result[new_key] += bytes_transferred
-        else:
-            result[new_key] = bytes_transferred
-
-    return result
-
-# }}}
-
-def sum_polys(m):
-    total = isl.PwQPolynomial('{ 0 }')
-    for k, v in m.items():
-        total += v
-    return total
-
-
-def eval_and_sum_polys(m, params):
-    return sum_polys(m).eval_with_dict(params)
+    return subs_poly
 
 
 # {{{ get_synchronization_poly
@@ -1356,7 +1353,8 @@ def get_synchronization_poly(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    return result.dict
+    #return result.dict #TODO is this okay?
+    return result
 
 # }}}
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3657721e4..7332d4ca5 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -82,8 +82,8 @@ def test_op_counter_reduction():
     f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
-    poly_dtype = lp.sum_ops_to_dtypes(poly)
-    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
+    poly_dtype = poly.group_by('dtype')
+    f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
     assert f32 == f32add + f32mul
 
 
@@ -286,11 +286,12 @@ def test_gmem_access_counter_reduction():
               ].eval_with_dict(params)
     assert f32s == n*l
 
-    poly_b = lp.sum_mem_access_to_bytes(poly)
-    s0load = poly_b[('global', 0, 'load')].eval_with_dict(params)
-    s0store = poly_b[('global', 0, 'store')].eval_with_dict(params)
-    assert s0load == 4*f32l
-    assert s0store == 4*f32s
+    ld_bytes = poly.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = poly.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*f32l
+    assert st_bytes == 4*f32s
 
 
 def test_gmem_access_counter_logic():
@@ -371,7 +372,8 @@ def test_gmem_access_counter_specialops():
     assert f64 == n*m
 
     filtered_map = poly.filter_by(direction=['load'], variable=['a','g'])
-    tot = lp.eval_and_sum_polys(filtered_map, params)
+    #tot = lp.eval_and_sum_polys(filtered_map, params)
+    tot = filtered_map.eval_and_sum(params)
     assert tot == n*m*l + n*m
 
 def test_gmem_access_counter_bitwise():
@@ -745,21 +747,18 @@ def test_summations_and_filters():
 
     mem_map = lp.get_mem_access_poly(knl)
 
-    loads_a = lp.eval_and_sum_polys(
-                    mem_map.filter_by(direction=['load'], variable=['a']),
-                    params)
+    loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params)
     assert loads_a == 2*n*m*l
 
-    global_stores = lp.eval_and_sum_polys(
-                        mem_map.filter_by(mtype=['global'], direction=['store']),
-                        params)
+    global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params)
     assert global_stores == n*m*l + n*m
 
-    bytes_map = lp.sum_mem_access_to_bytes(mem_map)
-    s0load = bytes_map[('global', 0, 'load')].eval_with_dict(params)
-    s0store = bytes_map[('global', 0, 'store')].eval_with_dict(params)
-    assert s0load == 4*n*m*l*3 + 8*n*m*2
-    assert s0store == 4*n*m*l + 8*n*m
+    ld_bytes = poly.filter_by(mtype=['global'], direction=['load']
+                             ).to_bytes().eval_and_sum(params)
+    st_bytes = poly.filter_by(mtype=['global'], direction=['store']
+                             ).to_bytes().eval_and_sum(params)
+    assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
+    assert st_bytes == 4*n*m*l + 8*n*m
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -771,23 +770,23 @@ def test_summations_and_filters():
     assert f64lall == 2*n*m
 
     op_map = lp.get_op_poly(knl)
+    #for k, v in op_map.items():
+    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
-    poly_dtype = lp.sum_ops_to_dtypes(op_map)
-    f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params)
-    f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params)
-    i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params)
+    poly_dtype = op_map.group_by('dtype')
+    f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    f64 = poly_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    i32 = poly_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
     assert f32 == n*m*l*3
     assert f64 == n*m
     assert i32 == n*m*2
 
-    addsub_all = lp.eval_and_sum_polys(op_map.filter_by(name=['add', 'sub']),
-                                       params)
-    f32ops_all = lp.eval_and_sum_polys(op_map.filter_by(dtype=[np.float32]),
-                                       params)
+    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
+    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
     assert addsub_all == n*m*l + n*m*2
     assert f32ops_all == n*m*l*3
 
-    non_field = lp.eval_and_sum_polys(op_map.filter_by(xxx=[np.float32]), params)
+    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
     assert non_field == 0
 
     ops_nodtype = op_map.group_by('name')
-- 
GitLab


From 79017fe6aed2cf34ccd513bdc6e33893aeb456cd Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 1 Nov 2016 00:45:31 -0500
Subject: [PATCH 28/55] renamed variables in stats test

---
 loopy/statistics.py     |   7 +-
 test/test_statistics.py | 225 ++++++++++++++++++++--------------------
 2 files changed, 117 insertions(+), 115 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 6efb61da1..5f5408770 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -409,11 +409,16 @@ class MemAccess:
             from loopy.types import to_loopy_type
             self.dtype = to_loopy_type(dtype)
 
-        #TODO currently counting all lmem access as stride None
+        #TODO currently giving all lmem access stride=None
         if (mtype == 'local') and (stride is not None):
             raise NotImplementedError("MemAccess: stride must be None when "
                                       "mtype is 'local'")
 
+        #TODO currently giving all lmem access variable=None
+        if (mtype == 'local') and (variable is not None):
+            raise NotImplementedError("MemAccess: variable must be None when "
+                                      "mtype is 'local'")
+
     def __eq__(self, other):
         return isinstance(other, MemAccess) and (
                 (self.mtype is None or other.mtype is None or
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 7332d4ca5..685406fee 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -48,16 +48,16 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
-    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
-    f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params)
-    f64mul = poly[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
-    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32add == f32mul == f32div == n*m*l
     assert f64mul == n*m
     assert i32add == n*m*2
@@ -73,17 +73,17 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
-    f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
     assert f32add == f32mul == n*m*l
 
-    poly_dtype = poly.group_by('dtype')
-    f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
     assert f32 == f32add + f32mul
 
 
@@ -99,15 +99,15 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
-    f64add = poly[lp.Op(np.float64, 'add')].eval_with_dict(params)
-    f64div = poly[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
-    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     assert f32mul == n*m
     assert f64div == 2*n*m  # TODO why?
     assert f64add == n*m
@@ -129,19 +129,19 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params)
-    f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params)
-    f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params)
-    f64pow = poly[lp.Op(np.float64, 'pow')].eval_with_dict(params)
-    f64add = poly[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
-    i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    f64rsq = poly[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
-    f64sin = poly[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
     assert f32div == 2*n*m*l
     assert f32mul == f32add == n*m*l
     assert f64add == 3*n*m
@@ -165,17 +165,17 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    poly = lp.get_op_poly(knl)
+    op_map = lp.get_op_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32add = poly[lp.Op(np.int32, 'add')].eval_with_dict(params)
-    i32bw = poly[lp.Op(np.int32, 'bw')].eval_with_dict(params)
-    i64bw = poly[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
-    i64mul = poly[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
-    i64add = poly[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
-    i64shift = poly[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
+    i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
     assert i32add == n*m+n*m*l
     assert i32bw == 2*n*m*l
     assert i64bw == 2*n*m
@@ -204,9 +204,9 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    poly = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')]
+    op_map = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
-    flops = poly.eval_with_dict(value_dict)
+    flops = op_map.eval_with_dict(value_dict)
 
     if expect_fallback:
         assert flops == 144
@@ -228,30 +228,30 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32l = poly[lp.MemAccess('global', np.float32,
+    f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32l += poly[lp.MemAccess('global', np.float32,
+    f32l += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64l = poly[lp.MemAccess('global', np.float64,
+    f64l = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64l += poly[lp.MemAccess('global', np.float64,
+    f64l += mem_map[lp.MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32l == 3*n*m*l
     assert f64l == 2*n*m
 
-    f32s = poly[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64s = poly[lp.MemAccess('global', np.dtype(np.float64),
+    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f32s == n*m*l
@@ -268,27 +268,27 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32l = poly[lp.MemAccess('global', np.float32,
+    f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32l += poly[lp.MemAccess('global', np.float32,
+    f32l += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
     assert f32l == 2*n*m*l
 
-    f32s = poly[lp.MemAccess('global', np.dtype(np.float32),
+    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
     assert f32s == n*l
 
-    ld_bytes = poly.filter_by(mtype=['global'], direction=['load']
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                              ).to_bytes().eval_and_sum(params)
-    st_bytes = poly.filter_by(mtype=['global'], direction=['store']
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                              ).to_bytes().eval_and_sum(params)
     assert ld_bytes == 4*f32l
     assert st_bytes == 4*f32s
@@ -306,13 +306,13 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    reduced_map = poly.group_by('mtype', 'dtype', 'direction')
+    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
 
     f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                                        direction='load')
@@ -342,36 +342,36 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f32 = poly[lp.MemAccess('global', np.float32,
+    f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    f32 += poly[lp.MemAccess('global', np.float32,
+    f32 += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    f64 = poly[lp.MemAccess('global', np.dtype(np.float64),
+    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='load', variable='g')
               ].eval_with_dict(params)
-    f64 += poly[lp.MemAccess('global', np.dtype(np.float64),
+    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert f32 == 2*n*m*l
     assert f64 == 2*n*m
 
-    f32 = poly[lp.MemAccess('global', np.float32,
+    f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    f64 = poly[lp.MemAccess('global', np.float64,
+    f64 = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
               ].eval_with_dict(params)
     assert f32 == n*m*l
     assert f64 == n*m
 
-    filtered_map = poly.filter_by(direction=['load'], variable=['a','g'])
+    filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g'])
     #tot = lp.eval_and_sum_polys(filtered_map, params)
     tot = filtered_map.eval_and_sum(params)
     assert tot == n*m*l + n*m
@@ -393,29 +393,29 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    i32 = poly[lp.MemAccess('global', np.int32, 
+    i32 = mem_map[lp.MemAccess('global', np.int32, 
                          stride=0, direction='load', variable='a')
               ].eval_with_dict(params)
-    i32 += poly[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='b')
                ].eval_with_dict(params)
-    i32 += poly[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
                           stride=0, direction='load', variable='g')
                ].eval_with_dict(params)
-    i32 += poly[lp.MemAccess('global', np.dtype(np.int32), 
+    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), 
                           stride=0, direction='load', variable='h')
                ].eval_with_dict(params)
     assert i32 == 4*n*m+2*n*m*l
 
-    i32 = poly[lp.MemAccess('global', np.int32, 
+    i32 = mem_map[lp.MemAccess('global', np.int32, 
                          stride=0, direction='store', variable='c')
               ].eval_with_dict(params)
-    i32 += poly[lp.MemAccess('global', np.int32, 
+    i32 += mem_map[lp.MemAccess('global', np.int32, 
                           stride=0, direction='store', variable='e')
                ].eval_with_dict(params)
     assert i32 == n*m+n*m*l
@@ -439,25 +439,25 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    poly = lp.get_mem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64uniform = poly[lp.MemAccess('global', np.float64, 
+    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
                                 stride=0, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64uniform += poly[lp.MemAccess('global', np.float64, 
+    f64uniform += mem_map[lp.MemAccess('global', np.float64, 
                                  stride=0, direction='load', variable='h')
                       ].eval_with_dict(params)
-    f32uniform = poly[lp.MemAccess('global', np.float32, 
+    f32uniform = mem_map[lp.MemAccess('global', np.float32, 
                                 stride=0, direction='load', variable='x')
                      ].eval_with_dict(params)
-    f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
                                   stride=Variable('m'), direction='load',
                                   variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                         ].eval_with_dict(params)
@@ -465,10 +465,10 @@ def test_gmem_access_counter_mixed():
     assert f32uniform == n*m*l/threads
     assert f32nonconsec == 3*n*m*l
 
-    f64uniform = poly[lp.MemAccess('global', np.float64, 
+    f64uniform = mem_map[lp.MemAccess('global', np.float64, 
                                 stride=0, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32nonconsec = poly[lp.MemAccess('global', np.float32, 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                        ].eval_with_dict(params)
@@ -492,35 +492,35 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    poly = lp.get_mem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_poly(knl)  # noqa
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    f64nonconsec = poly[lp.MemAccess('global', np.float64, 
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='load',
                                   variable='g')
                        ].eval_with_dict(params)
-    f64nonconsec += poly[lp.MemAccess('global', np.float64, 
+    f64nonconsec += mem_map[lp.MemAccess('global', np.float64, 
                                    stride=Variable('m'), direction='load',
                                    variable='h')
                         ].eval_with_dict(params)
-    f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), 
                                   stride=Variable('m')*Variable('l'),
                                   direction='load', variable='a')
                        ].eval_with_dict(params)
-    f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), 
+    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
                                    stride=Variable('m')*Variable('l'),
                                    direction='load', variable='b')
                         ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
     assert f32nonconsec == 3*n*m*l
 
-    f64nonconsec = poly[lp.MemAccess('global', np.float64, 
+    f64nonconsec = mem_map[lp.MemAccess('global', np.float64, 
                                   stride=Variable('m'), direction='store',
                                   variable='e')
                        ].eval_with_dict(params)
-    f32nonconsec = poly[lp.MemAccess('global', np.float32, 
+    f32nonconsec = mem_map[lp.MemAccess('global', np.float32, 
                                   stride=Variable('m')*Variable('l'),
                                   direction='store', variable='c')
                        ].eval_with_dict(params)
@@ -543,34 +543,34 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    poly = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    #for k in poly:
-    #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", poly[k])
+    #for k in mem_map:
+    #    print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k])
 
-    f64consec = poly[lp.MemAccess('global', np.float64, 
+    f64consec = mem_map[lp.MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='g')
                      ].eval_with_dict(params)
-    f64consec += poly[lp.MemAccess('global', np.float64, 
+    f64consec += mem_map[lp.MemAccess('global', np.float64, 
                         stride=1, direction='load', variable='h')
                      ].eval_with_dict(params)
-    f32consec = poly[lp.MemAccess('global', np.float32, 
+    f32consec = mem_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
-    f32consec += poly[lp.MemAccess('global', np.dtype(np.float32), 
+    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), 
                         stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
     assert f64consec == 2*n*m
     assert f32consec == 3*n*m*l
 
-    f64consec = poly[lp.MemAccess('global', np.float64, 
+    f64consec = mem_map[lp.MemAccess('global', np.float64, 
                         stride=1, direction='store', variable='e')
                      ].eval_with_dict(params)
-    f32consec = poly[lp.MemAccess('global', np.float32, 
+    f32consec = mem_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
     assert f64consec == n*m
@@ -591,13 +591,13 @@ def test_barrier_counter_nobarriers():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    sync_poly = lp.get_synchronization_poly(knl)
+    sync_map = lp.get_synchronization_poly(knl)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    assert len(sync_poly) == 1
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
+    assert len(sync_map) == 1
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
 
 
 def test_barrier_counter_barriers():
@@ -617,13 +617,13 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
-    poly = lp.get_synchronization_poly(knl)
-    print(poly)
+    map = lp.get_synchronization_poly(knl)
+    print(map)
     n = 512
     m = 256
     l = 128
     params = {'n': n, 'm': m, 'l': l}
-    barrier_count = poly["barrier_local"].eval_with_dict(params)
+    barrier_count = map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
 
@@ -647,10 +647,10 @@ def test_all_counters_parallel_matmul():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    sync_poly = lp.get_synchronization_poly(knl)
-    assert len(sync_poly) == 2
-    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
-    assert sync_poly["barrier_local"].eval_with_dict(params) == 2*m/16
+    sync_map = lp.get_synchronization_poly(knl)
+    assert len(sync_map) == 2
+    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
+    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
 
     op_map = lp.get_op_poly(knl)
     f32mul = op_map[
@@ -668,30 +668,28 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    subscript_map = lp.get_mem_access_poly(knl)
+    op_map = lp.get_mem_access_poly(knl)
 
-    f32coal = subscript_map[lp.MemAccess('global', np.float32, 
+    f32coal = op_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
                             ].eval_with_dict(params)
-    f32coal += subscript_map[lp.MemAccess('global', np.float32, 
+    f32coal += op_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='a')
                             ].eval_with_dict(params)
 
     assert f32coal == n*m+m*l
 
-    f32coal = subscript_map[lp.MemAccess('global', np.float32, 
+    f32coal = op_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='store', variable='c')
                             ].eval_with_dict(params)
 
     assert f32coal == n*l
 
-    local_subs_map = lp.get_mem_access_poly(knl)
-
-    local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32),
+    local_mem_map = lp.get_mem_access_poly(knl).filter_by(mtype=['local'])
+    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                  ].eval_with_dict(params)
-
-    assert local_subs_l == n*m*l*2
+    assert local_mem_l == n*m*l*2
 
 def test_gather_access_footprint():
     knl = lp.make_kernel(
@@ -739,7 +737,6 @@ def test_summations_and_filters():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    poly = lp.get_mem_access_poly(knl)
     n = 512
     m = 256
     l = 128
@@ -753,9 +750,9 @@ def test_summations_and_filters():
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params)
     assert global_stores == n*m*l + n*m
 
-    ld_bytes = poly.filter_by(mtype=['global'], direction=['load']
+    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                              ).to_bytes().eval_and_sum(params)
-    st_bytes = poly.filter_by(mtype=['global'], direction=['store']
+    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                              ).to_bytes().eval_and_sum(params)
     assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
     assert st_bytes == 4*n*m*l + 8*n*m
@@ -773,10 +770,10 @@ def test_summations_and_filters():
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
-    poly_dtype = op_map.group_by('dtype')
-    f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
-    f64 = poly_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
-    i32 = poly_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
+    op_map_dtype = op_map.group_by('dtype')
+    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
+    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
+    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
     assert f32 == n*m*l*3
     assert f64 == n*m
     assert i32 == n*m*2
-- 
GitLab


From bd7d74be15c7c8a85a4755f7ab71725c9fe8effb Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Tue, 1 Nov 2016 12:12:15 -0500
Subject: [PATCH 29/55] renamed get_xxx_poly functions to get_xxx_map
 (get_op_poly and get_synchronization_poly remain as deprecated wrappers)

---
 loopy/__init__.py       |  13 ++---
 loopy/statistics.py     | 104 ++++++++++++++++++++++++----------------
 test/test_statistics.py |  44 ++++++++---------
 3 files changed, 91 insertions(+), 70 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index f505759ac..a2e403b40 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -113,8 +113,9 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
-        get_op_poly, get_lmem_access_poly, get_DRAM_access_poly,
-        get_gmem_access_poly, get_mem_access_poly, get_synchronization_poly,
+        get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly,
+        get_gmem_access_poly, get_mem_access_map,
+        get_synchronization_poly, get_synchronization_map,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
         PreambleInfo,
@@ -219,10 +220,10 @@ __all__ = [
         "generate_code", "generate_code_v2", "generate_body",
 
         "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
-        "get_lmem_access_poly", "get_DRAM_access_poly",
-        "get_gmem_access_poly", "get_mem_access_poly",
-        "get_synchronization_poly", "gather_access_footprints",
-        "gather_access_footprint_bytes",
+        "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly",
+        "get_gmem_access_poly", "get_mem_access_map",
+        "get_synchronization_poly", "get_synchronization_map",
+        "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5f5408770..b664e1f90 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -48,13 +48,15 @@ __doc__ = """
 .. autofunction:: eval_and_sum
 
 .. autofunction:: get_op_poly
+.. autofunction:: get_op_map
 
 .. autofunction:: get_lmem_access_poly
 .. autofunction:: get_DRAM_access_poly
 .. autofunction:: get_gmem_access_poly
-.. autofunction:: get_mem_access_poly
+.. autofunction:: get_mem_access_map
 
 .. autofunction:: get_synchronization_poly
+.. autofunction:: get_synchronization_map
 
 .. autofunction:: gather_access_footprints
 .. autofunction:: gather_access_footprint_bytes
@@ -138,7 +140,7 @@ class ToCountMap:
             # (first create loopy kernel and specify array data types)
 
             params = {'n': 512, 'm': 256, 'l': 128}
-            mem_map = lp.get_mem_access_poly(knl)
+            mem_map = lp.get_mem_access_map(knl)
             filtered_map = mem_map.filter_by(direction=['load'],
                                              variable=['a','g'])
             tot_loads_a_g = filtered_map.eval_and_sum(params)
@@ -184,7 +186,7 @@ class ToCountMap:
             # (first create loopy kernel and specify array data types)
 
             params = {'n': 512, 'm': 256, 'l': 128}
-            mem_map = get_mem_access_poly(knl)
+            mem_map = get_mem_access_map(knl)
             grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction')
 
             all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global',
@@ -204,7 +206,7 @@ class ToCountMap:
                                                              direction='store')
                                                   ].eval_with_dict(params)
 
-            op_map = get_op_poly(knl)
+            op_map = get_op_map(knl)
             ops_by_dtype = op_map.group_by('dtype')
 
             f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params)
@@ -252,7 +254,7 @@ class ToCountMap:
 
             # (first create loopy kernel and specify array data types)
 
-            bytes_map = get_mem_access_poly(knl).to_bytes()
+            bytes_map = get_mem_access_map(knl).to_bytes()
             params = {'n': 512, 'm': 256, 'l': 128}
 
             s1_global_ld_byt = bytes_map.filter_by(
@@ -308,7 +310,7 @@ class ToCountMap:
             # (first create loopy kernel and specify array data types)
 
             params = {'n': 512, 'm': 256, 'l': 128}
-            mem_map = lp.get_mem_access_poly(knl)
+            mem_map = lp.get_mem_access_map(knl)
             filtered_map = mem_map.filter_by(direction=['load'],
                                              variable=['a','g'])
             tot_loads_a_g = filtered_map.eval_and_sum(params)
@@ -1056,6 +1058,18 @@ def count(kernel, set):
 
 def get_op_poly(knl, numpy_types=True):
 
+    """Count the number of operations in a loopy kernel.
+    """
+    from warnings import warn
+    warn("get_op_poly is deprecated. Use get_op_map instead.",
+         DeprecationWarning, stacklevel=2)
+    return get_op_map(knl, numpy_types)
+
+# }}}
+
+
+def get_op_map(knl, numpy_types=True):
+
     """Count the number of operations in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
@@ -1077,10 +1091,10 @@ def get_op_poly(knl, numpy_types=True):
 
         # (first create loopy kernel and specify array data types)
 
-        poly = get_op_poly(knl)
+        map = get_op_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = map[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
+        f32mul = map[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -1090,7 +1104,7 @@ def get_op_poly(knl, numpy_types=True):
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    op_poly = ToCountMap()
+    op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         # how many times is this instruction executed?
@@ -1100,16 +1114,14 @@ def get_op_poly(knl, numpy_types=True):
         domain = (inames_domain.project_out_except(
                                         insn_inames, [dim_type.set]))
         ops = op_counter(insn.assignee) + op_counter(insn.expression)
-        op_poly = op_poly + ops*count(knl, domain)
+        op_map = op_map + ops*count(knl, domain)
 
     if numpy_types:
-        op_poly.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+        op_map.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
                              count)
-                for op, count in six.iteritems(op_poly.dict))
+                for op, count in six.iteritems(op_map.dict))
 
-    return op_poly
-
-# }}}
+    return op_map
 
 
 #TODO test depricated functions?
@@ -1117,20 +1129,20 @@ def get_lmem_access_poly(knl):
     """Count the number of local memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and "
+    warn("get_lmem_access_poly is deprecated. Use get_mem_access_map and "
          "filter the result with the mtype=['local'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtype=['local'])
+    return get_mem_access_map(knl).filter_by(mtype=['local'])
 
 
 def get_DRAM_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
+    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtype=['global'])
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 
 # {{{ get_gmem_access_poly
@@ -1139,15 +1151,15 @@ def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
     """
     from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and "
+    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
          "filter the result with the mtype=['global'] option.",
          DeprecationWarning, stacklevel=2)
-    return get_mem_access_poly(knl).filter_by(mtype=['global'])
+    return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 # }}}
 
 
-def get_mem_access_poly(knl, numpy_types=True):
+def get_mem_access_map(knl, numpy_types=True):
     """Count the number of memory accesses in a loopy kernel.
 
     :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
@@ -1172,7 +1184,7 @@ def get_mem_access_poly(knl, numpy_types=True):
         # (first create loopy kernel and specify array data types)
 
         params = {'n': 512, 'm': 256, 'l': 128}
-        mem_access_map = get_mem_access_poly(knl)
+        mem_access_map = get_mem_access_map(knl)
 
         f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global',
                                                          dtype=np.float32,
@@ -1224,7 +1236,7 @@ def get_mem_access_poly(knl, numpy_types=True):
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-    subs_poly = ToCountMap()
+    subs_map = ToCountMap()
     subs_counter_g = GlobalSubscriptCounter(knl)
     subs_counter_l = LocalSubscriptCounter(knl)
 
@@ -1253,40 +1265,50 @@ def get_mem_access_poly(knl, numpy_types=True):
 
         # use count excluding local index tags for uniform accesses
         for key in subs_expr.dict:
-            poly = ToCountMap({key: subs_expr.dict[key]})
+            map = ToCountMap({key: subs_expr.dict[key]})
             if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
-                subs_poly = subs_poly \
-                            + poly*get_insn_count(knl, insn_inames, True)
+                subs_map = subs_map \
+                            + map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
                 #currently not counting stride of local mem access
 
         for key in subs_assignee_g.dict:
-            poly = ToCountMap({key: subs_assignee_g.dict[key]})
+            map = ToCountMap({key: subs_assignee_g.dict[key]})
             if isinstance(key.stride, int) and key.stride == 0:
-                subs_poly = subs_poly \
-                            + poly*get_insn_count(knl, insn_inames, True)
+                subs_map = subs_map \
+                            + map*get_insn_count(knl, insn_inames, True)
             else:
-                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
+                subs_map = subs_map + map*get_insn_count(knl, insn_inames)
             # for now, don't count writes to local mem
 
-    #result = subs_poly.dict
-
     if numpy_types:
-        subs_poly.dict = dict((MemAccess(mtype=mem_access.mtype,
+        subs_map.dict = dict((MemAccess(mtype=mem_access.mtype,
                                          dtype=mem_access.dtype.numpy_dtype,
                                          stride=mem_access.stride,
                                          direction=mem_access.direction,
                                          variable=mem_access.variable)
                                , count)
-                      for mem_access, count in six.iteritems(subs_poly.dict))
+                      for mem_access, count in six.iteritems(subs_map.dict))
 
-    return subs_poly
+    return subs_map
 
 
 # {{{ get_synchronization_poly
 
 def get_synchronization_poly(knl):
+    """Count the number of synchronization events each thread encounters in a
+    loopy kernel.
+    """
+    from warnings import warn
+    warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.",
+         DeprecationWarning, stacklevel=2)
+    return get_synchronization_map(knl)
+
+# }}}
+
+
+def get_synchronization_map(knl):
 
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
@@ -1304,9 +1326,9 @@ def get_synchronization_poly(knl):
 
         # (first create loopy kernel and specify array data types)
 
-        sync_poly = get_synchronization_poly(knl)
+        sync_map = get_synchronization_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = sync_poly['barrier_local'].eval_with_dict(params)
+        barrier_count = sync_map['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
@@ -1361,8 +1383,6 @@ def get_synchronization_poly(knl):
     #return result.dict #TODO is this okay?
     return result
 
-# }}}
-
 
 # {{{ gather_access_footprints
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 685406fee..3f03fa955 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -48,7 +48,7 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
@@ -73,7 +73,7 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
@@ -99,7 +99,7 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
@@ -129,7 +129,7 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
@@ -165,7 +165,7 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     n = 512
     m = 256
     l = 128
@@ -204,7 +204,7 @@ def test_op_counter_triangular_domain():
     else:
         expect_fallback = False
 
-    op_map = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')]
+    op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
@@ -228,7 +228,7 @@ def test_gmem_access_counter_basic():
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -268,7 +268,7 @@ def test_gmem_access_counter_reduction():
             name="matmul", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -306,7 +306,7 @@ def test_gmem_access_counter_logic():
             name="logic", assumptions="n,m,l >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -342,7 +342,7 @@ def test_gmem_access_counter_specialops():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -393,7 +393,7 @@ def test_gmem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -439,7 +439,7 @@ def test_gmem_access_counter_mixed():
     knl = lp.split_iname(knl, "j", threads)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -492,7 +492,7 @@ def test_gmem_access_counter_nonconsec():
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
-    mem_map = lp.get_mem_access_poly(knl)  # noqa
+    mem_map = lp.get_mem_access_map(knl)  # noqa
     n = 512
     m = 256
     l = 128
@@ -543,7 +543,7 @@ def test_gmem_access_counter_consec():
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
     n = 512
     m = 256
     l = 128
@@ -591,7 +591,7 @@ def test_barrier_counter_nobarriers():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
-    sync_map = lp.get_synchronization_poly(knl)
+    sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
     l = 128
@@ -617,7 +617,7 @@ def test_barrier_counter_barriers():
             )
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
     knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
-    map = lp.get_synchronization_poly(knl)
+    map = lp.get_synchronization_map(knl)
     print(map)
     n = 512
     m = 256
@@ -647,12 +647,12 @@ def test_all_counters_parallel_matmul():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    sync_map = lp.get_synchronization_poly(knl)
+    sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
     assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16
 
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     f32mul = op_map[
                         lp.Op(np.float32, 'mul')
                         ].eval_with_dict(params)
@@ -668,7 +668,7 @@ def test_all_counters_parallel_matmul():
 
     assert f32mul+f32add == n*m*l*2
 
-    op_map = lp.get_mem_access_poly(knl)
+    op_map = lp.get_mem_access_map(knl)
 
     f32coal = op_map[lp.MemAccess('global', np.float32, 
                         stride=1, direction='load', variable='b')
@@ -685,7 +685,7 @@ def test_all_counters_parallel_matmul():
 
     assert f32coal == n*l
 
-    local_mem_map = lp.get_mem_access_poly(knl).filter_by(mtype=['local'])
+    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                  ].eval_with_dict(params)
@@ -742,7 +742,7 @@ def test_summations_and_filters():
     l = 128
     params = {'n': n, 'm': m, 'l': l}
 
-    mem_map = lp.get_mem_access_poly(knl)
+    mem_map = lp.get_mem_access_map(knl)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params)
     assert loads_a == 2*n*m*l
@@ -766,7 +766,7 @@ def test_summations_and_filters():
     assert f32lall == 3*n*m*l
     assert f64lall == 2*n*m
 
-    op_map = lp.get_op_poly(knl)
+    op_map = lp.get_op_map(knl)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
-- 
GitLab


From d5fa573fc2dd12858817088795aa3b3c853f947b Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 2 Nov 2016 14:16:49 -0500
Subject: [PATCH 30/55] doc fixes/improvements

---
 loopy/__init__.py   |  12 +--
 loopy/statistics.py | 238 ++++++++++++++++++++++++--------------------
 2 files changed, 138 insertions(+), 112 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index a2e403b40..c59c7bf8a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -112,9 +112,9 @@ from loopy.transform.parameter import assume, fix_parameters
 from loopy.preprocess import (preprocess_kernel, realize_reduction,
         infer_unknown_types)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
-from loopy.statistics import (stringify_stats_mapping, Op, MemAccess,
-        get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly,
-        get_gmem_access_poly, get_mem_access_map,
+from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op,
+        MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
+        get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map,
         get_synchronization_poly, get_synchronization_map,
         gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import (
@@ -219,9 +219,9 @@ __all__ = [
         "PreambleInfo",
         "generate_code", "generate_code_v2", "generate_body",
 
-        "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly",
-        "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly",
-        "get_gmem_access_poly", "get_mem_access_map",
+        "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess",
+        "get_op_poly", "get_op_map", "get_lmem_access_poly",
+        "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map",
         "get_synchronization_poly", "get_synchronization_map",
         "gather_access_footprints", "gather_access_footprint_bytes",
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index b664e1f90..ee0d867e6 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -41,11 +41,9 @@ __doc__ = """
 
 .. currentmodule:: loopy
 
-.. autofunction:: filter_by
-.. autofunction:: group_by
-.. autofunction:: to_bytes
-.. autofunciton:: sum
-.. autofunction:: eval_and_sum
+.. autoclass:: ToCountMap
+.. autoclass:: Op
+.. autoclass:: MemAccess
 
 .. autofunction:: get_op_poly
 .. autofunction:: get_op_map
@@ -67,7 +65,15 @@ __doc__ = """
 # {{{ ToCountMap
 
 class ToCountMap:
-    """Maps any type of key to an arithmetic type."""
+    """Maps any type of key to an arithmetic type.
+
+    .. automethod:: filter_by
+    .. automethod:: group_by
+    .. automethod:: to_bytes
+    .. automethod:: sum
+    .. automethod:: eval_and_sum
+
+    """
 
     def __init__(self, init_dict=None):
         if init_dict is None:
@@ -125,15 +131,15 @@ class ToCountMap:
         return ToCountMap(dict(self.dict))
 
     def filter_by(self, **kwargs):
-        """Remove items without specified key fields
+        """Remove items without specified key fields.
 
-        :parameter **kwargs: Keyword arguments matching fields in the keys of
+        :parameter \*\*kwargs: Keyword arguments matching fields in the keys of
                              the :class:`ToCountMap`, each given a list of
                              allowable values for that key field.
 
         :return: A :class:`ToCountMap` containing the subset of the items in
-                 the oriinal :class:`ToCountMap` that match the field values
-                 passed
+                 the original :class:`ToCountMap` that match the field values
+                 passed.
 
         Example usage::
 
@@ -173,9 +179,10 @@ class ToCountMap:
         return result_map
 
     def group_by(self, *args):
-        """Group map items together, distinguishing by only the key fields passed in args
+        """Group map items together, distinguishing by only the key fields
+           passed in args.
 
-        :parameter args: Zero or more :class:`string` fields of map keys
+        :parameter \*args: Zero or more :class:`str` fields of map keys.
 
         :return: A :class:`ToCountMap` containing the same total counts
                  grouped together by new keys that only contain the fields
@@ -187,31 +194,31 @@ class ToCountMap:
 
             params = {'n': 512, 'm': 256, 'l': 128}
             mem_map = get_mem_access_map(knl)
-            grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction')
-
-            all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global',
-                                                             dtype=np.float32,
-                                                             direction='load')
-                                                  ].eval_with_dict(params)
-            all_f32_global_stores = grouped_mem_map[MemAccess(mtype='global',
-                                                              dtype=np.float32,
-                                                              direction='store')
-                                                   ].eval_with_dict(params)
-            all_f32_local_loads = grouped_mem_map[MemAccess(mtype='local',
-                                                            dtype=np.float32,
-                                                            direction='load')
-                                                 ].eval_with_dict(params)
-            all_f32_local_stores = grouped_mem_map[MemAccess(mtype='local',
-                                                             dtype=np.float32,
-                                                             direction='store')
-                                                  ].eval_with_dict(params)
+            grouped_map = mem_map.group_by('mtype', 'dtype', 'direction')
+
+            f32_global_ld = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='load')
+                                       ].eval_with_dict(params)
+            f32_global_st = grouped_map[MemAccess(mtype='global',
+                                                  dtype=np.float32,
+                                                  direction='store')
+                                       ].eval_with_dict(params)
+            f32_local_ld = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='load')
+                                      ].eval_with_dict(params)
+            f32_local_st = grouped_map[MemAccess(mtype='local',
+                                                 dtype=np.float32,
+                                                 direction='store')
+                                      ].eval_with_dict(params)
 
             op_map = get_op_map(knl)
-            ops_by_dtype = op_map.group_by('dtype')
+            ops_dtype = op_map.group_by('dtype')
 
-            f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params)
-            f64ops = ops_by_dtype[Op(dtype=np.float64)].eval_with_dict(params)
-            i32ops = ops_by_dtype[Op(dtype=np.int32)].eval_with_dict(params)
+            f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params)
+            f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params)
+            i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params)
 
             # (now use these counts to predict performance)
 
@@ -244,7 +251,7 @@ class ToCountMap:
         return result_map
 
     def to_bytes(self):
-        """Convert counts to bytes using data type in map key
+        """Convert counts to bytes using data type in map key.
 
         :return: A :class:`ToCountMap` mapping each original key to a
                  :class:`islpy.PwQPolynomial` with counts in bytes rather than
@@ -257,18 +264,18 @@ class ToCountMap:
             bytes_map = get_mem_access_map(knl).to_bytes()
             params = {'n': 512, 'm': 256, 'l': 128}
 
-            s1_global_ld_byt = bytes_map.filter_by(
-                                    mtype=['global'], stride=[1],
-                                    direction=['load']).eval_and_sum(params)
-            s2_global_ld_byt = bytes_map.filter_by(
-                                    mtype=['global'], stride=[2],
-                                    direction=['load']).eval_and_sum(params)
-            s1_global_st_byt = bytes_map.filter_by(
-                                    mtype=['global'], stride=[1],
-                                    direction=['store']).eval_and_sum(params)
-            s2_global_st_byt = bytes_map.filter_by(
-                                    mtype=['global'], stride=[2],
-                                    direction=['store']).eval_and_sum(params)
+            s1_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['load']).eval_and_sum(params)
+            s2_g_ld_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['load']).eval_and_sum(params)
+            s1_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[1],
+                                direction=['store']).eval_and_sum(params)
+            s2_g_st_byt = bytes_map.filter_by(
+                                mtype=['global'], stride=[2],
+                                direction=['store']).eval_and_sum(params)
 
             # (now use these counts to predict performance)
 
@@ -284,9 +291,9 @@ class ToCountMap:
 
 
     def sum(self):
-        """Add all counts in ToCountMap
+        """Add all counts in ToCountMap.
 
-        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts
+        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
 
         """
         total = isl.PwQPolynomial('{ 0 }')
@@ -300,10 +307,11 @@ class ToCountMap:
 
 
     def eval_and_sum(self, params):
-        """Add all counts in ToCountMap and evaluate with provided parameters
+        """Add all counts in :class:`ToCountMap` and evaluate with provided
+        parameter dict.
 
-        :return: An :class:`integer` containing the sum of all counts in the
-                 :class:`ToCountMap` evaluated with the parameters provided
+        :return: An :class:`int` containing the sum of all counts in the
+                 :class:`ToCountMap` evaluated with the parameters provided.
 
         Example usage::
 
@@ -331,7 +339,7 @@ def stringify_stats_mapping(m):
 
 
 class Op:
-    """An arithmetic operation
+    """An arithmetic operation.
 
     .. attribute:: dtype
 
@@ -340,7 +348,7 @@ class Op:
 
     .. attribute:: name
 
-       A :class:`string` that specifies the kind of arithmetic operation as
+       A :class:`str` that specifies the kind of arithmetic operation as
        *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
 
     """
@@ -371,11 +379,11 @@ class Op:
 
 
 class MemAccess:
-    """A memory access
+    """A memory access.
 
     .. attribute:: mtype
 
-       A :class:`string` that specifies the memory type accessed as **global**
+       A :class:`str` that specifies the memory type accessed as **global**
        or **local**
 
     .. attribute:: dtype
@@ -385,17 +393,17 @@ class MemAccess:
 
     .. attribute:: stride
 
-       A :class:`int` specifies stride of the memory access. A stride of 0
+       An :class:`int` that specifies stride of the memory access. A stride of 0
        indicates a uniform access (i.e. all threads access the same item).
 
     .. attribute:: direction
 
-       A :class:`string` that specifies the direction of memory access as
+       A :class:`str` that specifies the direction of memory access as
        **load** or **store**.
 
     .. attribute:: variable
 
-       A :class:`string` that specifies the variable name of the data
+       A :class:`str` that specifies the variable name of the data
        accessed.
 
     """
@@ -1059,6 +1067,9 @@ def count(kernel, set):
 def get_op_poly(knl, numpy_types=True):
 
     """Count the number of operations in a loopy kernel.
+
+    get_op_poly is deprecated. Use get_op_map instead.
+
     """
     from warnings import warn
     warn("get_op_poly is deprecated. Use get_op_map instead.",
@@ -1074,27 +1085,27 @@ def get_op_map(knl, numpy_types=True):
 
     :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.
 
-    :parameter numpy_types: A :class:`boolean` specifying whether the types
+    :parameter numpy_types: A :class:`bool` specifying whether the types
                             in the returned mapping should be numpy types
-                            instead of :class:'loopy.LoopyType`.
+                            instead of :class:`loopy.LoopyType`.
 
-    :return: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**.
+    :return: A mapping of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`loopy.Op` specifies an arithmetic operation with
-               specific characteristics.
+             - The :class:`Op` specifies the characteristics of the arithmetic
+               operation.
 
              - The :class:`islpy.PwQPolynomial` holds the number of operations of
                the kind specified in the key (in terms of the
-               :class:`loopy.LoopKernel` *parameter inames*).
+               :class:`loopy.LoopKernel` parameter *inames*).
 
     Example usage::
 
         # (first create loopy kernel and specify array data types)
 
-        map = get_op_map(knl)
+        op_map = get_op_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        f32add = map[Op(np.dtype(np.float32), 'add')].eval_with_dict(params)
-        f32mul = map[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
+        f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params)
+        f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -1124,9 +1135,13 @@ def get_op_map(knl, numpy_types=True):
     return op_map
 
 
-#TODO test depricated functions?
+#TODO test deprecated functions?
 def get_lmem_access_poly(knl):
     """Count the number of local memory accesses in a loopy kernel.
+
+    get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['local'] option.
+
     """
     from warnings import warn
     warn("get_lmem_access_poly is deprecated. Use get_mem_access_map and "
@@ -1137,6 +1152,10 @@ def get_lmem_access_poly(knl):
 
 def get_DRAM_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
+
+    get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
     """
     from warnings import warn
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
@@ -1149,6 +1168,10 @@ def get_DRAM_access_poly(knl):
 
 def get_gmem_access_poly(knl):
     """Count the number of global memory accesses in a loopy kernel.
+
+    get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the
+    result with the mtype=['global'] option.
+
     """
     from warnings import warn
     warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
@@ -1162,18 +1185,18 @@ def get_gmem_access_poly(knl):
 def get_mem_access_map(knl, numpy_types=True):
     """Count the number of memory accesses in a loopy kernel.
 
-    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
+    :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
                     counted.
 
-    :parameter numpy_types: A :class:`boolean` specifying whether the types
+    :parameter numpy_types: A :class:`bool` specifying whether the types
                             in the returned mapping should be numpy types
-                            instead of :class:'loopy.LoopyType`.
+                            instead of :class:`loopy.LoopyType`.
 
-    :return: A mapping of **{** :class:`loopy.MemAccess` **:**
+    :return: A mapping of **{** :class:`MemAccess` **:**
              :class:`islpy.PwQPolynomial` **}**.
 
-             - The :class:`loopy.MemAccess` specifies the type of memory
-               access.
+             - The :class:`MemAccess` specifies the characteristics of the
+               memory access.
 
              - The :class:`islpy.PwQPolynomial` holds the number of memory
                accesses with the characteristics specified in the key (in terms
@@ -1184,32 +1207,32 @@ def get_mem_access_map(knl, numpy_types=True):
         # (first create loopy kernel and specify array data types)
 
         params = {'n': 512, 'm': 256, 'l': 128}
-        mem_access_map = get_mem_access_map(knl)
-
-        f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global',
-                                                         dtype=np.float32,
-                                                         stride=1,
-                                                         direction='load',
-                                                         variable='a')
-                                              ].eval_with_dict(params)
-        f32_stride1_g_stores_a = mem_access_map[MemAccess(mtype='global',
-                                                          dtype=np.float32,
-                                                          stride=1,
-                                                          direction='store',
-                                                          variable='a')
-                                               ].eval_with_dict(params)
-        f32_stride1_l_loads_x = mem_access_map[MemAccess(mtype='local',
-                                                         dtype=np.float32,
-                                                         stride=1,
-                                                         direction='load',
-                                                         variable='x')
-                                              ].eval_with_dict(params)
-        f32_stride1_l_stores_x = mem_access_map[MemAccess(mtype='local',
-                                                          dtype=np.float32,
-                                                          stride=1,
-                                                          direction='store',
-                                                          variable='x')
-                                               ].eval_with_dict(params)
+        mem_map = get_mem_access_map(knl)
+
+        f32_s1_g_ld_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_g_st_a = mem_map[MemAccess(mtype='global',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='a')
+                               ].eval_with_dict(params)
+        f32_s1_l_ld_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='load',
+                                          variable='x')
+                               ].eval_with_dict(params)
+        f32_s1_l_st_x = mem_map[MemAccess(mtype='local',
+                                          dtype=np.float32,
+                                          stride=1,
+                                          direction='store',
+                                          variable='x')
+                               ].eval_with_dict(params)
 
         # (now use these counts to predict performance)
 
@@ -1299,6 +1322,9 @@ def get_mem_access_map(knl, numpy_types=True):
 def get_synchronization_poly(knl):
     """Count the number of synchronization events each thread encounters in a
     loopy kernel.
+
+    get_synchronization_poly is deprecated. Use get_synchronization_map instead.
+
     """
     from warnings import warn
     warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.",
@@ -1316,8 +1342,8 @@ def get_synchronization_map(knl):
     :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.
 
     :return: A dictionary mapping each type of synchronization event to a
-            :class:`islpy.PwQPolynomial` holding the number of such events
-            per thread.
+            :class:`islpy.PwQPolynomial` holding the number of events per
+            thread.
 
             Possible keys include ``barrier_local``, ``barrier_global``
             (if supported by the target) and ``kernel_launch``.
@@ -1328,7 +1354,7 @@ def get_synchronization_map(knl):
 
         sync_map = get_synchronization_map(knl)
         params = {'n': 512, 'm': 256, 'l': 128}
-        barrier_count = sync_map['barrier_local'].eval_with_dict(params)
+        barrier_ct = sync_map['barrier_local'].eval_with_dict(params)
 
         # (now use this count to predict performance)
 
@@ -1380,7 +1406,7 @@ def get_synchronization_map(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    #return result.dict #TODO is this okay?
+    #return result.dict #TODO is this change okay?
     return result
 
 
@@ -1392,7 +1418,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
     of each the array *var_name* are read/written (where
     *direction* is either ``read`` or ``write``.
 
-    :arg ignore_uncountable: If *True*, an error will be raised for
+    :arg ignore_uncountable: If *False*, an error will be raised for
         accesses on which the footprint cannot be determined (e.g.
         data-dependent or nonlinear indices)
     """
-- 
GitLab


From e137bf70bef6344b22d8040c434a03a7c35e441a Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 2 Nov 2016 15:50:50 -0500
Subject: [PATCH 31/55] added __str__ functions to Op and MemAccess

---
 loopy/statistics.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index ee0d867e6..ec10722e1 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -377,6 +377,9 @@ class Op:
             name = 'None'
         return hash(str(dtype)+name)
 
+    def __str__(self):
+        return "Op("+str(self.dtype)+", "+self.name+")"
+
 
 class MemAccess:
     """A memory access.
@@ -460,6 +463,10 @@ class MemAccess:
             variable = 'None'
         return hash(mtype+str(dtype)+str(stride)+direction+variable)
 
+    def __str__(self):
+        return "MemAccess("+self.mtype+", "+str(self.dtype)+", "+ \
+               str(self.stride)+", "+self.direction+", "+self.variable+")"
+
 
 # {{{ ExpressionOpCounter
 
@@ -1089,7 +1096,8 @@ def get_op_map(knl, numpy_types=True):
                             in the returned mapping should be numpy types
                             instead of :class:`loopy.LoopyType`.
 
-    :return: A mapping of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**.
+    :return: A :class:`ToCountMap` of **{** :class:`Op` **:**
+             :class:`islpy.PwQPolynomial` **}**.
 
              - The :class:`Op` specifies the characteristics of the arithmetic
                operation.
@@ -1192,7 +1200,7 @@ def get_mem_access_map(knl, numpy_types=True):
                             in the returned mapping should be numpy types
                             instead of :class:`loopy.LoopyType`.
 
-    :return: A mapping of **{** :class:`MemAccess` **:**
+    :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
              :class:`islpy.PwQPolynomial` **}**.
 
              - The :class:`MemAccess` specifies the characteristics of the
-- 
GitLab


From 3124e4b3994c7aed6ac77b5e34fa9dd683fde981 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 2 Nov 2016 20:23:33 -0500
Subject: [PATCH 32/55] updated tutorial so that doctests pass, still need to
 update with recently added ToCountMap member functions

---
 doc/tutorial.rst    | 302 +++++++++++++++++++++-----------------------
 loopy/statistics.py |  12 +-
 2 files changed, 151 insertions(+), 163 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 87daa9fc4..c633e55de 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -176,7 +176,7 @@ by passing :attr:`loopy.Options.write_cl`.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -250,7 +250,7 @@ call :func:`loopy.generate_code`:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -365,7 +365,7 @@ Let us take a look at the generated code for the above kernel:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -414,7 +414,7 @@ Now the intended code is generated and our test passes.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -557,12 +557,14 @@ relation to loop nesting. For example, it's perfectly possible to request
     >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
-    ...
+    #define gid(N) ((int) get_group_id(N))
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
+    {
       for (int i_inner = 0; i_inner <= 15; ++i_inner)
-        if (-1 + -1 * i_inner + n >= 0)
-          for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
-            a[16 * i_outer + i_inner] = 0.0f;
-    ...
+        for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+          a[16 * i_outer + i_inner] = 0.0f;
+    }
 
 Notice how loopy has automatically generated guard conditionals to make
 sure the bounds on the old iname are obeyed.
@@ -701,8 +703,9 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
-    ...
-    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
+    #define gid(N) ((int) get_group_id(N))
+    <BLANKLINE>
+    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
         a[128 * gid(0) + lid(0)] = 0.0f;
@@ -1182,7 +1185,7 @@ When we ask to see the code, the issue becomes apparent:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out)
     {
       float a_fetch[16];
     <BLANKLINE>
@@ -1207,26 +1210,30 @@ Obtaining Performance Statistics
 
 .. {{{
 
-Operations, array access, and barriers can all be counted, which may facilitate
-performance prediction and optimization of a :mod:`loopy` kernel.
+Arithmetic operations, array accesses, and synchronization operations can all
+be counted, which may facilitate performance prediction and optimization of a
+:mod:`loopy` kernel.
 
 .. note::
 
     The functions used in the following examples may produce warnings. If you have
     already made the filterwarnings and catch_warnings calls used in the examples
-    above, you may need to reset these before continuing:
+    above, you may want to reset these before continuing. We will temporarily
+    suppress warnings to keep the output clean:
 
     .. doctest::
 
-        >>> from warnings import resetwarnings
+        >>> from warnings import resetwarnings, filterwarnings
         >>> resetwarnings()
+        >>> filterwarnings('ignore', category=Warning)
 
 Counting operations
 ~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_op_poly` provides information on the number and type of operations
-being performed in a kernel. To demonstrate this, we'll create an example kernel
-that performs several operations on arrays containing different types of data:
+:func:`loopy.get_op_map` provides information on the number and type of
+arithmetic operations being performed in a kernel. To demonstrate this, we'll
+create an example kernel that performs several operations on arrays containing
+different types of data:
 
 .. doctest::
 
@@ -1244,37 +1251,36 @@ information provided. Now we will count the operations:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_op_poly
-    >>> op_map = get_op_poly(knl)
+    >>> op_map = lp.get_op_map(knl)
 
-:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** 
-:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The 
-:class:`islpy.PwQPolynomial` holds the number of operations for the type specified 
-in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this 
-map now:
+:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. The
+:class:`islpy.PwQPolynomial` holds the number of operations for the kind of
+operation specified in the key(in terms of the :class:`loopy.LoopKernel`
+*inames*). We'll print this map now:
 
 .. doctest::
 
     >>> print(lp.stringify_stats_mapping(op_map))
-    (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
 
-We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
+One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
-    >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict)
-    >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict)
-    >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict)
-    >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict)
-    >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict)
-    >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict)
+    >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict)
     >>> print("%i\n%i\n%i\n%i\n%i\n%i" % 
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
@@ -1284,174 +1290,156 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
     65536
     65536
 
-Counting array accesses
-~~~~~~~~~~~~~~~~~~~~~~~
+Counting memory accesses
+~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_gmem_access_poly` provides information on the number and type of
-array loads and stores being performed in a kernel. To demonstrate this, we'll
-continue using the kernel from the previous example:
+:func:`loopy.get_mem_access_map` provides information on the number and
+characteristics of memory accesses performed in a kernel. To demonstrate this,
+we'll continue using the kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_gmem_access_poly
-    >>> load_store_map = get_gmem_access_poly(knl)
-    >>> print(lp.stringify_stats_mapping(load_store_map))
-    (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    >>> mem_map = lp.get_mem_access_map(knl)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
 
-:func:`loopy.get_gmem_access_poly` returns a mapping of **{(**
-:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)**
-**:** :class:`islpy.PwQPolynomial` **}**.
+:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
+:class:`loopy.MemAccess` attributes include:
 
-- The :class:`numpy.dtype` specifies the type of the data being accessed.
+- mtype: A :class:`str` that specifies the memory type accessed as **global**
+  or **local**
 
-- The first string in the map key specifies the DRAM access type as *consecutive*,
-  *nonconsecutive*, or *uniform*. *Consecutive* memory accesses occur when
-  consecutive threads access consecutive array elements in memory, *nonconsecutive*
-  accesses occur when consecutive threads access nonconsecutive array elements in
-  memory, and *uniform* accesses occur when consecutive threads access the *same*
-  element in memory.
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type accessed.
 
-- The second string in the map key specifies the DRAM access type as a *load*, or a
-  *store*.
+- stride: An :class:`int` that specifies stride of the memory access. A stride
+  of 0 indicates a uniform access (i.e. all threads access the same item).
 
-- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the
-  characteristics specified in the key (in terms of the :class:`loopy.LoopKernel`
-  *inames*).
+- direction: A :class:`str` that specifies the direction of memory access as
+  **load** or **store**.
+
+- variable: A :class:`str` that specifies the variable name of the data
+  accessed.
 
 We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load")
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store")
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load")
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store")
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
     ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
 
 ~~~~~~~~~~~
 
-Since we have not tagged any of the inames or parallelized the kernel across threads
-(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers
-the array accesses *uniform*. Now we'll parallelize the kernel and count the array
-accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated
-this time, so we'll print the mapping manually to make it more legible:
+Since we have not tagged any of the inames or parallelized the kernel across
+threads (which would have produced iname tags), :func:`loopy.get_mem_access_map`
+considers the memory accesses *uniform*, so the *stride* of each access is 0.
+Now we'll parallelize the kernel and count the array accesses again. The
+resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
 .. doctest::
 
     >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
-    >>> load_store_map = get_gmem_access_poly(knl_consec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'load') :
-    [n, m, l] -> { ... }
+    >>> mem_map = lp.get_mem_access_map(knl_consec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
     <BLANKLINE>
-    (dtype('float64'), 'consecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-
 
 With this parallelization, consecutive threads will access consecutive array
 elements in memory. The polynomials are a bit more complicated now due to the
-parallelization, but when we evaluate them, we see that the total number of array
-accesses has not changed:
+parallelization, but when we evaluate them, we see that the total number of
+array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load")
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store")
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load")
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store")
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
     ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
 
 ~~~~~~~~~~~
 
-To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in
-our parallelization of the kernel:
+To produce *nonconsecutive* array accesses with stride greater than 1, we'll
+switch the inner and outer tags in our parallelization of the kernel:
 
 .. doctest::
 
     >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
-    >>> load_store_map = get_gmem_access_poly(knl_nonconsec)
-    >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)):
-    ...     print("%s :\n%s\n" % (key, load_store_map[key]))
-    (dtype('float32'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float32'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
+    >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
+    >>> print(lp.stringify_stats_mapping(mem_map))
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
     <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'load') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-    (dtype('float64'), 'nonconsecutive', 'store') :
-    [n, m, l] -> { ... }
-    <BLANKLINE>
-
 
-With this parallelization, consecutive threads will access *nonconsecutive* array
-elements in memory. The total number of array accesses has not changed:
+With this parallelization, consecutive threads will access *nonconsecutive*
+array elements in memory. The total number of array accesses still has not
+changed:
 
 .. doctest::
 
-    >>> f64ld = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "load")
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
     ...     ].eval_with_dict(param_dict)
-    >>> f64st = load_store_map[
-    ...     (np.dtype(np.float64), "nonconsecutive", "store")
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
     ...     ].eval_with_dict(param_dict)
-    >>> f32ld = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "load")
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
     ...     ].eval_with_dict(param_dict)
-    >>> f32st = load_store_map[
-    ...     (np.dtype(np.float32), "nonconsecutive", "store")
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
     ...     ].eval_with_dict(param_dict)
-    >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" %
-    ...     (f32ld, f32st, f64ld, f64st))
-    f32 load: 1572864
-    f32 store: 524288
-    f64 load: 131072
-    f64 store: 65536
+    >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
+    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    f32 ld a: 1048576
+    f32 st c: 524288
+    f64 ld g: 65536
+    f64 st e: 65536
 
 Counting synchronization events
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_synchronization_poly` counts the number of synchronization
+:func:`loopy.get_synchronization_map` counts the number of synchronization
 events per **thread** in a kernel. First, we'll call this function on the
 kernel from the previous example:
 
 .. doctest::
 
-    >>> from loopy.statistics import get_synchronization_poly
-    >>> barrier_poly = get_synchronization_poly(knl)
-    >>> print(lp.stringify_stats_mapping(barrier_poly))
+    >>> sync_map = lp.get_synchronization_map(knl)
+    >>> print(lp.stringify_stats_mapping(sync_map))
     kernel_launch : { 1 }
     <BLANKLINE>
 
@@ -1459,7 +1447,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict)
+    >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict)
     >>> print("Kernel launch count: %s" % launch_count)
     Kernel launch count: 1
 
@@ -1485,7 +1473,7 @@ Now to make things more interesting, we'll create a kernel with barriers:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e)
+    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e)
     {
       __local int c[50 * 10 * 99];
     <BLANKLINE>
@@ -1499,24 +1487,24 @@ Now to make things more interesting, we'll create a kernel with barriers:
         }
     }
 
-
-In this kernel, when a thread performs the second instruction it uses data produced
-by *different* threads during the first instruction. Because of this, barriers are
-required for correct execution, so loopy inserts them. Now we'll count the barriers
-using :func:`loopy.get_barrier_poly`:
+In this kernel, when a thread performs the second instruction it uses data
+produced by *different* threads during the first instruction. Because of this,
+barriers are required for correct execution, so loopy inserts them. Now we'll
+count the barriers using :func:`loopy.get_synchronization_map`:
 
 .. doctest::
 
-    >>> sync_map = lp.get_synchronization_poly(knl)
+    >>> sync_map = lp.get_synchronization_map(knl)
     >>> print(lp.stringify_stats_mapping(sync_map))
     barrier_local : { 1000 }
     kernel_launch : { 1 }
     <BLANKLINE>
 
-Based on the kernel code printed above, we would expect each thread to encounter
-50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In
-this case, the number of barriers does not depend on any inames, so we can pass an
-empty dictionary to :func:`islpy.eval_with_dict`.
+Based on the kernel code printed above, we would expect each thread to
+encounter 50x10x2 barriers, which matches the result from
+:func:`loopy.get_synchronization_map`. In this case, the number of barriers
+does not depend on any inames, so we can pass an empty dictionary to
+:func:`islpy.eval_with_dict`.
 
 .. }}}
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index ec10722e1..468a274d7 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1043,22 +1043,22 @@ def count(kernel, set):
 
         if not (is_subset and is_superset):
             if is_subset:
-                from loopy.diagnostic import warn
-                warn(kernel, "count_overestimate",
+                from loopy.diagnostic import warn_with_kernel
+                warn_with_kernel(kernel, "count_overestimate",
                         "Barvinok wrappers are not installed. "
                         "Counting routines have overestimated the "
                         "number of integer points in your loop "
                         "domain.")
             elif is_superset:
-                from loopy.diagnostic import warn
-                warn(kernel, "count_underestimate",
+                from loopy.diagnostic import warn_with_kernel
+                warn_with_kernel(kernel, "count_underestimate",
                         "Barvinok wrappers are not installed. "
                         "Counting routines have underestimated the "
                         "number of integer points in your loop "
                         "domain.")
             else:
-                from loopy.diagnostic import warn
-                warn(kernel, "count_misestimate",
+                from loopy.diagnostic import warn_with_kernel
+                warn_with_kernel(kernel, "count_misestimate",
                         "Barvinok wrappers are not installed. "
                         "Counting routines have misestimated the "
                         "number of integer points in your loop "
-- 
GitLab


From 3d202b489764545795e60dade112140f579947c5 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 00:20:48 -0500
Subject: [PATCH 33/55] fixed ToCountMap.__str__ to handle None values

---
 loopy/statistics.py | 53 +++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 468a274d7..4d3f8831a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -369,16 +369,18 @@ class Op:
                  self.name == other.name))
 
     def __hash__(self):
-        dtype = self.dtype
-        name = self.name
-        if dtype is None:
-            dtype = 'None'
-        if name is None:
-            name = 'None'
-        return hash(str(dtype)+name)
+        return hash(str(self))
 
     def __str__(self):
-        return "Op("+str(self.dtype)+", "+self.name+")"
+        if self.dtype is None:
+            dtype = 'None'
+        else:
+            dtype = str(self.dtype)
+        if self.name is None:
+            name = 'None'
+        else:
+            name = self.name
+        return "Op("+dtype+", "+name+")"
 
 
 class MemAccess:
@@ -446,26 +448,31 @@ class MemAccess:
                  self.variable == other.variable))
 
     def __hash__(self):
-        mtype = self.mtype
-        dtype = self.dtype
-        stride = self.stride
-        direction = self.direction
-        variable = self.variable
-        if mtype is None:
+        return hash(str(self))
+
+    def __str__(self):
+        if self.mtype is None:
             mtype = 'None'
-        if dtype is None:
+        else:
+            mtype = self.mtype
+        if self.dtype is None:
             dtype = 'None'
-        if stride is None:
+        else:
+            dtype = str(self.dtype)
+        if self.stride is None:
             stride = 'None'
-        if direction is None:
+        else:
+            stride = str(self.stride)
+        if self.direction is None:
             direction = 'None'
-        if variable is None:
+        else:
+            direction = self.direction
+        if self.variable is None:
             variable = 'None'
-        return hash(mtype+str(dtype)+str(stride)+direction+variable)
-
-    def __str__(self):
-        return "MemAccess("+self.mtype+", "+str(self.dtype)+", "+ \
-               str(self.stride)+", "+self.direction+", "+self.variable+")"
+        else:
+            variable = self.variable
+        return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \
+               +variable+")"
 
 
 # {{{ ExpressionOpCounter
-- 
GitLab


From ef374b52ca2a1f9a8025992e3040362992c48cc6 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 00:21:52 -0500
Subject: [PATCH 34/55] added info and examples about the new filter, group,
 to_bytes, and summation functions for ToCountMap

---
 doc/tutorial.rst | 133 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 103 insertions(+), 30 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 03cceb522..eb80fa448 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1231,10 +1231,10 @@ be counted, which may facilitate performance prediction and optimization of a
 Counting operations
 ~~~~~~~~~~~~~~~~~~~
 
-:func:`loopy.get_op_map` provides information on the number and type of
-arithmetic operations being performed in a kernel. To demonstrate this, we'll
-create an example kernel that performs several operations on arrays containing
-different types of data:
+:func:`loopy.get_op_map` provides information on the characteristics and
+quantity of arithmetic operations being performed in a kernel. To demonstrate
+this, we'll create an example kernel that performs several operations on arrays
+containing different types of data:
 
 .. doctest::
 
@@ -1253,15 +1253,6 @@ information provided. Now we will count the operations:
 .. doctest::
 
     >>> op_map = lp.get_op_map(knl)
-
-:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
-:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. The
-:class:`islpy.PwQPolynomial` holds the number of operations for the kind of
-operation specified in the key(in terms of the :class:`loopy.LoopKernel`
-*inames*). We'll print this map now:
-
-.. doctest::
-
     >>> print(lp.stringify_stats_mapping(op_map))
     Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
     Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
@@ -1271,6 +1262,20 @@ operation specified in the key(in terms of the :class:`loopy.LoopKernel`
     Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
     <BLANKLINE>
 
+:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
+:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
+:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an
+arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the
+number of operations matching the characteristics of the :class:`loopy.Op`
+specified in the key (in terms of the :class:`loopy.LoopKernel`
+*inames*). :class:`loopy.Op` attributes include:
+
+- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the
+  data type operated on.
+
+- name: A :class:`str` that specifies the kind of arithmetic operation as
+  *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.
+
 One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
 .. doctest::
@@ -1291,6 +1296,39 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
     65536
     65536
 
+:class:`loopy.ToCountMap` provides member functions that facilitate filtering,
+grouping, and evaluating subsets of the counts. Suppose we want to know the
+total number of 32-bit floating point operations of any kind. We can easily
+count these using functions :func:`loopy.ToCountMap.filter_by` and
+:func:`loopy.ToCountMap.eval_and_sum`:
+
+.. doctest::
+
+    >>> filtered_op_map = op_map.filter_by(dtype=[np.float32])
+    >>> f32op_count = filtered_op_map.eval_and_sum(param_dict)
+    >>> print(f32op_count)
+    1572864
+
+We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`,
+which produces a :class:`loopy.ToCountMap` that contains the same counts grouped
+together into keys containing only the specified fields:
+
+.. doctest::
+
+    >>> op_map_dtype = op_map.group_by('dtype')
+    >>> print(lp.stringify_stats_mapping(op_map_dtype))
+    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
+    ...                           ].eval_with_dict(param_dict)
+    >>> print(f32op_count)
+    1572864
+
+See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for
+more information on these functions.
+
 Counting memory accesses
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1334,20 +1372,53 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 .. doctest::
 
     >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
-    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
     f32 ld a: 1048576
     f32 st c: 524288
     f64 ld g: 65536
     f64 st e: 65536
 
+:class:`loopy.ToCountMap` also makes it easy to determine the total amount
+of data moved in bytes. Suppose we want to know the total amount of global
+memory data loaded and stored. We can produce a map with just this information
+using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
+
+.. doctest::
+
+    >>> bytes_map = mem_map.to_bytes()
+    >>> print(lp.stringify_stats_mapping(bytes_map))
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
+    ...                                         ).group_by('direction')
+    >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
+    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    <BLANKLINE>
+    >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
+    ...                            ].eval_with_dict(param_dict)
+    >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
+    ...                            ].eval_with_dict(param_dict)
+    >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
+    bytes loaded: 7340032
+    bytes stored: 2621440
+
+One can see how these functions might be useful in computing, for example,
+achieved memory bandwidth in byte/sec or performance in FLOP/sec.
+
 ~~~~~~~~~~~
 
 Since we have not tagged any of the inames or parallelized the kernel across
@@ -1358,7 +1429,8 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
 .. doctest::
 
-    >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0")
+    >>> knl_consec = lp.split_iname(knl, "k", 128,
+    ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
     MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
@@ -1377,15 +1449,15 @@ array accesses has not changed:
 .. doctest::
 
     >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
-    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
     f32 ld a: 1048576
     f32 st c: 524288
     f64 ld g: 65536
@@ -1398,7 +1470,8 @@ switch the inner and outer tags in our parallelization of the kernel:
 
 .. doctest::
 
-    >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1")
+    >>> knl_nonconsec = lp.split_iname(knl, "k", 128,
+    ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
     MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
@@ -1416,15 +1489,15 @@ changed:
 .. doctest::
 
     >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c')
-    ...     ].eval_with_dict(param_dict)
+    ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
-    ...     (f32ld_a, f32st_c, f64ld_g, f64st_e))
+    ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
     f32 ld a: 1048576
     f32 st c: 524288
     f64 ld g: 65536
-- 
GitLab


From fc54488bc742c3f5407d214d4436a705b4339978 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 12:27:51 -0500
Subject: [PATCH 35/55] fixed a broken test, renamed some tests

---
 test/test_numa_diff.py  |  2 +-
 test/test_statistics.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index e4f303f78..33ac31f8d 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -228,7 +228,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
         print(lp.stringify_stats_mapping(op_poly))
 
         print("MEM")
-        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
+        gmem_poly = lp.get_mem_access_poly(hsv).to_bytes()
         print(lp.stringify_stats_mapping(gmem_poly))
 
     hsv = lp.set_options(hsv, cl_build_options=[
diff --git a/test/test_statistics.py b/test/test_statistics.py
index f768ef9bd..cdeef2b0e 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -214,7 +214,7 @@ def test_op_counter_triangular_domain():
         assert flops == 78
 
 
-def test_gmem_access_counter_basic():
+def test_mem_access_counter_basic():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -258,7 +258,7 @@ def test_gmem_access_counter_basic():
     assert f64s == n*m
 
 
-def test_gmem_access_counter_reduction():
+def test_mem_access_counter_reduction():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -294,7 +294,7 @@ def test_gmem_access_counter_reduction():
     assert st_bytes == 4*f32s
 
 
-def test_gmem_access_counter_logic():
+def test_mem_access_counter_logic():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -328,7 +328,7 @@ def test_gmem_access_counter_logic():
     assert f64_g_s == n*m
 
 
-def test_gmem_access_counter_specialops():
+def test_mem_access_counter_specialops():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -376,7 +376,7 @@ def test_gmem_access_counter_specialops():
     tot = filtered_map.eval_and_sum(params)
     assert tot == n*m*l + n*m
 
-def test_gmem_access_counter_bitwise():
+def test_mem_access_counter_bitwise():
 
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -421,7 +421,7 @@ def test_gmem_access_counter_bitwise():
     assert i32 == n*m+n*m*l
 
 
-def test_gmem_access_counter_mixed():
+def test_mem_access_counter_mixed():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -476,7 +476,7 @@ def test_gmem_access_counter_mixed():
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_nonconsec():
+def test_mem_access_counter_nonconsec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
@@ -528,7 +528,7 @@ def test_gmem_access_counter_nonconsec():
     assert f32nonconsec == n*m*l
 
 
-def test_gmem_access_counter_consec():
+def test_mem_access_counter_consec():
 
     knl = lp.make_kernel(
             "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
-- 
GitLab


From 31f49d6136ff8bf358b0b4ea47e10e5f77ed6e24 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 12:47:43 -0500
Subject: [PATCH 36/55] actually fixing tests this time

---
 test/test_numa_diff.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 33ac31f8d..dfdd7f63e 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
 
     if 1:
         print("OPS")
-        op_poly = lp.get_op_poly(hsv)
-        print(lp.stringify_stats_mapping(op_poly))
+        op_map = lp.get_op_map(hsv)
+        print(lp.stringify_stats_mapping(op_map))
 
         print("MEM")
-        gmem_poly = lp.get_mem_access_poly(hsv).to_bytes()
-        print(lp.stringify_stats_mapping(gmem_poly))
+        gmem_map = lp.get_mem_access_map(hsv).to_bytes()
+        print(lp.stringify_stats_mapping(gmem_map))
 
     hsv = lp.set_options(hsv, cl_build_options=[
          "-cl-denorms-are-zero",
-- 
GitLab


From 012954b0929d22823a397805afaaf075fd64bf62 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 13:20:24 -0500
Subject: [PATCH 37/55] making python 2 compatible

---
 loopy/statistics.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 4d3f8831a..af9fffc51 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -228,7 +228,8 @@ class ToCountMap:
 
         # make sure all item keys have same type
         if self.dict:
-            key_type = type(list(self.keys())[0])
+            first_key = list(self.keys())[0]
+            key_type = type(first_key)
             if not all(isinstance(x, key_type) for x in self.keys()):
                 raise ValueError("ToCountMap: group_by() function may only "
                                  "be used on ToCountMaps with uniform keys")
@@ -237,7 +238,7 @@ class ToCountMap:
 
         # for each item in self.dict
         for self_key, self_val in self.items():
-            new_key = key_type()
+            new_key = first_key.__class__()
 
             # set all specified fields
             for field in args:
-- 
GitLab


From 4e5be04bfc152fd07a63c68578d3fe341d504aca Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 14:28:12 -0500
Subject: [PATCH 38/55] restrict->__restrict__

---
 doc/tutorial.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index eb80fa448..ee737ea83 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -176,7 +176,7 @@ by passing :attr:`loopy.Options.write_cl`.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -250,7 +250,7 @@ call :func:`loopy.generate_code`:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -365,7 +365,7 @@ Let us take a look at the generated code for the above kernel:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -414,7 +414,7 @@ Now the intended code is generated and our test passes.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -559,7 +559,7 @@ relation to loop nesting. For example, it's perfectly possible to request
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
     {
       for (int i_inner = 0; i_inner <= 15; ++i_inner)
         for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
@@ -705,7 +705,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n)
+    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
         a[128 * gid(0) + lid(0)] = 0.0f;
@@ -1186,7 +1186,7 @@ When we ask to see the code, the issue becomes apparent:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out)
+    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
     {
       float a_fetch[16];
     <BLANKLINE>
@@ -1547,7 +1547,7 @@ Now to make things more interesting, we'll create a kernel with barriers:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e)
+    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e)
     {
       __local int c[50 * 10 * 99];
     <BLANKLINE>
-- 
GitLab


From 652688b26637240a2cce0fc8d08539137e85673c Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 22:38:42 -0500
Subject: [PATCH 39/55] make classes inherit from object

---
 loopy/statistics.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index af9fffc51..699a86044 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -64,7 +64,7 @@ __doc__ = """
 
 # {{{ ToCountMap
 
-class ToCountMap:
+class ToCountMap(object):
     """Maps any type of key to an arithmetic type.
 
     .. automethod:: filter_by
@@ -339,7 +339,7 @@ def stringify_stats_mapping(m):
     return result
 
 
-class Op:
+class Op(object):
     """An arithmetic operation.
 
     .. attribute:: dtype
@@ -384,7 +384,7 @@ class Op:
         return "Op("+dtype+", "+name+")"
 
 
-class MemAccess:
+class MemAccess(object):
     """A memory access.
 
     .. attribute:: mtype
-- 
GitLab


From ad255d0862a7ab67e2d74e95c2bae56cd747e682 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 22:41:15 -0500
Subject: [PATCH 40/55] changed back to constructor call of unknown type

---
 loopy/statistics.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 699a86044..fbca99779 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -228,8 +228,7 @@ class ToCountMap(object):
 
         # make sure all item keys have same type
         if self.dict:
-            first_key = list(self.keys())[0]
-            key_type = type(first_key)
+            key_type = type(list(self.keys())[0])
             if not all(isinstance(x, key_type) for x in self.keys()):
                 raise ValueError("ToCountMap: group_by() function may only "
                                  "be used on ToCountMaps with uniform keys")
@@ -238,7 +237,7 @@ class ToCountMap(object):
 
         # for each item in self.dict
         for self_key, self_val in self.items():
-            new_key = first_key.__class__()
+            new_key = key_type()
 
             # set all specified fields
             for field in args:
-- 
GitLab


From a17533d073cca343daab893c2dccb5b6c4f3a92d Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 22:54:44 -0500
Subject: [PATCH 41/55] renamed ToCountMap.dict to ToCountMap.count_map

---
 loopy/statistics.py | 87 +++++++++++++++++++++++----------------------
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index fbca99779..9736fa0d2 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -78,12 +78,12 @@ class ToCountMap(object):
     def __init__(self, init_dict=None):
         if init_dict is None:
             init_dict = {}
-        self.dict = init_dict
+        self.count_map = init_dict
 
     def __add__(self, other):
-        result = self.dict.copy()
-        for k, v in six.iteritems(other.dict):
-            result[k] = self.dict.get(k, 0) + v
+        result = self.count_map.copy()
+        for k, v in six.iteritems(other.count_map):
+            result[k] = self.count_map.get(k, 0) + v
         return ToCountMap(result)
 
     def __radd__(self, other):
@@ -97,7 +97,7 @@ class ToCountMap(object):
     def __mul__(self, other):
         if isinstance(other, isl.PwQPolynomial):
             return ToCountMap(dict(
-                (index, self.dict[index]*other)
+                (index, self.count_map[index]*other)
                 for index in self.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
@@ -108,27 +108,30 @@ class ToCountMap(object):
 
     def __getitem__(self, index):
         try:
-            return self.dict[index]
+            return self.count_map[index]
         except KeyError:
             return isl.PwQPolynomial('{ 0 }')
 
     def __setitem__(self, index, value):
-        self.dict[index] = value
+        self.count_map[index] = value
 
     def __repr__(self):
-        return repr(self.dict)
+        return repr(self.count_map)
 
     def __len__(self):
-        return len(self.dict)
+        return len(self.count_map)
 
     def items(self):
-        return self.dict.items()
+        return self.count_map.items()
 
     def keys(self):
-        return self.dict.keys()
+        return self.count_map.keys()
+
+    def pop(self, item):
+        return self.count_map.pop(item)
 
     def copy(self):
-        return ToCountMap(dict(self.dict))
+        return ToCountMap(dict(self.count_map))
 
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
@@ -161,7 +164,7 @@ class ToCountMap(object):
         if 'dtype' in kwargs.keys():
             kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']]
 
-        # for each item in self.dict
+        # for each item in self.count_map
         for self_key, self_val in self.items():
             try:
                 # check to see if key attribute values match all filters
@@ -171,7 +174,7 @@ class ToCountMap(object):
                     if attr_val not in allowable_vals:
                         break
                 else:  # loop terminated without break or error
-                    result_map.dict[self_key] = self_val
+                    result_map[self_key] = self_val
             except(AttributeError):
                 # the field passed is not a field of this key
                 continue
@@ -227,7 +230,7 @@ class ToCountMap(object):
         result_map = ToCountMap()
 
         # make sure all item keys have same type
-        if self.dict:
+        if self.count_map:
             key_type = type(list(self.keys())[0])
             if not all(isinstance(x, key_type) for x in self.keys()):
                 raise ValueError("ToCountMap: group_by() function may only "
@@ -235,7 +238,7 @@ class ToCountMap(object):
         else:
             return result_map
 
-        # for each item in self.dict
+        # for each item in self.count_map
         for self_key, self_val in self.items():
             new_key = key_type()
 
@@ -1143,9 +1146,9 @@ def get_op_map(knl, numpy_types=True):
         op_map = op_map + ops*count(knl, domain)
 
     if numpy_types:
-        op_map.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
-                             count)
-                for op, count in six.iteritems(op_map.dict))
+        op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name),
+                                 count)
+                for op, count in six.iteritems(op_map.count_map))
 
     return op_map
 
@@ -1284,26 +1287,26 @@ def get_mem_access_map(knl, numpy_types=True):
                     + subs_counter_l(insn.expression)
 
         # distinguish loads and stores
-        for key in subs_expr.dict:
-            subs_expr.dict[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                     stride=key.stride, direction='load',
-                                     variable=key.variable)
-                          ] = subs_expr.dict.pop(key)
+        for key in subs_expr.count_map:
+            subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                stride=key.stride, direction='load',
+                                variable=key.variable)
+                     ] = subs_expr.pop(key)
 
         subs_assignee_g = subs_counter_g(insn.assignee)
-        for key in subs_assignee_g.dict:
-            subs_assignee_g.dict[MemAccess(mtype=key.mtype, dtype=key.dtype,
-                                           stride=key.stride,
-                                           direction='store',
-                                           variable=key.variable)
-                                ] = subs_assignee_g.dict.pop(key)
+        for key in subs_assignee_g.count_map:
+            subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype,
+                                      stride=key.stride,
+                                      direction='store',
+                                      variable=key.variable)
+                           ] = subs_assignee_g.pop(key)
         # for now, don't count writes to local mem
 
         insn_inames = knl.insn_inames(insn)
 
         # use count excluding local index tags for uniform accesses
-        for key in subs_expr.dict:
-            map = ToCountMap({key: subs_expr.dict[key]})
+        for key in subs_expr.count_map:
+            map = ToCountMap({key: subs_expr[key]})
             if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0:
                 subs_map = subs_map \
                             + map*get_insn_count(knl, insn_inames, True)
@@ -1311,8 +1314,8 @@ def get_mem_access_map(knl, numpy_types=True):
                 subs_map = subs_map + map*get_insn_count(knl, insn_inames)
                 #currently not counting stride of local mem access
 
-        for key in subs_assignee_g.dict:
-            map = ToCountMap({key: subs_assignee_g.dict[key]})
+        for key in subs_assignee_g.count_map:
+            map = ToCountMap({key: subs_assignee_g[key]})
             if isinstance(key.stride, int) and key.stride == 0:
                 subs_map = subs_map \
                             + map*get_insn_count(knl, insn_inames, True)
@@ -1321,13 +1324,13 @@ def get_mem_access_map(knl, numpy_types=True):
             # for now, don't count writes to local mem
 
     if numpy_types:
-        subs_map.dict = dict((MemAccess(mtype=mem_access.mtype,
-                                         dtype=mem_access.dtype.numpy_dtype,
-                                         stride=mem_access.stride,
-                                         direction=mem_access.direction,
-                                         variable=mem_access.variable)
-                               , count)
-                      for mem_access, count in six.iteritems(subs_map.dict))
+        subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype,
+                                             dtype=mem_access.dtype.numpy_dtype,
+                                             stride=mem_access.stride,
+                                             direction=mem_access.direction,
+                                             variable=mem_access.variable)
+                                   , count)
+                      for mem_access, count in six.iteritems(subs_map.count_map))
 
     return subs_map
 
@@ -1421,7 +1424,7 @@ def get_synchronization_map(knl):
             raise LoopyError("unexpected schedule item: %s"
                     % type(sched_item).__name__)
 
-    #return result.dict #TODO is this change okay?
+    #return result.count_map #TODO is this change okay?
     return result
 
 
-- 
GitLab


From 47f55a384754ba27dbbafb9d7dbadc353efc9b38 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 23:28:53 -0500
Subject: [PATCH 42/55] undoing improper changes to tutorial expected output

---
 doc/tutorial.rst | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index ee737ea83..71547e695 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -557,14 +557,12 @@ relation to loop nesting. For example, it's perfectly possible to request
     >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
-    #define gid(N) ((int) get_group_id(N))
-    <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
-    {
-      for (int i_inner = 0; i_inner <= 15; ++i_inner)
-        for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
-          a[16 * i_outer + i_inner] = 0.0f;
-    }
+    ...
+       for (int i_inner = 0; i_inner <= 15; ++i_inner)
+         if (-1 + -1 * i_inner + n >= 0)
+           for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+             a[16 * i_outer + i_inner] = 0.0f;
+    ...
 
 Notice how loopy has automatically generated guard conditionals to make
 sure the bounds on the old iname are obeyed.
@@ -703,8 +701,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     >>> knl = lp.set_options(knl, "write_cl")
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
-    #define gid(N) ((int) get_group_id(N))
-    <BLANKLINE>
+    ...
     __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
-- 
GitLab


From 88799f8d2aa4d4d3be75916ca53c05a8aba2ee05 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Thu, 3 Nov 2016 23:46:51 -0500
Subject: [PATCH 43/55] fixed missing spaces in tutorial

---
 doc/tutorial.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 71547e695..172b3a3bc 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -558,10 +558,10 @@ relation to loop nesting. For example, it's perfectly possible to request
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-       for (int i_inner = 0; i_inner <= 15; ++i_inner)
-         if (-1 + -1 * i_inner + n >= 0)
-           for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
-             a[16 * i_outer + i_inner] = 0.0f;
+      for (int i_inner = 0; i_inner <= 15; ++i_inner)
+        if (-1 + -1 * i_inner + n >= 0)
+          for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+            a[16 * i_outer + i_inner] = 0.0f;
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
-- 
GitLab


From fe66563a2367066f6ea306b64951afc983e5104d Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 4 Nov 2016 00:13:00 -0500
Subject: [PATCH 44/55] added ToCountMap.filter_by_func

---
 loopy/statistics.py     | 35 +++++++++++++++++++++++++++++++++++
 test/test_statistics.py |  6 +++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9736fa0d2..e7f6f7953 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -68,6 +68,7 @@ class ToCountMap(object):
     """Maps any type of key to an arithmetic type.
 
     .. automethod:: filter_by
+    .. automethod:: filter_by_func
     .. automethod:: group_by
     .. automethod:: to_bytes
     .. automethod:: sum
@@ -181,6 +182,40 @@ class ToCountMap(object):
 
         return result_map
 
+    def filter_by_func(self, func):
+        """Keep items that pass a test.
+
+        :parameter func: A function that takes a map key as a parameter and
+                         returns a :class:`bool`.
+
+        :return: A :class:`ToCountMap` containing the subset of the items in
+                 the original :class:`ToCountMap` for which func(key) is true.
+
+        Example usage::
+
+            # (first create loopy kernel and specify array data types)
+
+            params = {'n': 512, 'm': 256, 'l': 128}
+            mem_map = lp.get_mem_access_map(knl)
+            def filter_func(key):
+                return key.stride > 1 and key.stride <= 4
+
+            filtered_map = mem_map.filter_by_func(filter_func)
+            tot = filtered_map.eval_and_sum(params)
+
+            # (now use these counts to predict performance)
+
+        """
+
+        result_map = ToCountMap()
+
+        # for each item in self.count_map, call func on the key
+        for self_key, self_val in self.items():
+            if func(self_key):
+                result_map[self_key] = self_val
+
+        return result_map
+
     def group_by(self, *args):
         """Group map items together, distinguishing by only the key fields
            passed in args.
diff --git a/test/test_statistics.py b/test/test_statistics.py
index cdeef2b0e..05d857667 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -793,7 +793,11 @@ def test_summations_and_filters():
     assert mul_all == n*m*l + n*m
     assert f64ops_all == n*m
 
-
+    def func_filter(key):
+        return (key.stride < 1) and (to_loopy_type(key.dtype) == to_loopy_type(np.float64)) and \
+               (key.direction == 'load')
+    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
+    assert s1f64l == 2*n*m
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-- 
GitLab


From 6dbbc1a002fb1ba8cd87249c06654d6fec330512 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 4 Nov 2016 00:40:42 -0500
Subject: [PATCH 45/55] removed unnecessary parens

---
 test/test_statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_statistics.py b/test/test_statistics.py
index 05d857667..fb502045c 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -794,8 +794,8 @@ def test_summations_and_filters():
     assert f64ops_all == n*m
 
     def func_filter(key):
-        return (key.stride < 1) and (to_loopy_type(key.dtype) == to_loopy_type(np.float64)) and \
-               (key.direction == 'load')
+        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \
+               key.direction == 'load'
     s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
     assert s1f64l == 2*n*m
 
-- 
GitLab


From 8b122d0aa11b201bbc5c474b69769fd3dc69caaa Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 4 Nov 2016 00:41:29 -0500
Subject: [PATCH 46/55] added filter_by_func example to tutorial

---
 doc/tutorial.rst | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 172b3a3bc..e7a87505f 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1410,7 +1410,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
     >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store')
     ...                            ].eval_with_dict(param_dict)
     >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored))
-    bytes loaded: 7340032 
+    bytes loaded: 7340032
     bytes stored: 2621440
 
 One can see how these functions might be useful in computing, for example,
@@ -1500,6 +1500,20 @@ changed:
     f64 ld g: 65536
     f64 st e: 65536
 
+We can also filter with an arbitrary test function using
+:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter
+criteria are more complicated than a simple list of allowable values:
+
+.. doctest::
+
+    >>> def f(key):
+    ...     from loopy.types import to_loopy_type
+    ...     return key.dtype == to_loopy_type(np.float32) and \
+    ...            key.stride > 1
+    >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict)
+    >>> print(count)
+    2097152
+
 Counting synchronization events
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-- 
GitLab


From 0a7bbfa5fb77fa2df2d8656da9fc2adbfb4b2f83 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Sat, 5 Nov 2016 18:38:52 -0500
Subject: [PATCH 47/55] no longer specifying exact PwQPolynomial in some
 doctests

---
 doc/tutorial.rst | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 844387c1d..8ee322a23 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1435,12 +1435,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1476,12 +1476,12 @@ switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
-- 
GitLab


From 04cb86b45d3962f29f994b835eb911931b5cfdfc Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 9 Nov 2016 17:17:58 -0600
Subject: [PATCH 48/55] removed fixed TODO

---
 loopy/statistics.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5bed272a1..eac4ceafb 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -36,7 +36,6 @@ from loopy.kernel.data import MultiAssignmentBase
 from loopy.diagnostic import warn_with_kernel, LoopyError
 
 
-#TODO does this work for class functions?
 __doc__ = """
 
 .. currentmodule:: loopy
-- 
GitLab


From 86beb578cbb419466d4416a40f2f0251cbfd9b48 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 9 Nov 2016 18:29:43 -0600
Subject: [PATCH 49/55] removed unnecessary code from Mappers

---
 loopy/statistics.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index eac4ceafb..d28d2c14a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -530,25 +530,15 @@ class ExpressionOpCounter(CombineMapper):
     map_tagged_variable = map_constant
     map_variable = map_constant
 
-    #def map_wildcard(self, expr):
-    #    return 0,0
-
-    #def map_function_symbol(self, expr):
-    #    return 0,0
-
     def map_call(self, expr):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='func:'+str(expr.function)): 1}
                     ) + self.rec(expr.parameters)
 
-    # def map_call_with_kwargs(self, expr):  # implemented in CombineMapper
-
-    def map_subscript(self, expr):  # implemented in CombineMapper
+    def map_subscript(self, expr):
         return self.rec(expr.index)
 
-    # def map_lookup(self, expr):  # implemented in CombineMapper
-
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
@@ -675,8 +665,6 @@ class LocalSubscriptCounter(CombineMapper):
 
         if name in self.knl.temporary_variables:
             array = self.knl.temporary_variables[name]
-            #print("array: ", array)
-            #print("is local? ", array.is_local)
             if array.is_local:
                 return ToCountMap(
                         {MemAccess(mtype='local',
@@ -736,7 +724,6 @@ class LocalSubscriptCounter(CombineMapper):
                + self.rec(expr.else_)
 
     map_min = map_bitwise_or
-    map_max = map_min
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("LocalSubscriptCounter encountered "
@@ -760,8 +747,6 @@ class LocalSubscriptCounter(CombineMapper):
 # }}}
 
 
-
-
 # {{{ GlobalSubscriptCounter
 
 class GlobalSubscriptCounter(CombineMapper):
@@ -923,7 +908,6 @@ class GlobalSubscriptCounter(CombineMapper):
                + self.rec(expr.else_)
 
     map_min = map_bitwise_or
-    map_max = map_min
 
     def map_common_subexpression(self, expr):
         raise NotImplementedError("GlobalSubscriptCounter encountered "
-- 
GitLab


From 59f1355e46dd548e8c3db20bbeec33e4a4d12600 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 9 Nov 2016 18:42:50 -0600
Subject: [PATCH 50/55] removing deprecated functions from reference

---
 loopy/statistics.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index d28d2c14a..0bc91451e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -44,15 +44,8 @@ __doc__ = """
 .. autoclass:: Op
 .. autoclass:: MemAccess
 
-.. autofunction:: get_op_poly
 .. autofunction:: get_op_map
-
-.. autofunction:: get_lmem_access_poly
-.. autofunction:: get_DRAM_access_poly
-.. autofunction:: get_gmem_access_poly
 .. autofunction:: get_mem_access_map
-
-.. autofunction:: get_synchronization_poly
 .. autofunction:: get_synchronization_map
 
 .. autofunction:: gather_access_footprints
-- 
GitLab


From 16d3301e68ce5264833b7162673e4710ee898cce Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Wed, 9 Nov 2016 21:08:12 -0600
Subject: [PATCH 51/55] refactored map_subscript in LocalSubscriptCounter to
 eliminate unnecessary recursive call

---
 loopy/statistics.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 0bc91451e..d5e4c43c9 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -654,17 +654,13 @@ class LocalSubscriptCounter(CombineMapper):
         return self.rec(expr.parameters)
 
     def map_subscript(self, expr):
+        sub_map = ToCountMap()
         name = expr.aggregate.name  # name of array
-
         if name in self.knl.temporary_variables:
             array = self.knl.temporary_variables[name]
             if array.is_local:
-                return ToCountMap(
-                        {MemAccess(mtype='local',
-                                   dtype=self.type_inf(expr)): 1}
-                        ) + self.rec(expr.index)
-
-        return self.rec(expr.index)
+                sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1
+        return sub_map + self.rec(expr.index)
             
     def map_sum(self, expr):
         if expr.children:
-- 
GitLab


From b64d8af8de9fe309b8fa347fc83e1d06245aab19 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 18 Nov 2016 12:50:27 -0600
Subject: [PATCH 52/55] better handling of case where min_tag_axis != 0

---
 loopy/statistics.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index d5e4c43c9..6c9742e52 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -586,10 +586,10 @@ class ExpressionOpCounter(CombineMapper):
     def map_logical_not(self, expr):
         return self.rec(expr.child)
 
-    def map_logical_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
+    #def map_logical_or(self, expr):
+    #    return sum(self.rec(child) for child in expr.children)
 
-    map_logical_and = map_logical_or
+    #map_logical_and = map_logical_or
 
     def map_if(self, expr):
         warnings.warn("ExpressionOpCounter counting ops as "
@@ -796,6 +796,18 @@ class GlobalSubscriptCounter(CombineMapper):
                                          variable=name): 1}
                              ) + self.rec(expr.index)
 
+        if min_tag_axis != 0:
+            warn_with_kernel(knl, "unknown_gmem_stride",
+                                  "GlobalSubscriptCounter: "
+                                  "Memory access minimum tag axis %d != 0, "
+                                  "stride unknown, using sys.maxsize."
+                                  % (min_tag_axis))
+            #TODO switch all warnings to loopy warnings warn_with_kernel
+            return ToCountMap({MemAccess(mtype='global',
+                                         dtype=self.type_inf(expr),
+                                         stride=sys.maxsize, variable=name): 1}
+                             ) + self.rec(expr.index)
+
         # get local_id associated with minimum tag axis
         min_lid = None
         for iname in my_inames:
@@ -807,8 +819,7 @@ class GlobalSubscriptCounter(CombineMapper):
 
         # found local_id associated with minimum tag axis
 
-        total_stride = None
-        extra_stride = 1
+        total_stride = 0
         # check coefficient of min_lid for each axis
         from loopy.symbolic import CoefficientCollector
         from loopy.kernel.array import FixedStrideArrayDimTag
@@ -830,17 +841,7 @@ class GlobalSubscriptCounter(CombineMapper):
             else:
                 continue
 
-            total_stride = stride*coeff_min_lid*extra_stride
-            #TODO is there a case where this^ does not execute,
-            # or executes more than once for two different axes?
-
-        #TODO temporary fix that needs changing:
-        if min_tag_axis != 0:
-            print("... min tag axis (%d) is not zero! ..." % (min_tag_axis))
-            return ToCountMap({MemAccess(mtype='global',
-                                         dtype=self.type_inf(expr),
-                                         stride=sys.maxsize, variable=name): 1}
-                             ) + self.rec(expr.index)
+            total_stride += stride*coeff_min_lid
 
         return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr),
                                      stride=total_stride, variable=name): 1}
-- 
GitLab


From f5415bc0147ac05754d493762531fe0fb7a57fc5 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 18 Nov 2016 13:16:14 -0600
Subject: [PATCH 53/55] switching warnings to warn_with_kernel

---
 loopy/statistics.py | 75 +++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 6c9742e52..07916022e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -592,14 +592,16 @@ class ExpressionOpCounter(CombineMapper):
     #map_logical_and = map_logical_or
 
     def map_if(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if-statement branches.")
+        warn_with_kernel(self.knl, "summing_if_branches_ops", 
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("ExpressionOpCounter counting ops as "
-                      "sum of if_pos-statement branches.")
+        warn_with_kernel(self.knl, "summing_ifpos_branches_ops",
+                         "ExpressionOpCounter counting ops as sum of "
+                         "if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
@@ -701,14 +703,16 @@ class LocalSubscriptCounter(CombineMapper):
     map_logical_and = map_logical_or
 
     def map_if(self, expr):
-        warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
-                      "sum of if-statement branches.")
+        warn_with_kernel(self.knl, "summing_if_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("LocalSubscriptCounter counting LMEM accesses as "
-                      "sum of if_pos-statement branches.")
+        warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", 
+                         "LocalSubscriptCounter counting LMEM accesses as sum "
+                         "of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
@@ -797,12 +801,10 @@ class GlobalSubscriptCounter(CombineMapper):
                              ) + self.rec(expr.index)
 
         if min_tag_axis != 0:
-            warn_with_kernel(knl, "unknown_gmem_stride",
-                                  "GlobalSubscriptCounter: "
-                                  "Memory access minimum tag axis %d != 0, "
-                                  "stride unknown, using sys.maxsize."
-                                  % (min_tag_axis))
-            #TODO switch all warnings to loopy warnings warn_with_kernel
+            warn_with_kernel(self.knl, "unknown_gmem_stride",
+                             "GlobalSubscriptCounter: Memory access minimum "
+                             "tag axis %d != 0, stride unknown, using "
+                             "sys.maxsize." % (min_tag_axis))
             return ToCountMap({MemAccess(mtype='global',
                                          dtype=self.type_inf(expr),
                                          stride=sys.maxsize, variable=name): 1}
@@ -886,14 +888,16 @@ class GlobalSubscriptCounter(CombineMapper):
     map_logical_and = map_logical_or
 
     def map_if(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if-statement branches.")
+        warn_with_kernel(self.knl, "summing_if_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if-statement branches.")
         return self.rec(expr.condition) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
     def map_if_positive(self, expr):
-        warnings.warn("GlobalSubscriptCounter counting GMEM accesses as "
-                      "sum of if_pos-statement branches.")
+        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", 
+                         "GlobalSubscriptCounter counting GMEM accesses as "
+                         "sum of if_pos-statement branches.")
         return self.rec(expr.criterion) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
@@ -1093,9 +1097,8 @@ def get_op_poly(knl, numpy_types=True):
     get_op_poly is deprecated. Use get_op_map instead.
 
     """
-    from warnings import warn
-    warn("get_op_poly is deprecated. Use get_op_map instead.",
-         DeprecationWarning, stacklevel=2)
+    warn_with_kernel(knl, "depricated_get_op_poly",
+                     "get_op_poly is deprecated. Use get_op_map instead.")
     return get_op_map(knl, numpy_types)
 
 # }}}
@@ -1166,10 +1169,10 @@ def get_lmem_access_poly(knl):
     result with the mtype=['local'] option.
 
     """
-    from warnings import warn
-    warn("get_lmem_access_poly is deprecated. Use get_mem_access_map and "
-         "filter the result with the mtype=['local'] option.",
-         DeprecationWarning, stacklevel=2)
+    warn_with_kernel(knl, "depricated_get_lmem_access_poly",
+                     "get_lmem_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['local'] option.")
     return get_mem_access_map(knl).filter_by(mtype=['local'])
 
 
@@ -1180,10 +1183,10 @@ def get_DRAM_access_poly(knl):
     result with the mtype=['global'] option.
 
     """
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
-         "filter the result with the mtype=['global'] option.",
-         DeprecationWarning, stacklevel=2)
+    warn_with_kernel(knl, "depricated_get_DRAM_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
     return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 
@@ -1196,10 +1199,10 @@ def get_gmem_access_poly(knl):
     result with the mtype=['global'] option.
 
     """
-    from warnings import warn
-    warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and "
-         "filter the result with the mtype=['global'] option.",
-         DeprecationWarning, stacklevel=2)
+    warn_with_kernel(knl, "depricated_get_gmem_access_poly",
+                     "get_DRAM_access_poly is deprecated. Use "
+                     "get_mem_access_map and filter the result with the "
+                     "mtype=['global'] option.")
     return get_mem_access_map(knl).filter_by(mtype=['global'])
 
 # }}}
@@ -1349,9 +1352,9 @@ def get_synchronization_poly(knl):
     get_synchronization_poly is deprecated. Use get_synchronization_map instead.
 
     """
-    from warnings import warn
-    warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.",
-         DeprecationWarning, stacklevel=2)
+    warn_with_kernel(knl, "depricated_get_synchronization_poly",
+                     "get_synchronization_poly is deprecated. Use "
+                     "get_synchronization_map instead.")
     return get_synchronization_map(knl)
 
 # }}}
-- 
GitLab


From 064a318d894cc2a26e4d66492eb7ca898cf3aa8a Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 18 Nov 2016 13:39:55 -0600
Subject: [PATCH 54/55] removed map functions that are implemented in parent
 mapper

---
 loopy/statistics.py | 72 ---------------------------------------------
 1 file changed, 72 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 07916022e..dae6b5bfb 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -580,17 +580,6 @@ class ExpressionOpCounter(CombineMapper):
     map_bitwise_xor = map_bitwise_or
     map_bitwise_and = map_bitwise_or
 
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    def map_logical_not(self, expr):
-        return self.rec(expr.child)
-
-    #def map_logical_or(self, expr):
-    #    return sum(self.rec(child) for child in expr.children)
-
-    #map_logical_and = map_logical_or
-
     def map_if(self, expr):
         warn_with_kernel(self.knl, "summing_if_branches_ops", 
                          "ExpressionOpCounter counting ops as sum of "
@@ -672,36 +661,9 @@ class LocalSubscriptCounter(CombineMapper):
 
     map_product = map_sum
 
-    def map_quotient(self, expr, *args):
-        return self.rec(expr.numerator) + self.rec(expr.denominator)
-
-    map_floor_div = map_quotient
-    map_remainder = map_quotient
-
-    def map_power(self, expr):
-        return self.rec(expr.base) + self.rec(expr.exponent)
-
-    def map_left_shift(self, expr):
-        return self.rec(expr.shiftee)+self.rec(expr.shift)
-
-    map_right_shift = map_left_shift
-
-    def map_bitwise_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_bitwise_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_bitwise_xor = map_bitwise_or
-    map_bitwise_and = map_bitwise_or
-
     def map_comparison(self, expr):
         return self.rec(expr.left)+self.rec(expr.right)
 
-    map_logical_not = map_bitwise_not
-    map_logical_or = map_bitwise_or
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
         warn_with_kernel(self.knl, "summing_if_branches_lsubs", 
                          "LocalSubscriptCounter counting LMEM accesses as sum "
@@ -716,8 +678,6 @@ class LocalSubscriptCounter(CombineMapper):
         return self.rec(expr.criterion) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
-    map_min = map_bitwise_or
-
     def map_common_subexpression(self, expr):
         raise NotImplementedError("LocalSubscriptCounter encountered "
                                   "common_subexpression, "
@@ -857,36 +817,6 @@ class GlobalSubscriptCounter(CombineMapper):
 
     map_product = map_sum
 
-    def map_quotient(self, expr, *args):
-        return self.rec(expr.numerator) + self.rec(expr.denominator)
-
-    map_floor_div = map_quotient
-    map_remainder = map_quotient
-
-    def map_power(self, expr):
-        return self.rec(expr.base) + self.rec(expr.exponent)
-
-    def map_left_shift(self, expr):
-        return self.rec(expr.shiftee)+self.rec(expr.shift)
-
-    map_right_shift = map_left_shift
-
-    def map_bitwise_not(self, expr):
-        return self.rec(expr.child)
-
-    def map_bitwise_or(self, expr):
-        return sum(self.rec(child) for child in expr.children)
-
-    map_bitwise_xor = map_bitwise_or
-    map_bitwise_and = map_bitwise_or
-
-    def map_comparison(self, expr):
-        return self.rec(expr.left)+self.rec(expr.right)
-
-    map_logical_not = map_bitwise_not
-    map_logical_or = map_bitwise_or
-    map_logical_and = map_logical_or
-
     def map_if(self, expr):
         warn_with_kernel(self.knl, "summing_if_branches_gsubs", 
                          "GlobalSubscriptCounter counting GMEM accesses as "
@@ -901,8 +831,6 @@ class GlobalSubscriptCounter(CombineMapper):
         return self.rec(expr.criterion) + self.rec(expr.then) \
                + self.rec(expr.else_)
 
-    map_min = map_bitwise_or
-
     def map_common_subexpression(self, expr):
         raise NotImplementedError("GlobalSubscriptCounter encountered "
                                   "common_subexpression, "
-- 
GitLab


From bc3512da0b115aaa1076e2d3d5b15ccf85f7511a Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@porter.cs.illinois.edu>
Date: Fri, 18 Nov 2016 13:48:44 -0600
Subject: [PATCH 55/55] updating TypeInferenceMapper import statement

---
 loopy/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9b676e69b..2ec5eb0d4 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -629,7 +629,7 @@ class LocalSubscriptCounter(CombineMapper):
 
     def __init__(self, knl):
         self.knl = knl
-        from loopy.expression import TypeInferenceMapper
+        from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl)
 
     def combine(self, values):
-- 
GitLab