From 0494c442a6b10dce04adc57439b83823253cbd7d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sat, 12 Mar 2016 18:09:05 -0600 Subject: [PATCH 01/55] replaced consec, uniform, and nonconsec subscript types with DataAccess class --- loopy/statistics.py | 78 +++++++++++++++++++++++------------------ test/test_statistics.py | 70 +++++++++++++++++++----------------- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index c273edd54..5faeb12e3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -105,6 +105,18 @@ def stringify_stats_mapping(m): return result +class DataAccess: + + def __init__(self, stride=0): + self.stride = stride + + def __eq__(self, other): + return isinstance(other, DataAccess) and other.stride == self.stride #TODO is this okay? + + def __hash__(self): + return hash(self.stride) + + # {{{ ExpressionOpCounter class ExpressionOpCounter(CombineMapper): @@ -277,67 +289,63 @@ class GlobalSubscriptCounter(CombineMapper): from loopy.symbolic import get_dependencies from loopy.kernel.data import LocalIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() - local_id0 = None + + # find min tag axis + import sys + min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: - # find local id0 tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - if tag.axis == 0: - local_id0 = iname - break # there will be only one local_id0 + if tag.axis < min_tag_axis: + min_tag_axis = tag.axis if not local_id_found: # count as uniform access return ToCountMap( - {(self.type_inf(expr), 'uniform'): 1} + {(self.type_inf(expr), DataAccess(stride=0)): 1} ) + self.rec(expr.index) - if local_id0 is None: - # only non-zero local id(s) found, assume non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) + # get local_id associated with minimum tag axis + min_local_id = None + for iname 
in my_inames: + tag = self.knl.iname_to_tag.get(iname) + if isinstance(tag, LocalIndexTag): + if tag.axis == min_tag_axis: + min_local_id = iname + break # there will be only one min local_id + + # found local_id associated with minimum tag axis - # check coefficient of local_id0 for each axis + total_stride = None + # check coefficient of min_local_id for each axis from loopy.symbolic import CoefficientCollector from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): coeffs = CoefficientCollector()(idx) - # check if he contains the lid 0 guy + # check if he contains the min lid guy try: - coeff_id0 = coeffs[Variable(local_id0)] + coeff_min_lid = coeffs[Variable(min_local_id)] except KeyError: - # does not contain local_id0 + # does not contain min_local_id continue - if coeff_id0 != 1: - # non-consecutive access - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # coefficient is 1, now determine if stride is 1 + # found coefficient of min_local_id + # now determine stride from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: continue - if stride != 1: - # non-consecutive - return ToCountMap( - {(self.type_inf(expr), 'nonconsecutive'): 1} - ) + self.rec(expr.index) - - # else, stride == 1, continue since another idx could contain id0 + total_stride = stride*coeff_min_lid + #TODO is there a case where this^ does not execute, or executes more than once for two different axes? 
- # loop finished without returning, stride==1 for every instance of local_id0 - return ToCountMap( - {(self.type_inf(expr), 'consecutive'): 1} - ) + self.rec(expr.index) + return ToCountMap({(self.type_inf(expr), + DataAccess(stride=total_stride)): 1} + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -717,13 +725,13 @@ def get_gmem_access_poly(knl): # for now just counting subscripts # use count excluding local index tags for uniform accesses for key in subs_expr.dict: poly = ToCountMap({key: subs_expr.dict[key]}) - if key[1] == "uniform": + if key[1].stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) for key in subs_assignee.dict: poly = ToCountMap({key: subs_assignee.dict[key]}) - if key[1] == "uniform": + if key[1].stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) diff --git a/test/test_statistics.py b/test/test_statistics.py index 2cf537f5e..6e5b6270b 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,9 +28,15 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp -from loopy.statistics import get_op_poly, get_gmem_access_poly, get_barrier_poly +from loopy.statistics import ( + get_op_poly, + get_gmem_access_poly, + get_barrier_poly, + DataAccess) + import numpy as np +from pymbolic.primitives import Variable def test_op_counter_basic(): @@ -227,19 +233,19 @@ def test_gmem_access_counter_basic(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m f32 = poly[ - 
(np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -261,12 +267,12 @@ def test_gmem_access_counter_reduction(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m*l f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == n*l @@ -289,16 +295,16 @@ def test_gmem_access_counter_logic(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f64 == n*m @@ -323,19 +329,19 @@ def test_gmem_access_counter_specialops(): l = 128 params = {'n': n, 'm': m, 'l': l} f32 = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m f32 = poly[ - (np.dtype(np.float32), 'uniform', 'store') + (np.dtype(np.float32), DataAccess(stride=0), 'store') ].eval_with_dict(params) f64 = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert f32 == 
n*m*l assert f64 == n*m @@ -364,12 +370,12 @@ def test_gmem_access_counter_bitwise(): l = 128 params = {'n': n, 'm': m, 'l': l} i32 = poly[ - (np.dtype(np.int32), 'uniform', 'load') + (np.dtype(np.int32), DataAccess(stride=0), 'load') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l i32 = poly[ - (np.dtype(np.int32), 'uniform', 'store') + (np.dtype(np.int32), DataAccess(stride=0), 'store') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -398,23 +404,23 @@ def test_gmem_access_counter_mixed(): l = 128 params = {'n': n, 'm': m, 'l': l} f64uniform = poly[ - (np.dtype(np.float64), 'uniform', 'load') + (np.dtype(np.float64), DataAccess(stride=0), 'load') ].eval_with_dict(params) f32uniform = poly[ - (np.dtype(np.float32), 'uniform', 'load') + (np.dtype(np.float32), DataAccess(stride=0), 'load') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l f64uniform = poly[ - (np.dtype(np.float64), 'uniform', 'store') + (np.dtype(np.float64), DataAccess(stride=0), 'store') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store') ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -442,19 +448,19 @@ def test_gmem_access_counter_nonconsec(): l = 128 params = {'n': n, 'm': m, 'l': l} f64nonconsec = poly[ - (np.dtype(np.float64), 'nonconsecutive', 'load') + (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l f64nonconsec = poly[ - 
(np.dtype(np.float64), 'nonconsecutive', 'store') + (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'store') ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), 'nonconsecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -482,19 +488,19 @@ def test_gmem_access_counter_consec(): params = {'n': n, 'm': m, 'l': l} f64consec = poly[ - (np.dtype(np.float64), 'consecutive', 'load') + (np.dtype(np.float64), DataAccess(stride=1), 'load') ].eval_with_dict(params) f32consec = poly[ - (np.dtype(np.float32), 'consecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=1), 'load') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l f64consec = poly[ - (np.dtype(np.float64), 'consecutive', 'store') + (np.dtype(np.float64), DataAccess(stride=1), 'store') ].eval_with_dict(params) f32consec = poly[ - (np.dtype(np.float32), 'consecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=1), 'store') ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -588,17 +594,17 @@ def test_all_counters_parallel_matmul(): subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map[ - (np.dtype(np.float32), 'nonconsecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) f32coal = subscript_map[ - (np.dtype(np.float32), 'consecutive', 'load') + (np.dtype(np.float32), DataAccess(stride=1), 'load') ].eval_with_dict(params) assert f32uncoal == n*m*l assert f32coal == n*m*l f32coal = subscript_map[ - (np.dtype(np.float32), 'consecutive', 'store') + (np.dtype(np.float32), DataAccess(stride=1), 'store') ].eval_with_dict(params) assert f32coal == n*l -- GitLab From 09420000fb7a565fd3fa64a6ca8e8a609eae8008 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 13 Mar 2016 13:23:59 -0500 Subject: [PATCH 02/55] subscript 
counter only looking for lid0 now, if not found, setting stride to maxsize --- loopy/statistics.py | 35 ++++++++++++++++++++++++----------- test/test_statistics.py | 2 +- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5faeb12e3..e10de8cb4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -291,15 +291,19 @@ class GlobalSubscriptCounter(CombineMapper): my_inames = get_dependencies(index) & self.knl.all_inames() # find min tag axis - import sys - min_tag_axis = sys.maxsize + #import sys + local_id0 = None + #min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - if tag.axis < min_tag_axis: - min_tag_axis = tag.axis + #if tag.axis < min_tag_axis: + # min_tag_axis = tag.axis + if tag.axis == 0: + local_id0 = iname + break if not local_id_found: # count as uniform access @@ -307,6 +311,15 @@ class GlobalSubscriptCounter(CombineMapper): {(self.type_inf(expr), DataAccess(stride=0)): 1} ) + self.rec(expr.index) + if local_id0 is None: + # only non-zero local id(s) found, assume non-consecutive access + #TODO what to do here? 
+ import sys + return ToCountMap( + {(self.type_inf(expr), DataAccess(stride=sys.maxsize)): 1} + ) + self.rec(expr.index) + + ''' # get local_id associated with minimum tag axis min_local_id = None for iname in my_inames: @@ -315,11 +328,11 @@ class GlobalSubscriptCounter(CombineMapper): if tag.axis == min_tag_axis: min_local_id = iname break # there will be only one min local_id + ''' - # found local_id associated with minimum tag axis - + # found local_id associated with axis 0 total_stride = None - # check coefficient of min_local_id for each axis + # check coefficient of local_id0 for each axis from loopy.symbolic import CoefficientCollector from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): @@ -327,12 +340,12 @@ class GlobalSubscriptCounter(CombineMapper): coeffs = CoefficientCollector()(idx) # check if he contains the min lid guy try: - coeff_min_lid = coeffs[Variable(min_local_id)] + coeff_lid0 = coeffs[Variable(local_id0)] except KeyError: - # does not contain min_local_id + # does not contain local_id0 continue - # found coefficient of min_local_id + # found coefficient of local_id0 # now determine stride from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): @@ -340,7 +353,7 @@ class GlobalSubscriptCounter(CombineMapper): else: continue - total_stride = stride*coeff_min_lid + total_stride = stride*coeff_lid0 #TODO is there a case where this^ does not execute, or executes more than once for two different axes? 
return ToCountMap({(self.type_inf(expr), diff --git a/test/test_statistics.py b/test/test_statistics.py index 6e5b6270b..6aec20444 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -594,7 +594,7 @@ def test_all_counters_parallel_matmul(): subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') + (np.dtype(np.float32), DataAccess(stride=sys.maxsize), 'load') ].eval_with_dict(params) f32coal = subscript_map[ (np.dtype(np.float32), DataAccess(stride=1), 'load') -- GitLab From 2dbeb3877549f4564d96aae3314ab6636d8c8a56 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 15 Mar 2016 18:55:39 -0500 Subject: [PATCH 03/55] now calculating strides greater than 1 --- loopy/statistics.py | 56 +++++++++++++++++++++-------------------- test/test_statistics.py | 2 +- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index e10de8cb4..eff571668 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -291,19 +291,19 @@ class GlobalSubscriptCounter(CombineMapper): my_inames = get_dependencies(index) & self.knl.all_inames() # find min tag axis - #import sys - local_id0 = None - #min_tag_axis = sys.maxsize + import sys + #local_id0 = None + min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): local_id_found = True - #if tag.axis < min_tag_axis: - # min_tag_axis = tag.axis - if tag.axis == 0: - local_id0 = iname - break + if tag.axis < min_tag_axis: + min_tag_axis = tag.axis + #if tag.axis == 0: + # local_id0 = iname + # break if not local_id_found: # count as uniform access @@ -311,49 +311,51 @@ class GlobalSubscriptCounter(CombineMapper): {(self.type_inf(expr), DataAccess(stride=0)): 1} ) + self.rec(expr.index) - if local_id0 is None: - # only non-zero local id(s) found, assume non-consecutive access - #TODO what to do here? 
- import sys - return ToCountMap( - {(self.type_inf(expr), DataAccess(stride=sys.maxsize)): 1} - ) + self.rec(expr.index) - - ''' # get local_id associated with minimum tag axis - min_local_id = None + min_lid = None for iname in my_inames: tag = self.knl.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): if tag.axis == min_tag_axis: - min_local_id = iname + min_lid = iname break # there will be only one min local_id - ''' - # found local_id associated with axis 0 + # found local_id associated with minimum tag axis + total_stride = None # check coefficient of local_id0 for each axis from loopy.symbolic import CoefficientCollector from pymbolic.primitives import Variable + #print("==========================================================================================") + #print("expr: ", expr) + #print("min_lid: ", min_lid) + #print("min_tag_axis: ", min_tag_axis) + #print("Var(min_lid): ", Variable(min_lid)) for idx, axis_tag in zip(index, array.dim_tags): - + #print("...........................................................................................") + #print("idx, axis_tag: ", idx, "\t", axis_tag) coeffs = CoefficientCollector()(idx) + #print("coeffs: ", coeffs) # check if he contains the min lid guy try: - coeff_lid0 = coeffs[Variable(local_id0)] + coeff_min_lid = coeffs[Variable(min_lid)] except KeyError: - # does not contain local_id0 + # does not contain min_lid + #print("key error") continue - - # found coefficient of local_id0 + #print("coeff_min_lid: ", coeff_min_lid) + #print("axis_tag: ", axis_tag) + # found coefficient of min_lid # now determine stride from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: + #print("continuing") continue + #print("stride: ", stride) - total_stride = stride*coeff_lid0 + total_stride = stride*coeff_min_lid #TODO is there a case where this^ does not execute, or executes more than once for two different axes? 
return ToCountMap({(self.type_inf(expr), diff --git a/test/test_statistics.py b/test/test_statistics.py index 6aec20444..6e5b6270b 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -594,7 +594,7 @@ def test_all_counters_parallel_matmul(): subscript_map = get_gmem_access_poly(knl) f32uncoal = subscript_map[ - (np.dtype(np.float32), DataAccess(stride=sys.maxsize), 'load') + (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') ].eval_with_dict(params) f32coal = subscript_map[ (np.dtype(np.float32), DataAccess(stride=1), 'load') -- GitLab From 0573d5457aacb21ce73e754304f86e35671784fa Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 15 Mar 2016 22:43:17 -0500 Subject: [PATCH 04/55] added StridedGmemAccess class, now used as key in gmem access dicts --- loopy/statistics.py | 58 +++++++-------- test/test_statistics.py | 156 +++++++++++++++++----------------------- 2 files changed, 91 insertions(+), 123 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index eff571668..96bf511b5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -105,16 +105,24 @@ def stringify_stats_mapping(m): return result -class DataAccess: +class StridedGmemAccess: - def __init__(self, stride=0): + def __init__(self, dtype, stride, direction=None): + self.dtype = dtype self.stride = stride + self.direction = direction def __eq__(self, other): - return isinstance(other, DataAccess) and other.stride == self.stride #TODO is this okay? 
+ return isinstance(other, StridedGmemAccess) and ( + other.dtype == self.dtype and + other.stride == self.stride and + other.direction == self.direction ) def __hash__(self): - return hash(self.stride) + if self.direction == None: + return hash(str(self.dtype)+str(self.stride)+"None") + else: + return hash(str(self.dtype)+str(self.stride)+self.direction) # {{{ ExpressionOpCounter @@ -292,7 +300,6 @@ class GlobalSubscriptCounter(CombineMapper): # find min tag axis import sys - #local_id0 = None min_tag_axis = sys.maxsize local_id_found = False for iname in my_inames: @@ -301,14 +308,11 @@ class GlobalSubscriptCounter(CombineMapper): local_id_found = True if tag.axis < min_tag_axis: min_tag_axis = tag.axis - #if tag.axis == 0: - # local_id0 = iname - # break if not local_id_found: # count as uniform access return ToCountMap( - {(self.type_inf(expr), DataAccess(stride=0)): 1} + {StridedGmemAccess(self.type_inf(expr), 0): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -326,41 +330,27 @@ class GlobalSubscriptCounter(CombineMapper): # check coefficient of local_id0 for each axis from loopy.symbolic import CoefficientCollector from pymbolic.primitives import Variable - #print("==========================================================================================") - #print("expr: ", expr) - #print("min_lid: ", min_lid) - #print("min_tag_axis: ", min_tag_axis) - #print("Var(min_lid): ", Variable(min_lid)) for idx, axis_tag in zip(index, array.dim_tags): - #print("...........................................................................................") - #print("idx, axis_tag: ", idx, "\t", axis_tag) coeffs = CoefficientCollector()(idx) - #print("coeffs: ", coeffs) # check if he contains the min lid guy try: coeff_min_lid = coeffs[Variable(min_lid)] except KeyError: # does not contain min_lid - #print("key error") continue - #print("coeff_min_lid: ", coeff_min_lid) - #print("axis_tag: ", axis_tag) # found coefficient of min_lid # now 
determine stride from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: - #print("continuing") continue - #print("stride: ", stride) total_stride = stride*coeff_min_lid #TODO is there a case where this^ does not execute, or executes more than once for two different axes? - return ToCountMap({(self.type_inf(expr), - DataAccess(stride=total_stride)): 1} - ) + self.rec(expr.index) + return ToCountMap({StridedGmemAccess(self.type_inf(expr), + total_stride): 1}) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -727,26 +717,28 @@ def get_gmem_access_poly(knl): # for now just counting subscripts for insn in knl.instructions: # count subscripts, distinguishing loads and stores subs_expr = subscript_counter(insn.expression) - subs_expr = ToCountMap(dict( - (key + ("load",), val) - for key, val in six.iteritems(subs_expr.dict))) + for key in subs_expr.dict: + subs_expr.dict[StridedGmemAccess( + key.dtype, key.stride, 'load') + ] = subs_expr.dict.pop(key) subs_assignee = subscript_counter(insn.assignee) - subs_assignee = ToCountMap(dict( - (key + ("store",), val) - for key, val in six.iteritems(subs_assignee.dict))) + for key in subs_assignee.dict: + subs_assignee.dict[StridedGmemAccess( + key.dtype, key.stride, 'store') + ] = subs_assignee.dict.pop(key) insn_inames = knl.insn_inames(insn) # use count excluding local index tags for uniform accesses for key in subs_expr.dict: poly = ToCountMap({key: subs_expr.dict[key]}) - if key[1].stride == 0: + if isinstance(key.stride, int) and key.stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) for key in subs_assignee.dict: poly = ToCountMap({key: subs_assignee.dict[key]}) - if key[1].stride == 0: + if isinstance(key.stride, int) and key.stride == 0: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = 
subs_poly + poly*get_insn_count(knl, insn_inames) diff --git a/test/test_statistics.py b/test/test_statistics.py index 6e5b6270b..a4fc022d5 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -32,7 +32,7 @@ from loopy.statistics import ( get_op_poly, get_gmem_access_poly, get_barrier_poly, - DataAccess) + StridedGmemAccess) import numpy as np @@ -232,21 +232,17 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'store') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -266,14 +262,12 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + ].eval_with_dict(params) assert f32 == 2*n*m*l - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + ].eval_with_dict(params) assert f32 == n*l @@ -294,18 +288,15 @@ def test_gmem_access_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), 
DataAccess(stride=0), 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + ].eval_with_dict(params) assert f64 == n*m @@ -328,21 +319,17 @@ def test_gmem_access_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'load') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'store') - ].eval_with_dict(params) - f64 = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -369,14 +356,12 @@ def test_gmem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[ - (np.dtype(np.int32), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'load') + ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[ - (np.dtype(np.int32), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 
'store') + ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -403,24 +388,21 @@ def test_gmem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'load') - ].eval_with_dict(params) - f32uniform = poly[ - (np.dtype(np.float32), DataAccess(stride=0), 'load') - ].eval_with_dict(params) + f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + ].eval_with_dict(params) + f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') + StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'load') ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[ - (np.dtype(np.float64), DataAccess(stride=0), 'store') - ].eval_with_dict(params) + f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + ].eval_with_dict(params) f32nonconsec = poly[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'store') + StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'store') ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -447,21 +429,21 @@ def test_gmem_access_counter_nonconsec(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = poly[ - (np.dtype(np.float64), DataAccess(stride=Variable('m')), 'load') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'load') - ].eval_with_dict(params) + f64nonconsec = poly[StridedGmemAccess( + np.dtype(np.float64), Variable('m'), 'load') + ].eval_with_dict(params) + f32nonconsec = poly[StridedGmemAccess( + np.dtype(np.float32), Variable('m')*Variable('l'), 'load') + ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[ - (np.dtype(np.float64), 
DataAccess(stride=Variable('m')), 'store') - ].eval_with_dict(params) - f32nonconsec = poly[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')*Variable('l')), 'store') - ].eval_with_dict(params) + f64nonconsec = poly[StridedGmemAccess( + np.dtype(np.float64), Variable('m'), 'store') + ].eval_with_dict(params) + f32nonconsec = poly[StridedGmemAccess( + np.dtype(np.float32), Variable('m')*Variable('l'), 'store') + ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -487,21 +469,17 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = poly[ - (np.dtype(np.float64), DataAccess(stride=1), 'load') - ].eval_with_dict(params) - f32consec = poly[ - (np.dtype(np.float32), DataAccess(stride=1), 'load') - ].eval_with_dict(params) + f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'load') + ].eval_with_dict(params) + f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'load') + ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[ - (np.dtype(np.float64), DataAccess(stride=1), 'store') - ].eval_with_dict(params) - f32consec = poly[ - (np.dtype(np.float32), DataAccess(stride=1), 'store') - ].eval_with_dict(params) + f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'store') + ].eval_with_dict(params) + f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'store') + ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -593,19 +571,17 @@ def test_all_counters_parallel_matmul(): assert i32ops == n*m*l*4 + l*n*4 subscript_map = get_gmem_access_poly(knl) - f32uncoal = subscript_map[ - (np.dtype(np.float32), DataAccess(stride=Variable('m')), 'load') - ].eval_with_dict(params) - f32coal = subscript_map[ - (np.dtype(np.float32), DataAccess(stride=1), 'load') - ].eval_with_dict(params) + f32uncoal = subscript_map[StridedGmemAccess( + np.dtype(np.float32), Variable('m'), 'load') + 
].eval_with_dict(params) + f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'load') + ].eval_with_dict(params) assert f32uncoal == n*m*l assert f32coal == n*m*l - f32coal = subscript_map[ - (np.dtype(np.float32), DataAccess(stride=1), 'store') - ].eval_with_dict(params) + f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'store') + ].eval_with_dict(params) assert f32coal == n*l -- GitLab From c0dcf557e5996990e80fecc65ae46268728e876f Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 15 Mar 2016 23:39:45 -0500 Subject: [PATCH 05/55] replaced operation tuples with TypedOp class --- loopy/statistics.py | 39 +++++++++++++++++-------- test/test_statistics.py | 65 +++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 96bf511b5..5855f0852 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -105,6 +105,21 @@ def stringify_stats_mapping(m): return result +class TypedOp: + + def __init__(self, dtype, name): + self.dtype = dtype + self.name = name + + def __eq__(self, other): + return isinstance(other, TypedOp) and ( + other.dtype == self.dtype and + other.name == self.name ) + + def __hash__(self): + return hash(str(self.dtype)+self.name) + + class StridedGmemAccess: def __init__(self, dtype, stride, direction=None): @@ -151,7 +166,7 @@ class ExpressionOpCounter(CombineMapper): def map_call(self, expr): return ToCountMap( - {(self.type_inf(expr), 'func:'+str(expr.function)): 1} + {TypedOp(self.type_inf(expr), 'func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) # def map_call_with_kwargs(self, expr): # implemented in CombineMapper @@ -164,20 +179,20 @@ class ExpressionOpCounter(CombineMapper): def map_sum(self, expr): assert expr.children return ToCountMap( - {(self.type_inf(expr), 'add'): len(expr.children)-1} + {TypedOp(self.type_inf(expr), 'add'): len(expr.children)-1} ) + sum(self.rec(child) for child in 
expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({(self.type_inf(expr), 'mul'): 1}) + return sum(ToCountMap({TypedOp(self.type_inf(expr), 'mul'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({(self.type_inf(expr), 'mul'): -1}) + ToCountMap({TypedOp(self.type_inf(expr), 'mul'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({(self.type_inf(expr), 'div'): 1}) \ + return ToCountMap({TypedOp(self.type_inf(expr), 'div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -185,24 +200,24 @@ class ExpressionOpCounter(CombineMapper): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({(self.type_inf(expr), 'pow'): 1}) \ + return ToCountMap({TypedOp(self.type_inf(expr), 'pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({(self.type_inf(expr), 'shift'): 1}) \ + return ToCountMap({TypedOp(self.type_inf(expr), 'shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({(self.type_inf(expr), 'bw'): 1}) \ + return ToCountMap({TypedOp(self.type_inf(expr), 'bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap( - {(self.type_inf(expr), 'bw'): len(expr.children)-1} + {TypedOp(self.type_inf(expr), 'bw'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -230,9 +245,9 @@ class ExpressionOpCounter(CombineMapper): return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap( - {(self.type_inf(expr), 'maxmin'): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + return ToCountMap({TypedOp( + self.type_inf(expr), 'maxmin'): len(expr.children)-1} + ) + sum(self.rec(child) for 
child in expr.children) map_max = map_min diff --git a/test/test_statistics.py b/test/test_statistics.py index a4fc022d5..5d6fac573 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -32,7 +32,8 @@ from loopy.statistics import ( get_op_poly, get_gmem_access_poly, get_barrier_poly, - StridedGmemAccess) + StridedGmemAccess, + TypedOp) import numpy as np @@ -57,11 +58,11 @@ def test_op_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params) + f64mul = poly[TypedOp(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 @@ -82,8 +83,8 @@ def test_op_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l @@ -104,10 +105,10 @@ def test_op_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - f64div = poly[(np.dtype(np.float64), 
'div')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params) + f64div = poly[TypedOp(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? assert f64add == n*m @@ -133,18 +134,18 @@ def test_op_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params) - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params) - f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsqrt = poly[(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params) + f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) + f64pow = poly[TypedOp(np.dtype(np.float64), 'pow')].eval_with_dict(params) + f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = poly[TypedOp(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = poly[TypedOp(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m - assert f64pow == i32add == f64rsqrt == f64sin == n*m + assert f64pow == i32add == f64rsq == f64sin == n*m def 
test_op_counter_bitwise(): @@ -169,12 +170,12 @@ def test_op_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params) - i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params) - i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) + i32bw = poly[TypedOp(np.dtype(np.int32), 'bw')].eval_with_dict(params) + i64bw = poly[TypedOp(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = poly[TypedOp(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = poly[TypedOp(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = poly[TypedOp(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -203,7 +204,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')] + poly = get_op_poly(knl)[TypedOp(np.dtype(np.float64), 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -555,16 +556,16 @@ def test_all_counters_parallel_matmul(): op_map = get_op_poly(knl) f32mul = op_map[ - (np.dtype(np.float32), 'mul') + TypedOp(np.dtype(np.float32), 'mul') ].eval_with_dict(params) f32add = op_map[ - (np.dtype(np.float32), 'add') + TypedOp(np.dtype(np.float32), 'add') ].eval_with_dict(params) i32ops = op_map[ - (np.dtype(np.int32), 'add') + TypedOp(np.dtype(np.int32), 'add') ].eval_with_dict(params) i32ops += op_map[ - (np.dtype(np.int32), 'mul') + TypedOp(np.dtype(np.int32), 'mul') ].eval_with_dict(params) assert f32mul+f32add == n*m*l*2 -- GitLab From b3d9498ca3bc92b1b23428f859ee83f610bc7a16 Mon Sep 17 00:00:00 2001 From: 
James Stevens Date: Fri, 1 Apr 2016 11:29:01 -0500 Subject: [PATCH 06/55] temporary fix for stride counting when min tag axis >0, and added variable name to StridedGmemAccess (still work to do) --- loopy/statistics.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5855f0852..3f2c3a4b5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -122,22 +122,30 @@ class TypedOp: class StridedGmemAccess: - def __init__(self, dtype, stride, direction=None): + #TODO "ANY_VAR" does not work yet + + def __init__(self, dtype, stride, direction=None, variable='ANY_VAR'): self.dtype = dtype self.stride = stride self.direction = direction + self.variable = variable def __eq__(self, other): return isinstance(other, StridedGmemAccess) and ( other.dtype == self.dtype and other.stride == self.stride and - other.direction == self.direction ) + other.direction == self.direction and + ((self.variable == 'ANY_VAR' or other.variable == 'ANY_VAR') or + self.variable == other.variable)) def __hash__(self): - if self.direction == None: - return hash(str(self.dtype)+str(self.stride)+"None") - else: - return hash(str(self.dtype)+str(self.stride)+self.direction) + direction = self.direction + variable = self.variable + if direction == None: + direction = 'None' + if variable == None: + variable = 'ANY_VAR' + return hash(str(self.dtype)+str(self.stride)+direction+variable) # {{{ ExpressionOpCounter @@ -310,7 +318,7 @@ class GlobalSubscriptCounter(CombineMapper): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, GroupIndexTag my_inames = get_dependencies(index) & self.knl.all_inames() # find min tag axis @@ -327,7 +335,7 @@ class GlobalSubscriptCounter(CombineMapper): if not local_id_found: # count as uniform access return ToCountMap( - {StridedGmemAccess(self.type_inf(expr), 
0): 1} + {StridedGmemAccess(self.type_inf(expr), 0, direction=None, variable=name): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -342,8 +350,10 @@ class GlobalSubscriptCounter(CombineMapper): # found local_id associated with minimum tag axis total_stride = None - # check coefficient of local_id0 for each axis + extra_stride = 1 + # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector + from loopy.kernel.array import FixedStrideArrayDimTag from pymbolic.primitives import Variable for idx, axis_tag in zip(index, array.dim_tags): coeffs = CoefficientCollector()(idx) @@ -355,17 +365,22 @@ class GlobalSubscriptCounter(CombineMapper): continue # found coefficient of min_lid # now determine stride - from loopy.kernel.array import FixedStrideArrayDimTag if isinstance(axis_tag, FixedStrideArrayDimTag): stride = axis_tag.stride else: continue - total_stride = stride*coeff_min_lid + total_stride = stride*coeff_min_lid*extra_stride #TODO is there a case where this^ does not execute, or executes more than once for two different axes? + #TODO temporary fix that needs changing: + if min_tag_axis != 0: + print("...... min tag axis (%d) is not zero! ......" 
% (min_tag_axis)) + return ToCountMap({StridedGmemAccess(self.type_inf(expr), + sys.maxsize, direction=None, variable=name): 1}) + self.rec(expr.index) + return ToCountMap({StridedGmemAccess(self.type_inf(expr), - total_stride): 1}) + self.rec(expr.index) + total_stride, direction=None, variable=name): 1}) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -734,12 +749,12 @@ def get_gmem_access_poly(knl): # for now just counting subscripts subs_expr = subscript_counter(insn.expression) for key in subs_expr.dict: subs_expr.dict[StridedGmemAccess( - key.dtype, key.stride, 'load') + key.dtype, key.stride, direction='load', variable=key.variable) ] = subs_expr.dict.pop(key) subs_assignee = subscript_counter(insn.assignee) for key in subs_assignee.dict: subs_assignee.dict[StridedGmemAccess( - key.dtype, key.stride, 'store') + key.dtype, key.stride, direction='store', variable=key.variable) ] = subs_assignee.dict.pop(key) insn_inames = knl.insn_inames(insn) -- GitLab From 138c0b1e553a4f2a6edcdba15a6deaee9e1b28d0 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 1 Apr 2016 11:29:25 -0500 Subject: [PATCH 07/55] added variable name to StridedGmemAccess (still work to do) --- test/test_statistics.py | 107 ++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 31 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 5d6fac573..56b9f4003 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -233,16 +233,20 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') + ].eval_with_dict(params) + f64 = 
poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') + ].eval_with_dict(params) + f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -263,11 +267,13 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') ].eval_with_dict(params) assert f32 == 2*n*m*l - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') ].eval_with_dict(params) assert f32 == n*l @@ -289,14 +295,14 @@ def test_gmem_access_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='g') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') ].eval_with_dict(params) assert f64 == n*m @@ -320,16 
+326,20 @@ def test_gmem_access_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') + ].eval_with_dict(params) + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'store') + f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -357,11 +367,19 @@ def test_gmem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'load') + i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='a') + ].eval_with_dict(params) + i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='b') + ].eval_with_dict(params) + i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='g') + ].eval_with_dict(params) + i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='h') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, 'store') + i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='c') + ].eval_with_dict(params) + i32 += 
poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='e') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -389,21 +407,26 @@ def test_gmem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'load') + f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') ].eval_with_dict(params) - f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, 'load') + f64uniform += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') + ].eval_with_dict(params) + f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='x') ].eval_with_dict(params) f32nonconsec = poly[ - StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'load') + StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='a') + ].eval_with_dict(params) + f32nonconsec += poly[ + StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='b') ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, 'store') + f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') ].eval_with_dict(params) f32nonconsec = poly[ - StridedGmemAccess(np.dtype(np.float32), Variable('m'), 'store') + StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='store', variable='c') ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -431,19 +454,25 @@ def test_gmem_access_counter_nonconsec(): l = 128 params = {'n': n, 'm': m, 'l': l} f64nonconsec = poly[StridedGmemAccess( - np.dtype(np.float64), Variable('m'), 'load') + np.dtype(np.float64), Variable('m'), direction='load', variable='g') + ].eval_with_dict(params) + f64nonconsec += poly[StridedGmemAccess( + np.dtype(np.float64), Variable('m'), 
direction='load', variable='h') ].eval_with_dict(params) f32nonconsec = poly[StridedGmemAccess( - np.dtype(np.float32), Variable('m')*Variable('l'), 'load') + np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='a') + ].eval_with_dict(params) + f32nonconsec += poly[StridedGmemAccess( + np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='b') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l f64nonconsec = poly[StridedGmemAccess( - np.dtype(np.float64), Variable('m'), 'store') + np.dtype(np.float64), Variable('m'), direction='store', variable='e') ].eval_with_dict(params) f32nonconsec = poly[StridedGmemAccess( - np.dtype(np.float32), Variable('m')*Variable('l'), 'store') + np.dtype(np.float32), Variable('m')*Variable('l'), direction='store', variable='c') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -470,16 +499,20 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'load') + f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='g') ].eval_with_dict(params) - f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'load') + f64consec += poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='h') + ].eval_with_dict(params) + f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a') + ].eval_with_dict(params) + f32consec += poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, 'store') + f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='store', variable='e') ].eval_with_dict(params) - f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, 'store') + f32consec = 
poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c') ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -572,16 +605,28 @@ def test_all_counters_parallel_matmul(): assert i32ops == n*m*l*4 + l*n*4 subscript_map = get_gmem_access_poly(knl) + #f32uncoal = subscript_map[StridedGmemAccess( + # np.dtype(np.float32), Variable('m'), direction='load', variable='ANY_VAR') + # ].eval_with_dict(params) + #test = StridedGmemAccess(np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR') + #print("test key: ", test.dtype, test.stride, test.direction, test.variable) + #for key in subscript_map: + # print(key.dtype, key.stride, key.direction, key.variable) + f32uncoal = subscript_map[StridedGmemAccess( + np.dtype(np.float32), sys.maxsize, direction='load', variable='a') + ].eval_with_dict(params) + ''' f32uncoal = subscript_map[StridedGmemAccess( - np.dtype(np.float32), Variable('m'), 'load') + np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR') ].eval_with_dict(params) - f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'load') + ''' + f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b') ].eval_with_dict(params) assert f32uncoal == n*m*l assert f32coal == n*m*l - f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 'store') + f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c') ].eval_with_dict(params) assert f32coal == n*l -- GitLab From e16469d799a21f7cc5459a445c64bc05a21209cc Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 1 Apr 2016 17:35:36 -0500 Subject: [PATCH 08/55] adding local mem access counter --- loopy/statistics.py | 188 ++++++++++++++++++++++++++++++++++++++++ test/test_statistics.py | 35 ++++---- 2 files changed, 204 insertions(+), 19 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3f2c3a4b5..419eb2868 100755 --- 
a/loopy/statistics.py +++ b/loopy/statistics.py @@ -120,6 +120,24 @@ class TypedOp: return hash(str(self.dtype)+self.name) +class LmemAccess: + + def __init__(self, dtype, direction=None): + self.dtype = dtype + self.direction = direction + + def __eq__(self, other): + return isinstance(other, LmemAccess) and ( + other.dtype == self.dtype and + other.direction == self.direction) + + def __hash__(self): + direction = self.direction + if direction == None: + direction = 'None' + return hash(str(self.dtype)+direction) + + class StridedGmemAccess: #TODO "ANY_VAR" does not work yet @@ -279,6 +297,116 @@ class ExpressionOpCounter(CombineMapper): # }}} +# {{{ LocalSubscriptCounter + +class LocalSubscriptCounter(CombineMapper): + + def __init__(self, knl): + self.knl = knl + from loopy.expression import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + map_tagged_variable = map_constant + map_variable = map_constant + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_subscript(self, expr): + name = expr.aggregate.name # name of array + + if name in self.knl.temporary_variables: + array = self.knl.temporary_variables[name] + #print("array: ", array) + #print("is local? 
", array.is_local) + if array.is_local: + return ToCountMap( + {LmemAccess(self.type_inf(expr), direction=None): 1} + ) + self.rec(expr.index) + + return self.rec(expr.index) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_quotient(self, expr, *args): + return self.rec(expr.numerator) + self.rec(expr.denominator) + + map_floor_div = map_quotient + map_remainder = map_quotient + + def map_power(self, expr): + return self.rec(expr.base) + self.rec(expr.exponent) + + def map_left_shift(self, expr): + return self.rec(expr.shiftee)+self.rec(expr.shift) + + map_right_shift = map_left_shift + + def map_bitwise_not(self, expr): + return self.rec(expr.child) + + def map_bitwise_or(self, expr): + return sum(self.rec(child) for child in expr.children) + + map_bitwise_xor = map_bitwise_or + map_bitwise_and = map_bitwise_or + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + map_logical_not = map_bitwise_not + map_logical_or = map_bitwise_or + map_logical_and = map_logical_or + + def map_if(self, expr): + warnings.warn("LocalSubscriptCounter counting LMEM accesses as " + "sum of if-statement branches.") + return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + + def map_if_positive(self, expr): + warnings.warn("LocalSubscriptCounter counting LMEM accesses as " + "sum of if_pos-statement branches.") + return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + + map_min = map_bitwise_or + map_max = map_min + + def map_common_subexpression(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "common_subexpression, " + "map_common_subexpression not implemented.") + + def map_substitution(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "substitution, " + "map_substitution not implemented.") + + def map_derivative(self, 
expr): + raise NotImplementedError("LocalSubscriptCounter encountered " + "derivative, " + "map_derivative not implemented.") + + def map_slice(self, expr): + raise NotImplementedError("LocalSubscriptCounter encountered slice, " + "map_slice not implemented.") + +# }}} + + + + # {{{ GlobalSubscriptCounter class GlobalSubscriptCounter(CombineMapper): @@ -674,6 +802,66 @@ def sum_ops_to_dtypes(op_poly_dict): return result +def get_lmem_access_poly(knl): + + """Count the number of local memory accesses in a loopy kernel. + """ + + from loopy.preprocess import preprocess_kernel, infer_unknown_types + + class CacheHolder(object): + pass + + cache_holder = CacheHolder() + + @memoize_in(cache_holder, "insn_count") + def get_insn_count(knl, insn_inames): + inames_domain = knl.get_inames_domain(insn_inames) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) + return count(knl, domain) + + knl = infer_unknown_types(knl, expect_completion=True) + knl = preprocess_kernel(knl) + + subs_poly = ToCountMap() + subscript_counter = LocalSubscriptCounter(knl) + for insn in knl.instructions: + # count subscripts, distinguishing loads and stores + subs_expr = subscript_counter(insn.expression) + for key in subs_expr.dict: + subs_expr.dict[LmemAccess( + key.dtype, direction='load') + ] = subs_expr.dict.pop(key) + subs_assignee = subscript_counter(insn.assignee) + for key in subs_assignee.dict: + print(key.dtype, key.direction, subs_assignee.dict[key]) + + # for now, not counting stores in local mem + ''' + for key in subs_assignee.dict: + subs_assignee.dict[LmemAccess( + key.dtype, direction='store') + ] = subs_assignee.dict.pop(key) + ''' + + insn_inames = knl.insn_inames(insn) + + # use count excluding local index tags for uniform accesses + for key in subs_expr.dict: + poly = ToCountMap({key: subs_expr.dict[key]}) + subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + + # for now, not counting stores in local mem + ''' + for key in 
subs_assignee.dict: + poly = ToCountMap({key: subs_assignee.dict[key]}) + subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + ''' + + return subs_poly.dict + + # {{{ get_gmem_access_poly def get_gmem_access_poly(knl): # for now just counting subscripts diff --git a/test/test_statistics.py b/test/test_statistics.py index 56b9f4003..0353ac0d4 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -31,8 +31,10 @@ import loopy as lp from loopy.statistics import ( get_op_poly, get_gmem_access_poly, + get_lmem_access_poly, get_barrier_poly, StridedGmemAccess, + LmemAccess, TypedOp) import numpy as np @@ -578,6 +580,9 @@ def test_all_counters_parallel_matmul(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") + knl = lp.split_iname(knl, "k", 16) + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) n = 512 m = 256 @@ -585,7 +590,7 @@ def test_all_counters_parallel_matmul(): params = {'n': n, 'm': m, 'l': l} barrier_count = get_barrier_poly(knl).eval_with_dict(params) - assert barrier_count == 0 + assert barrier_count == 2*m/16 op_map = get_op_poly(knl) f32mul = op_map[ @@ -602,35 +607,27 @@ def test_all_counters_parallel_matmul(): ].eval_with_dict(params) assert f32mul+f32add == n*m*l*2 - assert i32ops == n*m*l*4 + l*n*4 subscript_map = get_gmem_access_poly(knl) - #f32uncoal = subscript_map[StridedGmemAccess( - # np.dtype(np.float32), Variable('m'), direction='load', variable='ANY_VAR') - # ].eval_with_dict(params) - #test = StridedGmemAccess(np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR') - #print("test key: ", test.dtype, test.stride, test.direction, test.variable) - #for key in subscript_map: - # print(key.dtype, key.stride, key.direction, key.variable) - f32uncoal = subscript_map[StridedGmemAccess( - 
np.dtype(np.float32), sys.maxsize, direction='load', variable='a') - ].eval_with_dict(params) - ''' - f32uncoal = subscript_map[StridedGmemAccess( - np.dtype(np.float32), sys.maxsize, direction='load', variable='ANY_VAR') - ].eval_with_dict(params) - ''' + f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b') ].eval_with_dict(params) + f32coal += subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a') + ].eval_with_dict(params) - assert f32uncoal == n*m*l - assert f32coal == n*m*l + assert f32coal == n*m+m*l f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c') ].eval_with_dict(params) assert f32coal == n*l + local_subs_map = get_lmem_access_poly(knl) + + local_subs_l = local_subs_map[LmemAccess(np.dtype(np.float32), direction='load') + ].eval_with_dict(params) + + assert local_subs_l == n*m*l*2 def test_gather_access_footprint(): knl = lp.make_kernel( -- GitLab From 0d069a656e5661e9dc82df8b5d8097e2b538a98b Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 1 Sep 2016 12:30:23 -0500 Subject: [PATCH 09/55] commenting out debug print --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 419eb2868..8b57b782b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -834,8 +834,8 @@ def get_lmem_access_poly(knl): key.dtype, direction='load') ] = subs_expr.dict.pop(key) subs_assignee = subscript_counter(insn.assignee) - for key in subs_assignee.dict: - print(key.dtype, key.direction, subs_assignee.dict[key]) + #for key in subs_assignee.dict: + # print(key.dtype, key.direction, subs_assignee.dict[key]) # for now, not counting stores in local mem ''' -- GitLab From 1a3e4259784c0b791cbed8f06c8da6c49d0ce272 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sat, 15 Oct 2016 19:33:56 -0500 Subject: [PATCH 10/55] TypedOp -> Op --- loopy/statistics.py | 26 
++++++++--------- test/test_statistics.py | 62 ++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0257623ed..684a683c5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -108,14 +108,14 @@ def stringify_stats_mapping(m): return result -class TypedOp: +class Op: def __init__(self, dtype, name): self.dtype = dtype self.name = name def __eq__(self, other): - return isinstance(other, TypedOp) and ( + return isinstance(other, Op) and ( other.dtype == self.dtype and other.name == self.name ) @@ -195,7 +195,7 @@ class ExpressionOpCounter(CombineMapper): def map_call(self, expr): return ToCountMap( - {TypedOp(self.type_inf(expr), 'func:'+str(expr.function)): 1} + {Op(self.type_inf(expr), 'func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) # def map_call_with_kwargs(self, expr): # implemented in CombineMapper @@ -208,20 +208,20 @@ class ExpressionOpCounter(CombineMapper): def map_sum(self, expr): assert expr.children return ToCountMap( - {TypedOp(self.type_inf(expr), 'add'): len(expr.children)-1} + {Op(self.type_inf(expr), 'add'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({TypedOp(self.type_inf(expr), 'mul'): 1}) + return sum(ToCountMap({Op(self.type_inf(expr), 'mul'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({TypedOp(self.type_inf(expr), 'mul'): -1}) + ToCountMap({Op(self.type_inf(expr), 'mul'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({TypedOp(self.type_inf(expr), 'div'): 1}) \ + return ToCountMap({Op(self.type_inf(expr), 'div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -229,24 +229,24 @@ class ExpressionOpCounter(CombineMapper): map_remainder = map_quotient def map_power(self, expr): - return 
ToCountMap({TypedOp(self.type_inf(expr), 'pow'): 1}) \ + return ToCountMap({Op(self.type_inf(expr), 'pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({TypedOp(self.type_inf(expr), 'shift'): 1}) \ + return ToCountMap({Op(self.type_inf(expr), 'shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({TypedOp(self.type_inf(expr), 'bw'): 1}) \ + return ToCountMap({Op(self.type_inf(expr), 'bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap( - {TypedOp(self.type_inf(expr), 'bw'): len(expr.children)-1} + {Op(self.type_inf(expr), 'bw'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -274,7 +274,7 @@ class ExpressionOpCounter(CombineMapper): return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({TypedOp( + return ToCountMap({Op( self.type_inf(expr), 'maxmin'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) @@ -807,7 +807,7 @@ def get_op_poly(knl, numpy_types=True): if numpy_types: result = dict( - (TypedOp(op.dtype.numpy_dtype, op.name), count) + (Op(op.dtype.numpy_dtype, op.name), count) for op, count in six.iteritems(result)) return result diff --git a/test/test_statistics.py b/test/test_statistics.py index c2d1d459c..4b4a344f4 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -35,7 +35,7 @@ from loopy.statistics import ( get_synchronization_poly, StridedGmemAccess, LmemAccess, - TypedOp) + Op) import loopy as lp import numpy as np @@ -60,11 +60,11 @@ def test_op_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = 
poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params) - f64mul = poly[TypedOp(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params) + f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 @@ -85,8 +85,8 @@ def test_op_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l @@ -107,10 +107,10 @@ def test_op_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params) - f64div = poly[TypedOp(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? 
assert f64add == n*m @@ -136,14 +136,14 @@ def test_op_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[TypedOp(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[TypedOp(np.dtype(np.float32), 'div')].eval_with_dict(params) - f32add = poly[TypedOp(np.dtype(np.float32), 'add')].eval_with_dict(params) - f64pow = poly[TypedOp(np.dtype(np.float64), 'pow')].eval_with_dict(params) - f64add = poly[TypedOp(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsq = poly[TypedOp(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[TypedOp(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params) + f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f64pow = poly[Op(np.dtype(np.float64), 'pow')].eval_with_dict(params) + f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = poly[Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m @@ -172,12 +172,12 @@ def test_op_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[TypedOp(np.dtype(np.int32), 'add')].eval_with_dict(params) - i32bw = poly[TypedOp(np.dtype(np.int32), 'bw')].eval_with_dict(params) - i64bw = poly[TypedOp(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[TypedOp(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[TypedOp(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[TypedOp(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = 
poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + i32bw = poly[Op(np.dtype(np.int32), 'bw')].eval_with_dict(params) + i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = poly[Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -206,7 +206,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = lp.get_op_poly(knl)[TypedOp(np.dtype(np.float64), 'mul')] + poly = lp.get_op_poly(knl)[Op(np.dtype(np.float64), 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -600,16 +600,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_poly(knl) f32mul = op_map[ - TypedOp(np.dtype(np.float32), 'mul') + Op(np.dtype(np.float32), 'mul') ].eval_with_dict(params) f32add = op_map[ - TypedOp(np.dtype(np.float32), 'add') + Op(np.dtype(np.float32), 'add') ].eval_with_dict(params) i32ops = op_map[ - TypedOp(np.dtype(np.int32), 'add') + Op(np.dtype(np.int32), 'add') ].eval_with_dict(params) i32ops += op_map[ - TypedOp(np.dtype(np.int32), 'mul') + Op(np.dtype(np.int32), 'mul') ].eval_with_dict(params) assert f32mul+f32add == n*m*l*2 -- GitLab From fadc351819db1060fec85307bf3ed92ee461621c Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sat, 15 Oct 2016 20:37:00 -0500 Subject: [PATCH 11/55] merged StridedGmemAccess and LmemAccess into MemAccess, also reformatted long lines of code --- loopy/statistics.py | 131 ++++++++++++++--------- test/test_statistics.py | 229 ++++++++++++++++++++++++---------------- 2 files changed, 219 insertions(+), 141 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 684a683c5..a08e36c50 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -122,7 +122,7 @@ class Op: def __hash__(self): return 
hash(str(self.dtype)+self.name) - +''' class LmemAccess: def __init__(self, dtype, direction=None): @@ -139,20 +139,25 @@ class LmemAccess: if direction == None: direction = 'None' return hash(str(self.dtype)+direction) +''' -class StridedGmemAccess: +#class StridedGmemAccess: +class MemAccess: #TODO "ANY_VAR" does not work yet - - def __init__(self, dtype, stride, direction=None, variable='ANY_VAR'): + #TODO currently counting all lmem access as stride-1 + def __init__(self, mtype, dtype, stride=1, direction=None, + variable='ANY_VAR'): + self.mtype = mtype self.dtype = dtype self.stride = stride self.direction = direction self.variable = variable def __eq__(self, other): - return isinstance(other, StridedGmemAccess) and ( + return isinstance(other, MemAccess) and ( + other.mtype == self.mtype and other.dtype == self.dtype and other.stride == self.stride and other.direction == self.direction and @@ -166,7 +171,9 @@ class StridedGmemAccess: direction = 'None' if variable == None: variable = 'ANY_VAR' - return hash(str(self.dtype)+str(self.stride)+direction+variable) + return hash(str(self.mtype)+str(self.dtype)+str(self.stride) + +direction+variable) + # {{{ ExpressionOpCounter @@ -266,16 +273,18 @@ class ExpressionOpCounter(CombineMapper): def map_if(self, expr): warnings.warn("ExpressionOpCounter counting ops as " "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_if_positive(self, expr): warnings.warn("ExpressionOpCounter counting ops as " "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_min(self, expr): return ToCountMap({Op( - self.type_inf(expr), 'maxmin'): len(expr.children)-1} + self.type_inf(expr), 'maxmin'): len(expr.children)-1} ) + 
sum(self.rec(child) for child in expr.children) map_max = map_min @@ -286,11 +295,13 @@ class ExpressionOpCounter(CombineMapper): "map_common_subexpression not implemented.") def map_substitution(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered substitution, " + raise NotImplementedError("ExpressionOpCounter encountered " + "substitution, " "map_substitution not implemented.") def map_derivative(self, expr): - raise NotImplementedError("ExpressionOpCounter encountered derivative, " + raise NotImplementedError("ExpressionOpCounter encountered " + "derivative, " "map_derivative not implemented.") def map_slice(self, expr): @@ -330,7 +341,7 @@ class LocalSubscriptCounter(CombineMapper): #print("is local? ", array.is_local) if array.is_local: return ToCountMap( - {LmemAccess(self.type_inf(expr), direction=None): 1} + {MemAccess('local', self.type_inf(expr)): 1} ) + self.rec(expr.index) return self.rec(expr.index) @@ -376,12 +387,14 @@ class LocalSubscriptCounter(CombineMapper): def map_if(self, expr): warnings.warn("LocalSubscriptCounter counting LMEM accesses as " "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_if_positive(self, expr): warnings.warn("LocalSubscriptCounter counting LMEM accesses as " "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) map_min = map_bitwise_or map_max = map_min @@ -465,9 +478,9 @@ class GlobalSubscriptCounter(CombineMapper): if not local_id_found: # count as uniform access - return ToCountMap( - {StridedGmemAccess(self.type_inf(expr), 0, direction=None, variable=name): 1} - ) + self.rec(expr.index) + return ToCountMap({MemAccess('global', self.type_inf(expr), + stride=0, variable=name): 1} + ) + self.rec(expr.index) 
# get local_id associated with minimum tag axis min_lid = None @@ -504,16 +517,19 @@ class GlobalSubscriptCounter(CombineMapper): continue total_stride = stride*coeff_min_lid*extra_stride - #TODO is there a case where this^ does not execute, or executes more than once for two different axes? + #TODO is there a case where this^ does not execute, + # or executes more than once for two different axes? #TODO temporary fix that needs changing: if min_tag_axis != 0: - print("...... min tag axis (%d) is not zero! ......" % (min_tag_axis)) - return ToCountMap({StridedGmemAccess(self.type_inf(expr), - sys.maxsize, direction=None, variable=name): 1}) + self.rec(expr.index) + print("... min tag axis (%d) is not zero! ..." % (min_tag_axis)) + return ToCountMap({MemAccess('global', self.type_inf(expr), + stride=sys.maxsize, variable=name): 1} + ) + self.rec(expr.index) - return ToCountMap({StridedGmemAccess(self.type_inf(expr), - total_stride, direction=None, variable=name): 1}) + self.rec(expr.index) + return ToCountMap({MemAccess('global', self.type_inf(expr), + stride=total_stride, variable=name): 1} + ) + self.rec(expr.index) def map_sum(self, expr): if expr.children: @@ -556,12 +572,14 @@ class GlobalSubscriptCounter(CombineMapper): def map_if(self, expr): warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) def map_if_positive(self, expr): warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) + self.rec(expr.else_) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) map_min = map_bitwise_or map_max = map_min @@ -696,7 +714,8 @@ def count(kernel, set): # {{{ rebuild check domain - zero = 
isl.Aff.zero_on_domain(isl.LocalSpace.from_space(bset.space)) + zero = isl.Aff.zero_on_domain( + isl.LocalSpace.from_space(bset.space)) iname = isl.PwAff.from_aff( zero.set_coefficient_val(isl.dim_type.in_, i, 1)) dmin_matched = dmin.insert_dims( @@ -800,7 +819,8 @@ def get_op_poly(knl, numpy_types=True): # check domain size: insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) op_poly = op_poly + ops*count(knl, domain) result = op_poly.dict @@ -854,7 +874,7 @@ def get_lmem_access_poly(knl, numpy_types=True): # count subscripts, distinguishing loads and stores subs_expr = subscript_counter(insn.expression) for key in subs_expr.dict: - subs_expr.dict[LmemAccess( + subs_expr.dict[MemAccess('local', key.dtype, direction='load') ] = subs_expr.dict.pop(key) subs_assignee = subscript_counter(insn.assignee) @@ -864,7 +884,7 @@ def get_lmem_access_poly(knl, numpy_types=True): # for now, not counting stores in local mem ''' for key in subs_assignee.dict: - subs_assignee.dict[LmemAccess( + subs_assignee.dict[MemAccess('local', key.dtype, direction='store') ] = subs_assignee.dict.pop(key) ''' @@ -887,15 +907,15 @@ def get_lmem_access_poly(knl, numpy_types=True): result = subs_poly.dict if numpy_types: - result = dict( - (LmemAccess(mem_access.dtype.numpy_dtype, mem_access.direction), count) - for mem_access, count in six.iteritems(result)) + result = dict((MemAccess('local', mem_access.dtype.numpy_dtype, + direction=mem_access.direction), count) + for mem_access, count in six.iteritems(result)) return result # {{{ get_gmem_access_poly -def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscripts +def get_gmem_access_poly(knl, numpy_types=True): """Count the number of global memory accesses in a loopy kernel. 
@@ -955,7 +975,8 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr if uniform: from loopy.kernel.data import LocalIndexTag insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + isinstance( + knl.iname_to_tag.get(iname), LocalIndexTag)] inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) @@ -970,14 +991,15 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr # count subscripts, distinguishing loads and stores subs_expr = subscript_counter(insn.expression) for key in subs_expr.dict: - subs_expr.dict[StridedGmemAccess( - key.dtype, key.stride, direction='load', variable=key.variable) + subs_expr.dict[MemAccess('global', key.dtype, stride=key.stride, + direction='load', variable=key.variable) ] = subs_expr.dict.pop(key) subs_assignee = subscript_counter(insn.assignee) for key in subs_assignee.dict: - subs_assignee.dict[StridedGmemAccess( - key.dtype, key.stride, direction='store', variable=key.variable) - ] = subs_assignee.dict.pop(key) + subs_assignee.dict[MemAccess('global', key.dtype, + stride=key.stride, direction='store', + variable=key.variable) + ] = subs_assignee.dict.pop(key) insn_inames = knl.insn_inames(insn) @@ -985,31 +1007,36 @@ def get_gmem_access_poly(knl, numpy_types=True): # for now just counting subscr for key in subs_expr.dict: poly = ToCountMap({key: subs_expr.dict[key]}) if isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) + subs_poly = subs_poly \ + + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) for key in subs_assignee.dict: poly = ToCountMap({key: subs_assignee.dict[key]}) if isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True) + subs_poly = subs_poly \ + 
+ poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) result = subs_poly.dict if numpy_types: - result = dict( - (StridedGmemAccess(mem_access.dtype.numpy_dtype, mem_access.stride, - mem_access.direction, mem_access.variable), count) - for mem_access, count in six.iteritems(result)) + result = dict((MemAccess('global', mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable) + , count) + for mem_access, count in six.iteritems(result)) return result def get_DRAM_access_poly(knl): from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_gmem_access_poly instead", - DeprecationWarning, stacklevel=2) + warn("get_DRAM_access_poly is deprecated. " + "Use get_gmem_access_poly instead", + DeprecationWarning, stacklevel=2) return get_gmem_access_poly(knl) # }}} @@ -1100,8 +1127,8 @@ def get_synchronization_poly(knl): iname_list.pop() elif isinstance(sched_item, Barrier): - result = result + ToCountMap( - {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)}) + result = result + ToCountMap({"barrier_%s" % sched_item.kind: + get_count_poly(iname_list)}) elif isinstance(sched_item, CallKernel): result = result + ToCountMap( @@ -1151,7 +1178,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False): insn_inames = kernel.insn_inames(insn) inames_domain = kernel.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) + domain = (inames_domain.project_out_except(insn_inames, + [dim_type.set])) afg = AccessFootprintGatherer(kernel, domain, ignore_uncountable=ignore_uncountable) @@ -1193,7 +1221,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): kernel = preprocess_kernel(kernel) result = {} - fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable) + fp = gather_access_footprints(kernel, + ignore_uncountable=ignore_uncountable) 
for key, var_fp in fp.items(): vname, direction = key diff --git a/test/test_statistics.py b/test/test_statistics.py index 4b4a344f4..54e3b69a8 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -33,8 +33,7 @@ from loopy.statistics import ( get_gmem_access_poly, get_lmem_access_poly, get_synchronization_poly, - StridedGmemAccess, - LmemAccess, + MemAccess, Op) import loopy as lp import numpy as np @@ -54,7 +53,8 @@ def test_op_counter_basic(): name="basic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) poly = lp.get_op_poly(knl) n = 512 m = 256 @@ -130,7 +130,8 @@ def test_op_counter_specialops(): name="specialops", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) poly = lp.get_op_poly(knl) n = 512 m = 256 @@ -235,21 +236,27 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') - ].eval_with_dict(params) - f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') - ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') + f64 = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64 += poly[MemAccess('global', np.dtype(np.float64), + stride=0, 
direction='load', variable='h') ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') - ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') - ].eval_with_dict(params) + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + f64 = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='store', variable='e') + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -269,14 +276,17 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') - ].eval_with_dict(params) - f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='b') ].eval_with_dict(params) assert f32 == 2*n*m*l - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') - ].eval_with_dict(params) + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) assert f32 == n*l @@ -297,15 +307,18 @@ def test_gmem_access_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='g') - ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') - ].eval_with_dict(params) + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64 = poly[MemAccess('global', np.dtype(np.float64), + 
stride=0, direction='load', variable='h') + ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') - ].eval_with_dict(params) + f64 = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='store', variable='e') + ].eval_with_dict(params) assert f64 == n*m @@ -321,28 +334,34 @@ def test_gmem_access_counter_specialops(): ], name="specialops", assumptions="n,m,l >= 1") - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) poly = lp.get_gmem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='a') - ].eval_with_dict(params) - f32 += poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='b') + f32 = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + f32 += poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') - ].eval_with_dict(params) - f64 += poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') + f64 = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64 += poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='store', variable='c') - ].eval_with_dict(params) - f64 = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') - ].eval_with_dict(params) + f32 = poly[MemAccess('global', 
np.dtype(np.float32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + f64 = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='store', variable='e') + ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m @@ -369,19 +388,25 @@ def test_gmem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='a') - ].eval_with_dict(params) - i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='b') + i32 = poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='load', variable='a') + ].eval_with_dict(params) + i32 += poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='load', variable='b') ].eval_with_dict(params) - i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='g') + i32 += poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='load', variable='g') ].eval_with_dict(params) - i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='load', variable='h') + i32 += poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='load', variable='h') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='c') - ].eval_with_dict(params) - i32 += poly[StridedGmemAccess(np.dtype(np.int32), 0, direction='store', variable='e') + i32 = poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='store', variable='c') + ].eval_with_dict(params) + i32 += poly[MemAccess('global', np.dtype(np.int32), + stride=0, direction='store', variable='e') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -409,27 +434,34 @@ def test_gmem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='g') - ].eval_with_dict(params) - f64uniform += 
poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='load', variable='h') - ].eval_with_dict(params) - f32uniform = poly[StridedGmemAccess(np.dtype(np.float32), 0, direction='load', variable='x') + f64uniform = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='g') + ].eval_with_dict(params) + f64uniform += poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='load', variable='h') ].eval_with_dict(params) - f32nonconsec = poly[ - StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='a') - ].eval_with_dict(params) - f32nonconsec += poly[ - StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='load', variable='b') - ].eval_with_dict(params) + f32uniform = poly[MemAccess('global', np.dtype(np.float32), + stride=0, direction='load', variable='x') + ].eval_with_dict(params) + f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m'), direction='load', + variable='a') + ].eval_with_dict(params) + f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m'), direction='load', + variable='b') + ].eval_with_dict(params) assert f64uniform == 2*n*m assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[StridedGmemAccess(np.dtype(np.float64), 0, direction='store', variable='e') - ].eval_with_dict(params) - f32nonconsec = poly[ - StridedGmemAccess(np.dtype(np.float32), Variable('m'), direction='store', variable='c') - ].eval_with_dict(params) + f64uniform = poly[MemAccess('global', np.dtype(np.float64), + stride=0, direction='store', variable='e') + ].eval_with_dict(params) + f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m'), direction='store', + variable='c') + ].eval_with_dict(params) assert f64uniform == n*m assert f32nonconsec == n*m*l @@ -455,27 +487,33 @@ def test_gmem_access_counter_nonconsec(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - 
f64nonconsec = poly[StridedGmemAccess( - np.dtype(np.float64), Variable('m'), direction='load', variable='g') - ].eval_with_dict(params) - f64nonconsec += poly[StridedGmemAccess( - np.dtype(np.float64), Variable('m'), direction='load', variable='h') - ].eval_with_dict(params) - f32nonconsec = poly[StridedGmemAccess( - np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='a') + f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), + stride=Variable('m'), direction='load', + variable='g') + ].eval_with_dict(params) + f64nonconsec += poly[MemAccess('global', np.dtype(np.float64), + stride=Variable('m'), direction='load', + variable='h') ].eval_with_dict(params) - f32nonconsec += poly[StridedGmemAccess( - np.dtype(np.float32), Variable('m')*Variable('l'), direction='load', variable='b') + f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m')*Variable('l'), + direction='load', variable='a') + ].eval_with_dict(params) + f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m')*Variable('l'), + direction='load', variable='b') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[StridedGmemAccess( - np.dtype(np.float64), Variable('m'), direction='store', variable='e') - ].eval_with_dict(params) - f32nonconsec = poly[StridedGmemAccess( - np.dtype(np.float32), Variable('m')*Variable('l'), direction='store', variable='c') - ].eval_with_dict(params) + f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), + stride=Variable('m'), direction='store', + variable='e') + ].eval_with_dict(params) + f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + stride=Variable('m')*Variable('l'), + direction='store', variable='c') + ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*l @@ -501,20 +539,26 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = 
poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='g') + f64consec = poly[MemAccess('global', np.dtype(np.float64), + stride=1, direction='load', variable='g') ].eval_with_dict(params) - f64consec += poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='load', variable='h') + f64consec += poly[MemAccess('global', np.dtype(np.float64), + stride=1, direction='load', variable='h') ].eval_with_dict(params) - f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a') + f32consec = poly[MemAccess('global', np.dtype(np.float32), + stride=1, direction='load', variable='a') ].eval_with_dict(params) - f32consec += poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='b') + f32consec += poly[MemAccess('global', np.dtype(np.float32), + stride=1, direction='load', variable='b') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[StridedGmemAccess(np.dtype(np.float64), 1, direction='store', variable='e') + f64consec = poly[MemAccess('global', np.dtype(np.float64), + stride=1, direction='store', variable='e') ].eval_with_dict(params) - f32consec = poly[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c') + f32consec = poly[MemAccess('global', np.dtype(np.float32), + stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f64consec == n*m assert f32consec == n*m*l @@ -532,8 +576,8 @@ def test_barrier_counter_nobarriers(): ], name="basic", assumptions="n,m,l >= 1") - knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, + g=np.float64, h=np.float64)) sync_poly = lp.get_synchronization_poly(knl) n = 512 m = 256 @@ -616,22 +660,27 @@ def test_all_counters_parallel_matmul(): subscript_map = lp.get_gmem_access_poly(knl) - f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, 
direction='load', variable='b') + f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), + stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='load', variable='a') + f32coal += subscript_map[MemAccess('global', np.dtype(np.float32), + stride=1, direction='load', variable='a') ].eval_with_dict(params) assert f32coal == n*m+m*l - f32coal = subscript_map[StridedGmemAccess(np.dtype(np.float32), 1, direction='store', variable='c') + f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), + stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f32coal == n*l local_subs_map = get_lmem_access_poly(knl) - local_subs_l = local_subs_map[LmemAccess(np.dtype(np.float32), direction='load') - ].eval_with_dict(params) + # TODO currently considering all local mem access stride-1 + local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32), + direction='load') + ].eval_with_dict(params) assert local_subs_l == n*m*l*2 -- GitLab From eeb027a66244e8c6908c6b4085d03fd301f94db6 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 16 Oct 2016 21:21:56 -0500 Subject: [PATCH 12/55] allowing for data types entered as np.float64 and converting to np.dtype(np.float64) --- loopy/statistics.py | 35 +++++--------- test/test_statistics.py | 102 ++++++++++++++++++++-------------------- 2 files changed, 64 insertions(+), 73 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a08e36c50..df385aa6d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -111,8 +111,13 @@ def stringify_stats_mapping(m): class Op: def __init__(self, dtype, name): - self.dtype = dtype self.name = name + import numpy as np + if issubclass(type(dtype), type): + self.dtype = np.dtype(dtype) + else: + self.dtype = dtype + #TODO should this check be more robust? 
def __eq__(self, other): return isinstance(other, Op) and ( @@ -122,27 +127,7 @@ class Op: def __hash__(self): return hash(str(self.dtype)+self.name) -''' -class LmemAccess: - - def __init__(self, dtype, direction=None): - self.dtype = dtype - self.direction = direction - - def __eq__(self, other): - return isinstance(other, LmemAccess) and ( - other.dtype == self.dtype and - other.direction == self.direction) - - def __hash__(self): - direction = self.direction - if direction == None: - direction = 'None' - return hash(str(self.dtype)+direction) -''' - -#class StridedGmemAccess: class MemAccess: #TODO "ANY_VAR" does not work yet @@ -150,11 +135,17 @@ class MemAccess: def __init__(self, mtype, dtype, stride=1, direction=None, variable='ANY_VAR'): self.mtype = mtype - self.dtype = dtype self.stride = stride self.direction = direction self.variable = variable + import numpy as np + if issubclass(type(dtype), type): + self.dtype = np.dtype(dtype) + else: + self.dtype = dtype + #TODO should this check be more robust? 
+ def __eq__(self, other): return isinstance(other, MemAccess) and ( other.mtype == self.mtype and diff --git a/test/test_statistics.py b/test/test_statistics.py index 54e3b69a8..4a83092cd 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -60,9 +60,9 @@ def test_op_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params) + f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) + f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) + f32div = poly[Op(np.float32, 'div')].eval_with_dict(params) f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l @@ -85,7 +85,7 @@ def test_op_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l @@ -107,8 +107,8 @@ def test_op_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) + f64add = poly[Op(np.float64, 'add')].eval_with_dict(params) f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params) i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m @@ -137,10 +137,10 @@ def test_op_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) - f32div = 
poly[Op(np.dtype(np.float32), 'div')].eval_with_dict(params) - f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) - f64pow = poly[Op(np.dtype(np.float64), 'pow')].eval_with_dict(params) + f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) + f32div = poly[Op(np.float32, 'div')].eval_with_dict(params) + f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) + f64pow = poly[Op(np.float64, 'pow')].eval_with_dict(params) f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params) i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) @@ -173,8 +173,8 @@ def test_op_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) - i32bw = poly[Op(np.dtype(np.int32), 'bw')].eval_with_dict(params) + i32add = poly[Op(np.int32, 'add')].eval_with_dict(params) + i32bw = poly[Op(np.int32, 'bw')].eval_with_dict(params) i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params) @@ -207,7 +207,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = lp.get_op_poly(knl)[Op(np.dtype(np.float64), 'mul')] + poly = lp.get_op_poly(knl)[Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -236,16 +236,16 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.dtype(np.float32), + f32 += poly[MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[MemAccess('global', 
np.dtype(np.float64), + f64 = poly[MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 += poly[MemAccess('global', np.dtype(np.float64), + f64 += poly[MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 3*n*m*l @@ -276,10 +276,10 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.dtype(np.float32), + f32 += poly[MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) assert f32 == 2*n*m*l @@ -307,16 +307,16 @@ def test_gmem_access_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[MemAccess('global', np.float32, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.dtype(np.float64), + f64 = poly[MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m - f64 = poly[MemAccess('global', np.dtype(np.float64), + f64 = poly[MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f64 == n*m @@ -341,10 +341,10 @@ def test_gmem_access_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.dtype(np.float32), + f32 += poly[MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) f64 = poly[MemAccess('global', np.dtype(np.float64), @@ -356,10 +356,10 @@ def 
test_gmem_access_counter_specialops(): assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[MemAccess('global', np.float32, stride=0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.dtype(np.float64), + f64 = poly[MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l @@ -388,13 +388,13 @@ def test_gmem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[MemAccess('global', np.dtype(np.int32), + i32 = poly[MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.dtype(np.int32), + i32 += poly[MemAccess('global', np.int32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.dtype(np.int32), + i32 += poly[MemAccess('global', np.int32, stride=0, direction='load', variable='g') ].eval_with_dict(params) i32 += poly[MemAccess('global', np.dtype(np.int32), @@ -402,10 +402,10 @@ def test_gmem_access_counter_bitwise(): ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[MemAccess('global', np.dtype(np.int32), + i32 = poly[MemAccess('global', np.int32, stride=0, direction='store', variable='c') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.dtype(np.int32), + i32 += poly[MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -434,13 +434,13 @@ def test_gmem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[MemAccess('global', np.dtype(np.float64), + f64uniform = poly[MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64uniform += poly[MemAccess('global', np.dtype(np.float64), + f64uniform += poly[MemAccess('global', np.float64, stride=0, direction='load', 
variable='h') ].eval_with_dict(params) - f32uniform = poly[MemAccess('global', np.dtype(np.float32), + f32uniform = poly[MemAccess('global', np.float32, stride=0, direction='load', variable='x') ].eval_with_dict(params) f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), @@ -455,10 +455,10 @@ def test_gmem_access_counter_mixed(): assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[MemAccess('global', np.dtype(np.float64), + f64uniform = poly[MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec = poly[MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) @@ -487,11 +487,11 @@ def test_gmem_access_counter_nonconsec(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), + f64nonconsec = poly[MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g') ].eval_with_dict(params) - f64nonconsec += poly[MemAccess('global', np.dtype(np.float64), + f64nonconsec += poly[MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h') ].eval_with_dict(params) @@ -506,11 +506,11 @@ def test_gmem_access_counter_nonconsec(): assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[MemAccess('global', np.dtype(np.float64), + f64nonconsec = poly[MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec = poly[MemAccess('global', np.float32, stride=Variable('m')*Variable('l'), direction='store', variable='c') ].eval_with_dict(params) @@ -539,13 +539,13 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = poly[MemAccess('global', np.dtype(np.float64), + 
f64consec = poly[MemAccess('global', np.float64, stride=1, direction='load', variable='g') ].eval_with_dict(params) - f64consec += poly[MemAccess('global', np.dtype(np.float64), + f64consec += poly[MemAccess('global', np.float64, stride=1, direction='load', variable='h') ].eval_with_dict(params) - f32consec = poly[MemAccess('global', np.dtype(np.float32), + f32consec = poly[MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) f32consec += poly[MemAccess('global', np.dtype(np.float32), @@ -554,10 +554,10 @@ def test_gmem_access_counter_consec(): assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[MemAccess('global', np.dtype(np.float64), + f64consec = poly[MemAccess('global', np.float64, stride=1, direction='store', variable='e') ].eval_with_dict(params) - f32consec = poly[MemAccess('global', np.dtype(np.float32), + f32consec = poly[MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f64consec == n*m @@ -644,13 +644,13 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_poly(knl) f32mul = op_map[ - Op(np.dtype(np.float32), 'mul') + Op(np.float32, 'mul') ].eval_with_dict(params) f32add = op_map[ - Op(np.dtype(np.float32), 'add') + Op(np.float32, 'add') ].eval_with_dict(params) i32ops = op_map[ - Op(np.dtype(np.int32), 'add') + Op(np.int32, 'add') ].eval_with_dict(params) i32ops += op_map[ Op(np.dtype(np.int32), 'mul') @@ -660,16 +660,16 @@ def test_all_counters_parallel_matmul(): subscript_map = lp.get_gmem_access_poly(knl) - f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), + f32coal = subscript_map[MemAccess('global', np.float32, stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += subscript_map[MemAccess('global', np.dtype(np.float32), + f32coal += subscript_map[MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) assert f32coal == n*m+m*l - 
f32coal = subscript_map[MemAccess('global', np.dtype(np.float32), + f32coal = subscript_map[MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) -- GitLab From 2db20d6763bcfa2d400e08283accd53f2a76caaa Mon Sep 17 00:00:00 2001 From: James Stevens Date: Mon, 17 Oct 2016 20:27:39 -0500 Subject: [PATCH 13/55] combined get_lmem_access_poly and get_gmem_access_poly into get_mem_access_poly --- loopy/__init__.py | 4 +- loopy/statistics.py | 145 ++++++++++++++-------------------------- test/test_statistics.py | 23 +++---- 3 files changed, 65 insertions(+), 107 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 897567444..4cfa23fa4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -112,7 +112,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, - get_gmem_access_poly, + get_mem_access_poly, get_gmem_access_poly, get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping, sum_mem_access_to_bytes, gather_access_footprints, gather_access_footprint_bytes) @@ -218,7 +218,7 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", - "get_DRAM_access_poly", + "get_mem_access_poly", "get_DRAM_access_poly", "get_synchronization_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index df385aa6d..24ba905a5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -837,77 +837,19 @@ def sum_ops_to_dtypes(op_poly_dict): return result -def get_lmem_access_poly(knl, numpy_types=True): - +def get_lmem_access_poly(knl): """Count the number of local memory accesses in a loopy kernel. 
""" - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - - class CacheHolder(object): - pass - - cache_holder = CacheHolder() - - @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_inames): - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) - return count(knl, domain) - - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - - subs_poly = ToCountMap() - subscript_counter = LocalSubscriptCounter(knl) - for insn in knl.instructions: - # count subscripts, distinguishing loads and stores - subs_expr = subscript_counter(insn.expression) - for key in subs_expr.dict: - subs_expr.dict[MemAccess('local', - key.dtype, direction='load') - ] = subs_expr.dict.pop(key) - subs_assignee = subscript_counter(insn.assignee) - #for key in subs_assignee.dict: - # print(key.dtype, key.direction, subs_assignee.dict[key]) - - # for now, not counting stores in local mem - ''' - for key in subs_assignee.dict: - subs_assignee.dict[MemAccess('local', - key.dtype, direction='store') - ] = subs_assignee.dict.pop(key) - ''' - - insn_inames = knl.insn_inames(insn) - - # use count excluding local index tags for uniform accesses - for key in subs_expr.dict: - poly = ToCountMap({key: subs_expr.dict[key]}) - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) - - # for now, not counting stores in local mem - ''' - for key in subs_assignee.dict: - poly = ToCountMap({key: subs_assignee.dict[key]}) - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) - ''' - - #return subs_poly.dict - result = subs_poly.dict - - if numpy_types: - result = dict((MemAccess('local', mem_access.dtype.numpy_dtype, - direction=mem_access.direction), count) - for mem_access, count in six.iteritems(result)) - - return result + from warnings import warn + warn("get_lmem_access_poly is deprecated. 
" + "Use get_mem_access_poly with local option instead", + DeprecationWarning, stacklevel=2) + return get_mem_access_poly(knl, 'local') # {{{ get_gmem_access_poly -def get_gmem_access_poly(knl, numpy_types=True): +def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be @@ -953,7 +895,23 @@ def get_gmem_access_poly(knl, numpy_types=True): # (now use these counts to predict performance) """ + from warnings import warn + warn("get_gmem_access_poly is deprecated. " + "Use get_mem_access_poly with global option instead", + DeprecationWarning, stacklevel=2) + return get_mem_access_poly(knl, 'global') + +def get_DRAM_access_poly(knl): + from warnings import warn + warn("get_DRAM_access_poly is deprecated. " + "Use get_mem_access_poly with global option instead", + DeprecationWarning, stacklevel=2) + return get_mem_access_poly(knl, 'global') + +# }}} + +def get_mem_access_poly(knl, mtype, numpy_types=True): from loopy.preprocess import preprocess_kernel, infer_unknown_types class CacheHolder(object): @@ -977,43 +935,55 @@ def get_gmem_access_poly(knl, numpy_types=True): knl = preprocess_kernel(knl) subs_poly = ToCountMap() - subscript_counter = GlobalSubscriptCounter(knl) + if mtype == 'global': + subscript_counter = GlobalSubscriptCounter(knl) + elif mtype == 'local': + subscript_counter = LocalSubscriptCounter(knl) + else: + raise ValueError("get_mem_access_poly: mtype must be " + "'local' or 'global', received {0}" + .format(mtype)) + for insn in knl.instructions: # count subscripts, distinguishing loads and stores subs_expr = subscript_counter(insn.expression) for key in subs_expr.dict: - subs_expr.dict[MemAccess('global', key.dtype, stride=key.stride, + subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride, direction='load', variable=key.variable) ] = subs_expr.dict.pop(key) - subs_assignee = subscript_counter(insn.assignee) - for key in 
subs_assignee.dict: - subs_assignee.dict[MemAccess('global', key.dtype, - stride=key.stride, direction='store', - variable=key.variable) - ] = subs_assignee.dict.pop(key) + + if mtype == 'global': # for now, don't count writes to local mem + subs_assignee = subscript_counter(insn.assignee) + for key in subs_assignee.dict: + subs_assignee.dict[MemAccess(key.mtype, key.dtype, + stride=key.stride, direction='store', + variable=key.variable) + ] = subs_assignee.dict.pop(key) insn_inames = knl.insn_inames(insn) # use count excluding local index tags for uniform accesses for key in subs_expr.dict: poly = ToCountMap({key: subs_expr.dict[key]}) - if isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly \ - + poly*get_insn_count(knl, insn_inames, True) - else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) - for key in subs_assignee.dict: - poly = ToCountMap({key: subs_assignee.dict[key]}) - if isinstance(key.stride, int) and key.stride == 0: + if mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: subs_poly = subs_poly \ + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + if mtype == 'global': # for now, don't count writes to local mem + for key in subs_assignee.dict: + poly = ToCountMap({key: subs_assignee.dict[key]}) + if isinstance(key.stride, int) and key.stride == 0: + subs_poly = subs_poly \ + + poly*get_insn_count(knl, insn_inames, True) + else: + subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + result = subs_poly.dict if numpy_types: - result = dict((MemAccess('global', mem_access.dtype.numpy_dtype, + result = dict((MemAccess(mem_access.mtype, mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable) @@ -1022,17 +992,6 @@ def get_gmem_access_poly(knl, numpy_types=True): return result - -def get_DRAM_access_poly(knl): - from warnings import warn - warn("get_DRAM_access_poly 
is deprecated. " - "Use get_gmem_access_poly instead", - DeprecationWarning, stacklevel=2) - return get_gmem_access_poly(knl) - -# }}} - - # {{{ sum_mem_access_to_bytes def sum_mem_access_to_bytes(m): diff --git a/test/test_statistics.py b/test/test_statistics.py index 4a83092cd..504f15403 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -30,8 +30,7 @@ from pyopencl.tools import ( # noqa #TODO why is this import required? from loopy.statistics import ( get_op_poly, - get_gmem_access_poly, - get_lmem_access_poly, + get_mem_access_poly, get_synchronization_poly, MemAccess, Op) @@ -231,7 +230,7 @@ def test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -271,7 +270,7 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -302,7 +301,7 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -336,7 +335,7 @@ def test_gmem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -383,7 +382,7 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -429,7 +428,7 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", 
threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = lp.get_gmem_access_poly(knl) # noqa + poly = lp.get_mem_access_poly(knl, 'global') # noqa n = 512 m = 256 l = 128 @@ -482,7 +481,7 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = lp.get_gmem_access_poly(knl) # noqa + poly = lp.get_mem_access_poly(knl, 'global') # noqa n = 512 m = 256 l = 128 @@ -533,7 +532,7 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = lp.get_gmem_access_poly(knl) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 @@ -658,7 +657,7 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - subscript_map = lp.get_gmem_access_poly(knl) + subscript_map = lp.get_mem_access_poly(knl, 'global') f32coal = subscript_map[MemAccess('global', np.float32, stride=1, direction='load', variable='b') @@ -675,7 +674,7 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_subs_map = get_lmem_access_poly(knl) + local_subs_map = get_mem_access_poly(knl, 'local') # TODO currently considering all local mem access stride-1 local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32), -- GitLab From b72b7ab3c2e421b67cc61a3467148149d7b263a8 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sat, 22 Oct 2016 21:30:31 -0500 Subject: [PATCH 14/55] added new stats functions to __init__ file so no need for import, updated test --- loopy/__init__.py | 16 ++-- loopy/statistics.py | 19 ++--- test/test_statistics.py | 163 +++++++++++++++++++--------------------- 3 files changed, 93 insertions(+), 105 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 63268a214..a644fdf53 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -112,10 +112,10 @@ from loopy.transform.parameter import assume, 
fix_parameters from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, - get_mem_access_poly, get_gmem_access_poly, - get_DRAM_access_poly, get_synchronization_poly, stringify_stats_mapping, - sum_mem_access_to_bytes, +from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, + get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, + get_gmem_access_poly, get_DRAM_access_poly, get_mem_access_poly, + sum_mem_access_to_bytes, get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -219,10 +219,10 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly", - "get_mem_access_poly", "get_DRAM_access_poly", - "get_synchronization_poly", "stringify_stats_mapping", - "sum_mem_access_to_bytes", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", + "sum_ops_to_dtypes", "get_lmem_access_poly", "get_gmem_access_poly", + "get_DRAM_access_poly", "get_mem_access_poly", + "sum_mem_access_to_bytes", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 24ba905a5..c349b34f4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -25,6 +25,7 @@ THE SOFTWARE. import six import loopy as lp +import numpy as np import warnings from islpy import dim_type import islpy as isl @@ -112,12 +113,8 @@ class Op: def __init__(self, dtype, name): self.name = name - import numpy as np - if issubclass(type(dtype), type): - self.dtype = np.dtype(dtype) - else: - self.dtype = dtype - #TODO should this check be more robust? 
+ from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) def __eq__(self, other): return isinstance(other, Op) and ( @@ -139,12 +136,8 @@ class MemAccess: self.direction = direction self.variable = variable - import numpy as np - if issubclass(type(dtype), type): - self.dtype = np.dtype(dtype) - else: - self.dtype = dtype - #TODO should this check be more robust? + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) def __eq__(self, other): return isinstance(other, MemAccess) and ( @@ -827,6 +820,7 @@ def get_op_poly(knl, numpy_types=True): def sum_ops_to_dtypes(op_poly_dict): result = {} + #TODO fix this for (dtype, kind), v in op_poly_dict.items(): new_key = dtype if new_key in result: @@ -1004,6 +998,7 @@ def sum_mem_access_to_bytes(m): """ result = {} + #TODO fix this and test for (dtype, kind, direction), v in m.items(): new_key = (kind, direction) bytes_transferred = int(dtype.itemsize) * v diff --git a/test/test_statistics.py b/test/test_statistics.py index 504f15403..a7f061c2a 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -27,13 +27,6 @@ import sys from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) -#TODO why is this import required? 
-from loopy.statistics import ( - get_op_poly, - get_mem_access_poly, - get_synchronization_poly, - MemAccess, - Op) import loopy as lp import numpy as np @@ -59,11 +52,11 @@ def test_op_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) - f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) - f32div = poly[Op(np.float32, 'div')].eval_with_dict(params) - f64mul = poly[Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params) + f64mul = poly[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 @@ -84,8 +77,8 @@ def test_op_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) - f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l @@ -106,10 +99,10 @@ def test_op_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) - f64add = poly[Op(np.float64, 'add')].eval_with_dict(params) - f64div = poly[Op(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f64add = poly[lp.Op(np.float64, 'add')].eval_with_dict(params) + f64div = poly[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add 
= poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? assert f64add == n*m @@ -136,14 +129,14 @@ def test_op_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[Op(np.float32, 'mul')].eval_with_dict(params) - f32div = poly[Op(np.float32, 'div')].eval_with_dict(params) - f32add = poly[Op(np.float32, 'add')].eval_with_dict(params) - f64pow = poly[Op(np.float64, 'pow')].eval_with_dict(params) - f64add = poly[Op(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[Op(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsq = poly[Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params) + f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) + f64pow = poly[lp.Op(np.float64, 'pow')].eval_with_dict(params) + f64add = poly[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = poly[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = poly[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m @@ -172,12 +165,12 @@ def test_op_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[Op(np.int32, 'add')].eval_with_dict(params) - i32bw = poly[Op(np.int32, 'bw')].eval_with_dict(params) - i64bw = poly[Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[Op(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = poly[lp.Op(np.int32, 
'add')].eval_with_dict(params) + i32bw = poly[lp.Op(np.int32, 'bw')].eval_with_dict(params) + i64bw = poly[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = poly[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = poly[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = poly[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -206,7 +199,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = lp.get_op_poly(knl)[Op(np.float64, 'mul')] + poly = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = poly.eval_with_dict(value_dict) @@ -235,25 +228,25 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.float32, + f32 = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.float32, + f32 += poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.float64, + f64 = poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 += poly[MemAccess('global', np.float64, + f64 += poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 3*n*m*l assert f64 == 2*n*m - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.dtype(np.float64), + f64 = poly[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l @@ -275,15 +268,15 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = 
{'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.float32, + f32 = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.float32, + f32 += poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) assert f32 == 2*n*m*l - f32 = poly[MemAccess('global', np.dtype(np.float32), + f32 = poly[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) assert f32 == n*l @@ -306,16 +299,16 @@ def test_gmem_access_counter_logic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.float32, + f32 = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.float64, + f64 = poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m - f64 = poly[MemAccess('global', np.float64, + f64 = poly[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f64 == n*m @@ -340,25 +333,25 @@ def test_gmem_access_counter_specialops(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[MemAccess('global', np.float32, + f32 = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[MemAccess('global', np.float32, + f32 += poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.dtype(np.float64), + f64 = poly[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 += poly[MemAccess('global', np.dtype(np.float64), + f64 += poly[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h') 
].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[MemAccess('global', np.float32, + f32 = poly[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[MemAccess('global', np.float64, + f64 = poly[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l @@ -387,24 +380,24 @@ def test_gmem_access_counter_bitwise(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[MemAccess('global', np.int32, + i32 = poly[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.int32, + i32 += poly[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.int32, + i32 += poly[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.dtype(np.int32), + i32 += poly[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[MemAccess('global', np.int32, + i32 = poly[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') ].eval_with_dict(params) - i32 += poly[MemAccess('global', np.int32, + i32 += poly[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -433,20 +426,20 @@ def test_gmem_access_counter_mixed(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[MemAccess('global', np.float64, + f64uniform = poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64uniform += poly[MemAccess('global', np.float64, + f64uniform += poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) 
- f32uniform = poly[MemAccess('global', np.float32, + f32uniform = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a') ].eval_with_dict(params) - f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) @@ -454,10 +447,10 @@ def test_gmem_access_counter_mixed(): assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[MemAccess('global', np.float64, + f64uniform = poly[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.float32, + f32nonconsec = poly[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) @@ -486,30 +479,30 @@ def test_gmem_access_counter_nonconsec(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = poly[MemAccess('global', np.float64, + f64nonconsec = poly[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g') ].eval_with_dict(params) - f64nonconsec += poly[MemAccess('global', np.float64, + f64nonconsec += poly[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', variable='a') ].eval_with_dict(params) - f32nonconsec += poly[MemAccess('global', np.dtype(np.float32), + f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', 
variable='b') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[MemAccess('global', np.float64, + f64nonconsec = poly[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[MemAccess('global', np.float32, + f32nonconsec = poly[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('l'), direction='store', variable='c') ].eval_with_dict(params) @@ -538,25 +531,25 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} - f64consec = poly[MemAccess('global', np.float64, + f64consec = poly[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g') ].eval_with_dict(params) - f64consec += poly[MemAccess('global', np.float64, + f64consec += poly[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h') ].eval_with_dict(params) - f32consec = poly[MemAccess('global', np.float32, + f32consec = poly[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) - f32consec += poly[MemAccess('global', np.dtype(np.float32), + f32consec += poly[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[MemAccess('global', np.float64, + f64consec = poly[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e') ].eval_with_dict(params) - f32consec = poly[MemAccess('global', np.float32, + f32consec = poly[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f64consec == n*m @@ -643,41 +636,41 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_poly(knl) f32mul = op_map[ - Op(np.float32, 'mul') + lp.Op(np.float32, 'mul') ].eval_with_dict(params) f32add = op_map[ - Op(np.float32, 'add') + lp.Op(np.float32, 'add') 
].eval_with_dict(params) i32ops = op_map[ - Op(np.int32, 'add') + lp.Op(np.int32, 'add') ].eval_with_dict(params) i32ops += op_map[ - Op(np.dtype(np.int32), 'mul') + lp.Op(np.dtype(np.int32), 'mul') ].eval_with_dict(params) assert f32mul+f32add == n*m*l*2 subscript_map = lp.get_mem_access_poly(knl, 'global') - f32coal = subscript_map[MemAccess('global', np.float32, + f32coal = subscript_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += subscript_map[MemAccess('global', np.float32, + f32coal += subscript_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) assert f32coal == n*m+m*l - f32coal = subscript_map[MemAccess('global', np.float32, + f32coal = subscript_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f32coal == n*l - local_subs_map = get_mem_access_poly(knl, 'local') + local_subs_map = lp.get_mem_access_poly(knl, 'local') # TODO currently considering all local mem access stride-1 - local_subs_l = local_subs_map[MemAccess('local', np.dtype(np.float32), + local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) -- GitLab From 1f079851255bf9b555d6d3446d1c4f015c69e1ff Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 23 Oct 2016 22:32:19 -0500 Subject: [PATCH 15/55] updated old versions of sum_xxx for new dict keys --- loopy/statistics.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index c349b34f4..7702b8995 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -820,9 +820,9 @@ def get_op_poly(knl, numpy_types=True): def sum_ops_to_dtypes(op_poly_dict): result = {} - #TODO fix this - for (dtype, kind), v in op_poly_dict.items(): - new_key = dtype + #TODO test this + for op, v in op_poly_dict.items(): + new_key = op.dtype if 
new_key in result: result[new_key] += v else: @@ -998,10 +998,10 @@ def sum_mem_access_to_bytes(m): """ result = {} - #TODO fix this and test - for (dtype, kind, direction), v in m.items(): - new_key = (kind, direction) - bytes_transferred = int(dtype.itemsize) * v + #TODO test this + for mem_access, v in m.items(): + new_key = (mem_access.stride, mem_access.direction) + bytes_transferred = int(mem_access.dtype.itemsize) * v if new_key in result: result[new_key] += bytes_transferred else: -- GitLab From d6b72e3b1df925af50642f74d7daab790fbc9028 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Mon, 24 Oct 2016 00:50:46 -0500 Subject: [PATCH 16/55] added tests for existing sum_xxx functions --- test/test_statistics.py | 55 ++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index a7f061c2a..eaac2081e 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -28,6 +28,7 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) import loopy as lp +from loopy.types import to_loopy_type import numpy as np from pymbolic.primitives import Variable @@ -61,6 +62,14 @@ def test_op_counter_basic(): assert f64mul == n*m assert i32add == n*m*2 + poly_dtype = lp.sum_ops_to_dtypes(poly) + f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) + f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params) + i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params) + assert f32 == f32add + f32mul + f32div + assert f64 == f64mul + assert i32 == i32add + def test_op_counter_reduction(): @@ -81,6 +90,10 @@ def test_op_counter_reduction(): f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l + poly_dtype = lp.sum_ops_to_dtypes(poly) + f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) + assert f32 == f32add + f32mul + def 
test_op_counter_logic(): @@ -228,29 +241,35 @@ def test_gmem_access_counter_basic(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[lp.MemAccess('global', np.float32, + f32l = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[lp.MemAccess('global', np.float32, + f32l += poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[lp.MemAccess('global', np.float64, + f64l = poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 += poly[lp.MemAccess('global', np.float64, + f64l += poly[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert f32 == 3*n*m*l - assert f64 == 2*n*m + assert f32l == 3*n*m*l + assert f64l == 2*n*m - f32 = poly[lp.MemAccess('global', np.dtype(np.float32), + f32s = poly[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - f64 = poly[lp.MemAccess('global', np.dtype(np.float64), + f64s = poly[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert f32 == n*m*l - assert f64 == n*m + assert f32s == n*m*l + assert f64s == n*m + + poly_b = lp.sum_mem_access_to_bytes(poly) + s0load = poly_b[(0, 'load')].eval_with_dict(params) + s0store = poly_b[(0, 'store')].eval_with_dict(params) + assert s0load == 4*f32l + 8*f64l + assert s0store == 4*f32s + 8*f64s def test_gmem_access_counter_reduction(): @@ -268,18 +287,24 @@ def test_gmem_access_counter_reduction(): m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[lp.MemAccess('global', np.float32, + f32l = poly[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[lp.MemAccess('global', np.float32, + f32l += poly[lp.MemAccess('global', np.float32, 
stride=0, direction='load', variable='b') ].eval_with_dict(params) - assert f32 == 2*n*m*l + assert f32l == 2*n*m*l - f32 = poly[lp.MemAccess('global', np.dtype(np.float32), + f32s = poly[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - assert f32 == n*l + assert f32s == n*l + + poly_b = lp.sum_mem_access_to_bytes(poly) + s0load = poly_b[(0, 'load')].eval_with_dict(params) + s0store = poly_b[(0, 'store')].eval_with_dict(params) + assert s0load == 4*f32l + assert s0store == 4*f32s def test_gmem_access_counter_logic(): -- GitLab From 8fc4268b4cf0699fc10a36b1d060fdf0b61e3061 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Mon, 24 Oct 2016 00:55:51 -0500 Subject: [PATCH 17/55] added mtype to keys in sum_mem_access_to_bytes --- loopy/statistics.py | 4 +--- test/test_statistics.py | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7702b8995..f3346572e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -820,7 +820,6 @@ def get_op_poly(knl, numpy_types=True): def sum_ops_to_dtypes(op_poly_dict): result = {} - #TODO test this for op, v in op_poly_dict.items(): new_key = op.dtype if new_key in result: @@ -998,9 +997,8 @@ def sum_mem_access_to_bytes(m): """ result = {} - #TODO test this for mem_access, v in m.items(): - new_key = (mem_access.stride, mem_access.direction) + new_key = (mem_access.mtype, mem_access.stride, mem_access.direction) bytes_transferred = int(mem_access.dtype.itemsize) * v if new_key in result: result[new_key] += bytes_transferred diff --git a/test/test_statistics.py b/test/test_statistics.py index eaac2081e..feda05125 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -266,8 +266,8 @@ def test_gmem_access_counter_basic(): assert f64s == n*m poly_b = lp.sum_mem_access_to_bytes(poly) - s0load = poly_b[(0, 'load')].eval_with_dict(params) - s0store = poly_b[(0, 
'store')].eval_with_dict(params) + s0load = poly_b[('global', 0, 'load')].eval_with_dict(params) + s0store = poly_b[('global', 0, 'store')].eval_with_dict(params) assert s0load == 4*f32l + 8*f64l assert s0store == 4*f32s + 8*f64s @@ -301,8 +301,8 @@ def test_gmem_access_counter_reduction(): assert f32s == n*l poly_b = lp.sum_mem_access_to_bytes(poly) - s0load = poly_b[(0, 'load')].eval_with_dict(params) - s0store = poly_b[(0, 'store')].eval_with_dict(params) + s0load = poly_b[('global', 0, 'load')].eval_with_dict(params) + s0store = poly_b[('global', 0, 'store')].eval_with_dict(params) assert s0load == 4*f32l assert s0store == 4*f32s -- GitLab From 4552b861c2b387191efb710f74a498c48f490c2d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 25 Oct 2016 20:54:46 -0500 Subject: [PATCH 18/55] updated old doc strings and added new doc strings --- loopy/__init__.py | 6 +- loopy/statistics.py | 234 +++++++++++++++++++++++++++++++++----------- 2 files changed, 178 insertions(+), 62 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a644fdf53..340aec051 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -114,7 +114,7 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, - get_gmem_access_poly, get_DRAM_access_poly, get_mem_access_poly, + get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, sum_mem_access_to_bytes, get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -220,8 +220,8 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", - "sum_ops_to_dtypes", "get_lmem_access_poly", "get_gmem_access_poly", - "get_DRAM_access_poly", "get_mem_access_poly", + "sum_ops_to_dtypes", 
"get_lmem_access_poly", "get_DRAM_access_poly", + "get_gmem_access_poly", "get_mem_access_poly", "sum_mem_access_to_bytes", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index f3346572e..fd9863eb2 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -42,7 +42,10 @@ __doc__ = """ .. autofunction:: get_op_poly +.. autofunction:: get_lmem_access_poly +.. autofunction:: get_DRAM_access_poly .. autofunction:: get_gmem_access_poly +.. autofunction:: get_mem_access_poly .. autofunction:: sum_mem_access_to_bytes @@ -110,6 +113,19 @@ def stringify_stats_mapping(m): class Op: + """An arithmetic operation + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. + + .. attribute:: name + + A :class:`string` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + + """ def __init__(self, dtype, name): self.name = name @@ -126,6 +142,34 @@ class Op: class MemAccess: + """A memory access + + .. attribute:: mtype + + A :class:`string` that specifies the memory type accessed as **global** + or **local** + + .. attribute:: dtype + + A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. + + .. attribute:: stride + + A :class:`int` specifies stride of the memory access. A stride of 0 + indicates a uniform access (i.e. all threads access the same item). + + .. attribute:: direction + + A :class:`string` that specifies the direction of memory access as + **load** or **store**. + + .. attribute:: variable + + A :class:`string` that specifies the variable name of the data + accessed. + + """ #TODO "ANY_VAR" does not work yet #TODO currently counting all lmem access as stride-1 @@ -764,16 +808,14 @@ def get_op_poly(knl, numpy_types=True): :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. 
- :return: A mapping of **{(** *type* **,** :class:`string` **)** - **:** :class:`islpy.PwQPolynomial` **}**. + :parameter numpy_types: A :class:`boolean` specifying whether the types + in the returned mapping should be numpy types + instead of :class:'loopy.LoopyType`. - - The *type* specifies the type of the data being - accessed. This can be a :class:`numpy.dtype` if - *numpy_types* is True, otherwise the internal - loopy type. + :return: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. - - The string specifies the operation type as - *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + - The :class:`loopy.Op` specifies an arithmetic operation with + specific characteristics. - The :class:`islpy.PwQPolynomial` holds the number of operations of the kind specified in the key (in terms of the @@ -785,8 +827,8 @@ def get_op_poly(knl, numpy_types=True): poly = get_op_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) # (now use these counts to predict performance) @@ -819,6 +861,34 @@ def get_op_poly(knl, numpy_types=True): def sum_ops_to_dtypes(op_poly_dict): + """Sum the mapping returned by :func:`get_op_poly` to a mapping that ignores arithmetic op type + + :parameter op_poly_dict: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. + + :return: A mapping of **{** :class:`loopy.LoopyType` **:** :class:`islpy.PwQPolynomial` **}** + + - The :class:`loopy.LoopyType` specifies the data type operated on + + - The :class:`islpy.PwQPolynomial` holds the number of arithmetic + operations on the data type specified (in terms of the + :class:`loopy.LoopKernel` *inames*). 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + op_map = get_op_poly(knl) + op_map_by_dtype = sum_ops_to_dtypes(op_map) + params = {'n': 512, 'm': 256, 'l': 128} + + f32ops = op_map_by_dtype[to_loopy_type(np.float32)].eval_with_dict(params) + f64ops = op_map_by_dtype[to_loopy_type(np.float64)].eval_with_dict(params) + i32ops = op_map_by_dtype[to_loopy_type(np.int32)].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + result = {} for op, v in op_poly_dict.items(): new_key = op.dtype @@ -840,71 +910,86 @@ def get_lmem_access_poly(knl): return get_mem_access_poly(knl, 'local') +def get_DRAM_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + """ + from warnings import warn + warn("get_DRAM_access_poly is deprecated. " + "Use get_mem_access_poly with global option instead", + DeprecationWarning, stacklevel=2) + return get_mem_access_poly(knl, 'global') + # {{{ get_gmem_access_poly def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. + """ + from warnings import warn + warn("get_gmem_access_poly is deprecated. " + "Use get_mem_access_poly with global option instead", + DeprecationWarning, stacklevel=2) + return get_mem_access_poly(knl, 'global') + + +# }}} + +def get_mem_access_poly(knl, mtype, numpy_types=True): + """Count the number of memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be counted. - :return: A mapping of **{(** *type* **,** :class:`string` **,** - :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. + :parameter mtype: A :class:`string` specifying the memory accesses as + *global* or *local*. - - The *type* specifies the type of the data being - accessed. This can be a :class:`numpy.dtype` if - *numpy_types* is True, otherwise the internal - loopy type. 
+ :parameter numpy_types: A :class:`boolean` specifying whether the types + in the returned mapping should be numpy types + instead of :class:'loopy.LoopyType`. - - The first string in the map key specifies the global memory - access type as - *consecutive*, *nonconsecutive*, or *uniform*. + :return: A mapping of **{** :class:`loopy.MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. - - The second string in the map key specifies the global memory - access type as a - *load*, or a *store*. + - The :class:`loopy.MemAccess` specifies the type of memory + access. - - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: # (first create loopy kernel and specify array data types) - subscript_map = get_gmem_access_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - - f32_uncoalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'nonconsecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_load = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'load') - ].eval_with_dict(params) - f32_coalesced_store = subscript_map.dict[ - (np.dtype(np.float32), 'consecutive', 'store') - ].eval_with_dict(params) + gmem_access_map = get_mem_access_poly('global', knl) + + f32_stride1_g_loads_a = gmem_access_map[MemAccess('global', np.float32, + stride=1, + direction='load', + variable='a') + ].eval_with_dict(params) + f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32, + stride=1, + direction='stores') + variable='a' + ].eval_with_dict(params) + + lmem_access_map = get_mem_access_poly('local', knl) + + f32_stride1_l_loads_x = lmem_access_map[MemAccess('local', np.float32, + stride=1, + direction='load', + variable='x') + 
].eval_with_dict(params) + f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32, + stride=1, + direction='stores', + variable='x') + ].eval_with_dict(params) # (now use these counts to predict performance) """ - from warnings import warn - warn("get_gmem_access_poly is deprecated. " - "Use get_mem_access_poly with global option instead", - DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl, 'global') - - -def get_DRAM_access_poly(knl): - from warnings import warn - warn("get_DRAM_access_poly is deprecated. " - "Use get_mem_access_poly with global option instead", - DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl, 'global') - -# }}} - -def get_mem_access_poly(knl, mtype, numpy_types=True): from loopy.preprocess import preprocess_kernel, infer_unknown_types class CacheHolder(object): @@ -988,12 +1073,43 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): # {{{ sum_mem_access_to_bytes def sum_mem_access_to_bytes(m): - """Sum the mapping returned by :func:`get_gmem_access_poly` to a mapping + """Convert counts returned by :func:`get_mem_access_poly` to bytes and sum across data types and variables + + :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. + + :return: A mapping of **{(** :class:`string`**,** :class:`int` **,** :class:`string` **)** + **:** :class:`islpy.PwQPolynomial` **}** + + - The first string in the key specifies the memory type as *global* or *local* + + - The integer in the key specifies the *stride* + + - The second string in the key specifies the direction as *load* or *store* + + - The :class:`islpy.PwQPolynomial` holds the aggregate transfer + size in bytes for memory accesses of all data types with the + characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + mem_access_map = get_mem_access_poly('global', knl) + byte_totals_map = sum_mem_access_to_bytes(mem_access_map) + params = {'n': 512, 'm': 256, 'l': 128} + + stride1_global_bytes_loaded = byte_totals_map[('global', 1, 'load') + ].eval_with_dict(params) + stride2_global_bytes_loaded = byte_totals_map[('global', 2, 'load') + ].eval_with_dict(params) + stride1_global_bytes_stored = byte_totals_map[('global', 1, 'store') + ].eval_with_dict(params) + stride2_global_bytes_stored = byte_totals_map[('global', 2, 'store') + ].eval_with_dict(params) - **{(** :class:`string` **,** :class:`string` **)** - **:** :class:`islpy.PwQPolynomial` **}** + # (now use thess counts to predict performance) - i.e., aggregate the transfer numbers for all types into a single byte count. """ result = {} @@ -1030,9 +1146,9 @@ def get_synchronization_poly(knl): # (first create loopy kernel and specify array data types) - barrier_poly = get_synchronization_poly(knl) + sync_poly = get_synchronization_poly(knl) params = {'n': 512, 'm': 256, 'l': 128} - barrier_count = barrier_poly.eval_with_dict(params) + barrier_count = sync_poly['barrier_local'].eval_with_dict(params) # (now use this count to predict performance) -- GitLab From e42ad212a75e5fd6f6eb08eeb465c5565884ce5f Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 25 Oct 2016 21:43:42 -0500 Subject: [PATCH 19/55] added sum_mem_access_across_vars for convenience --- loopy/__init__.py | 10 ++++--- loopy/statistics.py | 60 +++++++++++++++++++++++++++++++++++++++++ test/test_statistics.py | 10 +++++++ 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 340aec051..6a482cc1c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -115,8 +115,9 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, 
get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, - sum_mem_access_to_bytes, get_synchronization_poly, - gather_access_footprints, gather_access_footprint_bytes) + sum_mem_access_to_bytes, sum_mem_access_across_vars, + get_synchronization_poly, gather_access_footprints, + gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -222,8 +223,9 @@ __all__ = [ "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", - "sum_mem_access_to_bytes", "get_synchronization_poly", - "gather_access_footprints", "gather_access_footprint_bytes", + "sum_mem_access_to_bytes", "sum_mem_access_across_vars", + "get_synchronization_poly", "gather_access_footprints", + "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index fd9863eb2..f27f26f3e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1125,6 +1125,66 @@ def sum_mem_access_to_bytes(m): # }}} +# {{{ sum_mem_access_across_vars + +def sum_mem_access_across_vars(m): + """Remove variable name divisions in mapping returned by :func:`get_mem_access_poly` + + :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. + + :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** + + - The **variable** attribute in the keys of the returned mapping is set to 'ANY_VAR' + + - The :class:`islpy.PwQPolynomial` holds the aggregate transfer + size in bytes for memory accesses of all data types with the + characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + gmem_access_map = get_mem_access_poly('global', knl) + gmem_acrossvars = sum_mem_access_across_vars(gmem_access_map) + + f32_stride1_g_loads = gmem_acrossvars[MemAccess('global', np.float32, + stride=1, + direction='load') # do not specify variable + ].eval_with_dict(params) + f32_stride1_g_stores = gmem_acrossvars[MemAccess('global', np.float32, + stride=1, + direction='store') # do not specify variable + ].eval_with_dict(params) + + lmem_access_map = get_mem_access_poly('local', knl) + lmem_acrossvars = sum_mem_access_across_vars(lmem_access_map) + + f32_stride1_l_loads = lmem_acrossvars[MemAccess('local', np.float32, + stride=1, + direction='load') # do not specify variable + ].eval_with_dict(params) + f32_stride1_l_stores = lmem_acrossvars[MemAccess('local', np.float32, + stride=1, + direction='store') # do not specify variable + ].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result = {} + for mem_access, v in m.items(): + new_key = MemAccess(mem_access.mtype, mem_access.dtype, mem_access.stride, mem_access.direction) + if new_key in result: + result[new_key] += m[mem_access] + else: + result[new_key] = m[mem_access] + + return result + +# }}} # {{{ get_synchronization_poly diff --git a/test/test_statistics.py b/test/test_statistics.py index feda05125..5629a0702 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -271,6 +271,16 @@ def test_gmem_access_counter_basic(): assert s0load == 4*f32l + 8*f64l assert s0store == 4*f32s + 8*f64s + poly_c = lp.sum_mem_access_across_vars(poly) + f32lall = poly_c[lp.MemAccess('global', np.float32, + stride=0, direction='load') + ].eval_with_dict(params) + f64lall = poly_c[lp.MemAccess('global', np.float64, + stride=0, direction='load') + ].eval_with_dict(params) + assert f32lall == 3*n*m*l + assert f64lall == 2*n*m + def 
test_gmem_access_counter_reduction(): -- GitLab From 38f040cf4f244bf3848d0d8aa18d0c1f8ed9cb63 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 26 Oct 2016 03:16:07 -0500 Subject: [PATCH 20/55] added ignore_vars option to get_mem_access_poly --- loopy/statistics.py | 26 +++++++++++++++----------- test/test_statistics.py | 8 ++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f27f26f3e..ab68c4253 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -171,10 +171,8 @@ class MemAccess: """ - #TODO "ANY_VAR" does not work yet #TODO currently counting all lmem access as stride-1 - def __init__(self, mtype, dtype, stride=1, direction=None, - variable='ANY_VAR'): + def __init__(self, mtype, dtype, stride=1, direction=None, variable=None): self.mtype = mtype self.stride = stride self.direction = direction @@ -189,7 +187,7 @@ class MemAccess: other.dtype == self.dtype and other.stride == self.stride and other.direction == self.direction and - ((self.variable == 'ANY_VAR' or other.variable == 'ANY_VAR') or + ((self.variable == None or other.variable == None) or self.variable == other.variable)) def __hash__(self): @@ -198,7 +196,7 @@ class MemAccess: if direction == None: direction = 'None' if variable == None: - variable = 'ANY_VAR' + variable = 'None' return hash(str(self.mtype)+str(self.dtype)+str(self.stride) +direction+variable) @@ -933,7 +931,7 @@ def get_gmem_access_poly(knl): # }}} -def get_mem_access_poly(knl, mtype, numpy_types=True): +def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False): """Count the number of memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be @@ -946,6 +944,9 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): in the returned mapping should be numpy types instead of :class:'loopy.LoopyType`. 
+ :parameter ignore_vars: A :class:`boolean` specifying whether to separate + memory accesses by variable name. + :return: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -970,8 +971,8 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): ].eval_with_dict(params) f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32, stride=1, - direction='stores') - variable='a' + direction='store', + variable='a') ].eval_with_dict(params) lmem_access_map = get_mem_access_poly('local', knl) @@ -983,7 +984,7 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): ].eval_with_dict(params) f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32, stride=1, - direction='stores', + direction='store', variable='x') ].eval_with_dict(params) @@ -1068,7 +1069,10 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): , count) for mem_access, count in six.iteritems(result)) - return result + if ignore_vars: + return sum_mem_access_across_vars(result) + else: + return result # {{{ sum_mem_access_to_bytes @@ -1134,7 +1138,7 @@ def sum_mem_access_across_vars(m): :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** - - The **variable** attribute in the keys of the returned mapping is set to 'ANY_VAR' + - The **variable** attribute in the keys of the returned mapping is set to None - The :class:`islpy.PwQPolynomial` holds the aggregate transfer size in bytes for memory accesses of all data types with the diff --git a/test/test_statistics.py b/test/test_statistics.py index 5629a0702..dd651e7c7 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -329,22 +329,22 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl, 'global', ignore_vars=True) n = 512 m = 256 l = 128 params = {'n': n, 
'm': m, 'l': l} f32 = poly[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='g') + stride=0, direction='load') ].eval_with_dict(params) f64 = poly[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h') + stride=0, direction='load') ].eval_with_dict(params) assert f32 == 2*n*m assert f64 == n*m f64 = poly[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e') + stride=0, direction='store') ].eval_with_dict(params) assert f64 == n*m -- GitLab From 61d27603c27e0867aaf97030a20e05340d2bb837 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 26 Oct 2016 19:29:34 -0500 Subject: [PATCH 21/55] added reduce_mem_access_poly_fields function --- loopy/__init__.py | 10 +-- loopy/statistics.py | 133 +++++++++++++++++++++++++++++++++------- test/test_statistics.py | 32 ++++++---- 3 files changed, 136 insertions(+), 39 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 6a482cc1c..e50e53fb7 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -115,9 +115,9 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, - sum_mem_access_to_bytes, sum_mem_access_across_vars, - get_synchronization_poly, gather_access_footprints, - gather_access_footprint_bytes) + sum_mem_access_to_bytes, sum_mem_access_across_vars, + reduce_mem_access_poly_fields, get_synchronization_poly, + gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -224,8 +224,8 @@ __all__ = [ "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", "sum_mem_access_to_bytes", "sum_mem_access_across_vars", - "get_synchronization_poly", "gather_access_footprints", - 
"gather_access_footprint_bytes", + "reduce_mem_access_poly_fields", "get_synchronization_poly", + "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index ab68c4253..adf0781c8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -48,6 +48,8 @@ __doc__ = """ .. autofunction:: get_mem_access_poly .. autofunction:: sum_mem_access_to_bytes +.. autofunction:: sum_mem_access_across_vars +.. autofunction:: reduce_mem_access_poly_fields .. autofunction:: get_synchronization_poly @@ -172,33 +174,47 @@ class MemAccess: """ #TODO currently counting all lmem access as stride-1 - def __init__(self, mtype, dtype, stride=1, direction=None, variable=None): + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype self.stride = stride self.direction = direction self.variable = variable - - from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) def __eq__(self, other): return isinstance(other, MemAccess) and ( - other.mtype == self.mtype and - other.dtype == self.dtype and - other.stride == self.stride and - other.direction == self.direction and - ((self.variable == None or other.variable == None) or + (self.mtype is None or other.mtype is None or + self.mtype == other.mtype) and + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.stride is None or other.stride is None or + self.stride == other.stride) and + (self.direction is None or other.direction is None or + self.direction == other.direction) and + (self.variable is None or other.variable is None or self.variable == other.variable)) def __hash__(self): + mtype = self.mtype + dtype = self.dtype + stride = self.stride direction = self.direction variable = self.variable - if direction == None: + if mtype is None: 
+ mtype = 'None' + if dtype is None: + dtype = 'None' + if stride is None: + stride = 'None' + if direction is None: direction = 'None' - if variable == None: + if variable is None: variable = 'None' - return hash(str(self.mtype)+str(self.dtype)+str(self.stride) - +direction+variable) + return hash(mtype+str(dtype)+str(stride)+direction+variable) @@ -931,7 +947,7 @@ def get_gmem_access_poly(knl): # }}} -def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False): +def get_mem_access_poly(knl, mtype, numpy_types=True): """Count the number of memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be @@ -944,9 +960,6 @@ def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False): in the returned mapping should be numpy types instead of :class:'loopy.LoopyType`. - :parameter ignore_vars: A :class:`boolean` specifying whether to separate - memory accesses by variable name. - :return: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1069,10 +1082,7 @@ def get_mem_access_poly(knl, mtype, numpy_types=True, ignore_vars=False): , count) for mem_access, count in six.iteritems(result)) - if ignore_vars: - return sum_mem_access_across_vars(result) - else: - return result + return result # {{{ sum_mem_access_to_bytes @@ -1190,6 +1200,87 @@ def sum_mem_access_across_vars(m): # }}} +# {{{ reduce_mem_access_poly_fields + +def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, + direction=True, variable=True): + """Take map returned from :func:`get_mem_access_poly`, remove specified MemAccess fields from keys, and combine counts + + :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + :parameter mtype: A :class:`boolean` specifying whether keys in returned + map will include the memory type. 
+ + :parameter dtype: A :class:`boolean` specifying whether keys in returned + map will include the data type. + + :parameter stride: A :class:`boolean` specifying whether keys in returned + map will include the stride. + + :parameter direction: A :class:`boolean` specifying whether keys in + returned map will include the direction. + + :parameter variable: A :class:`boolean` specifying whether keys in returned + map will include the variable name. + + + :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** + + - The :class:`islpy.PwQPolynomial` holds the aggregate transfer + size in bytes for memory accesses of all data types with the + characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_poly(knl) + reduced_mem_map = reduce_mem_access_poly_fields(mem_map, stride=False, + variable=False) + + all_f32_global_loads = reduced_mem_map[MemAccess('global', np.float32, + direction='load') + ].eval_with_dict(params) + all_f32_global_stores = reduced_mem_map[MemAccess('global', np.float32, + direction='store') + ].eval_with_dict(params) + all_f32_local_loads = reduced_mem_map[MemAccess('local', np.float32, + direction='load') + ].eval_with_dict(params) + all_f32_local_stores = reduced_mem_map[MemAccess('local', np.float32, + direction='store') + ].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result = {} + for k, v in m.items(): + new_key = MemAccess() + if mtype == True: + new_key.mtype = k.mtype + if dtype == True: + new_key.dtype = k.dtype + if stride == True: + new_key.stride = k.stride + if direction == True: + new_key.direction = k.direction + if variable == True: + new_key.variable = k.variable + + if new_key in result: + result[new_key] += m[k] + else: + result[new_key] = m[k] + + return result 
+ +# }}} + # {{{ get_synchronization_poly def get_synchronization_poly(knl): diff --git a/test/test_statistics.py b/test/test_statistics.py index dd651e7c7..51368781d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -329,24 +329,27 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_mem_access_poly(knl, 'global', ignore_vars=True) + poly = lp.get_mem_access_poly(knl, 'global') n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[lp.MemAccess('global', np.float32, - stride=0, direction='load') - ].eval_with_dict(params) - f64 = poly[lp.MemAccess('global', np.float64, - stride=0, direction='load') - ].eval_with_dict(params) - assert f32 == 2*n*m - assert f64 == n*m - f64 = poly[lp.MemAccess('global', np.float64, - stride=0, direction='store') - ].eval_with_dict(params) - assert f64 == n*m + reduced_map = lp.reduce_mem_access_poly_fields(poly, stride=False, + variable=False) + + f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), + direction='load') + ].eval_with_dict(params) + f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), + direction='load') + ].eval_with_dict(params) + f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), + direction='store') + ].eval_with_dict(params) + assert f32_g_l == 2*n*m + assert f64_g_l == n*m + assert f64_g_s == n*m def test_gmem_access_counter_specialops(): @@ -566,6 +569,9 @@ def test_gmem_access_counter_consec(): l = 128 params = {'n': n, 'm': m, 'l': l} + #for k in poly: + # print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", poly[k]) + f64consec = poly[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g') ].eval_with_dict(params) -- GitLab From 3b54c505f83ef0db4346d5edae035dd2ee1145fe Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 26 Oct 2016 22:50:39 -0500 
Subject: [PATCH 22/55] added filter_mem_access_poly_fields, sum_polys, and eval_and_sum_polys --- loopy/__init__.py | 6 ++- loopy/statistics.py | 94 ++++++++++++++++++++++++++++++++++++++--- test/test_statistics.py | 3 ++ 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index e50e53fb7..0f56d3d4a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,7 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, sum_mem_access_to_bytes, sum_mem_access_across_vars, - reduce_mem_access_poly_fields, get_synchronization_poly, + reduce_mem_access_poly_fields, filter_mem_access_poly_fields, + sum_polys, eval_and_sum_polys, get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -224,7 +225,8 @@ __all__ = [ "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", "sum_mem_access_to_bytes", "sum_mem_access_across_vars", - "reduce_mem_access_poly_fields", "get_synchronization_poly", + "reduce_mem_access_poly_fields", "filter_mem_access_poly_fields", + "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index adf0781c8..921dd6893 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1150,9 +1150,8 @@ def sum_mem_access_across_vars(m): - The **variable** attribute in the keys of the returned mapping is set to None - - The :class:`islpy.PwQPolynomial` holds the aggregate transfer - size in bytes for memory accesses of all data types with the - characteristics specified in the key (in terms of the + - The :class:`islpy.PwQPolynomial` holds the aggregate counts for + memory accesses across all variables 
(in terms of the :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1227,10 +1226,9 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** - - The :class:`islpy.PwQPolynomial` holds the aggregate transfer - size in bytes for memory accesses of all data types with the - characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the counts (in terms of + the :class:`loopy.LoopKernel` *inames*) for memory accesses + categorized by the fields not set to False. Example usage:: @@ -1281,6 +1279,88 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, # }}} +# {{{ filter_mem_access_poly_fields + +def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None, + directions=None, variables=None): + """Take map returned from :func:`get_mem_access_poly` and remove items without specified MemAccess fields + + :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + :parameter mtypes: A list of :class:`string` that specifies the memory type + accessed as **global** or **local** + + :parameter dtypes: A list of :class:`loopy.LoopyType` (or + :class:`numpy.dtype`) that specifies the data type + accessed. + + :parameter strides: A list of :class:`int` specifies stride of the memory + access. A stride of 0 indicates a uniform access (i.e. + all threads access the same item). + + :parameter directions: A list of :class:`string` that specifies the + direction of memory access as **load** or **store**. + + :parameter variables: A list of :class:`string` that specifies the variable + name of the data accessed. 
+ + + :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** + + - The :class:`islpy.PwQPolynomial` holds the counts (in terms of + the :class:`loopy.LoopKernel` *inames*) for memory accesses + matching the fields passed as parameters. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_poly(knl) + filtered_map = lp.filter_mem_access_poly_fields(mem_map, + directions=['load'], + variables=['a','g']) + tot = lp.eval_and_sum_polys(filtered_map, params) + + # (now use these counts to predict performance) + + """ + + if dtypes is not None: + dtypes_lp = [to_loopy_type(d) for d in dtypes] + + result = {} + + for k, v in m.items(): + if (mtypes is None or k.mtype in mtypes) and \ + (dtypes is None or k.dtype in dtypes_lp) and \ + (strides is None or k.stride in strides) and \ + (directions is None or k.direction in directions) and \ + (variables is None or k.variable in variables): + + new_key = MemAccess(k.mtype, k.dtype, k.stride, k.direction, k.variable) + + if new_key in result: + result[new_key] += m[k] + else: + result[new_key] = m[k] + + return result + +# }}} + +def sum_polys(m): + total = isl.PwQPolynomial('{ 0 }') + for k, v in m.items(): + total += v + return total + + +def eval_and_sum_polys(m, params): + return sum_polys(m).eval_with_dict(params) + + # {{{ get_synchronization_poly def get_synchronization_poly(knl): diff --git a/test/test_statistics.py b/test/test_statistics.py index 51368781d..305a6cb9d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -395,6 +395,9 @@ def test_gmem_access_counter_specialops(): assert f32 == n*m*l assert f64 == n*m + filtered_map = lp.filter_mem_access_poly_fields(poly, directions=['load'], variables=['a','g']) + tot = lp.eval_and_sum_polys(filtered_map, params) + assert tot == n*m*l + n*m def test_gmem_access_counter_bitwise(): -- GitLab From 
a7d04f6884bb41a9c8dfac0c20c8d3b6587e0322 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 27 Oct 2016 03:26:50 -0500 Subject: [PATCH 23/55] removed sum_mem_access_across_vars (now redundant), removed mtype specifyer from get_mem_access_poly, which now returns map with all mtypes --- loopy/__init__.py | 16 ++-- loopy/statistics.py | 170 +++++++++++++--------------------------- test/test_statistics.py | 91 ++++++++++++++------- 3 files changed, 122 insertions(+), 155 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 0f56d3d4a..15fe458e6 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -115,10 +115,10 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, - sum_mem_access_to_bytes, sum_mem_access_across_vars, - reduce_mem_access_poly_fields, filter_mem_access_poly_fields, - sum_polys, eval_and_sum_polys, get_synchronization_poly, - gather_access_footprints, gather_access_footprint_bytes) + sum_mem_access_to_bytes, reduce_mem_access_poly_fields, + filter_mem_access_poly_fields, sum_polys, eval_and_sum_polys, + get_synchronization_poly, gather_access_footprints, + gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -224,10 +224,10 @@ __all__ = [ "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", - "sum_mem_access_to_bytes", "sum_mem_access_across_vars", - "reduce_mem_access_poly_fields", "filter_mem_access_poly_fields", - "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", - "gather_access_footprints", "gather_access_footprint_bytes", + "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields", + 
"filter_mem_access_poly_fields", "sum_polys", "eval_and_sum_polys", + "get_synchronization_poly", "gather_access_footprints", + "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 921dd6893..7c5efb3d7 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -48,7 +48,6 @@ __doc__ = """ .. autofunction:: get_mem_access_poly .. autofunction:: sum_mem_access_to_bytes -.. autofunction:: sum_mem_access_across_vars .. autofunction:: reduce_mem_access_poly_fields .. autofunction:: get_synchronization_poly @@ -919,9 +918,11 @@ def get_lmem_access_poly(knl): """ from warnings import warn warn("get_lmem_access_poly is deprecated. " - "Use get_mem_access_poly with local option instead", + "Instead, use get_mem_access_poly and then pass the result to " + "filter_mem_access_poly_fields with mtypes=['local'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl, 'local') + return filter_mem_access_poly_fields( + get_mem_access_poly(knl), mtypes=['local']) def get_DRAM_access_poly(knl): @@ -929,9 +930,11 @@ def get_DRAM_access_poly(knl): """ from warnings import warn warn("get_DRAM_access_poly is deprecated. " - "Use get_mem_access_poly with global option instead", + "Instead, use get_mem_access_poly and then pass the result to " + "filter_mem_access_poly_fields with mtypes=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl, 'global') + return filter_mem_access_poly_fields( + get_mem_access_poly(knl), mtypes=['global']) # {{{ get_gmem_access_poly @@ -940,22 +943,21 @@ def get_gmem_access_poly(knl): """ from warnings import warn warn("get_gmem_access_poly is deprecated. 
" - "Use get_mem_access_poly with global option instead", + "Instead, use get_mem_access_poly and then pass the result to " + "filter_mem_access_poly_fields with mtypes=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl, 'global') + return filter_mem_access_poly_fields( + get_mem_access_poly(knl), mtypes=['global']) # }}} -def get_mem_access_poly(knl, mtype, numpy_types=True): +def get_mem_access_poly(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be counted. - :parameter mtype: A :class:`string` specifying the memory accesses as - *global* or *local*. - :parameter numpy_types: A :class:`boolean` specifying whether the types in the returned mapping should be numpy types instead of :class:'loopy.LoopyType`. @@ -975,31 +977,28 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - gmem_access_map = get_mem_access_poly('global', knl) + mem_access_map = get_mem_access_poly(knl) - f32_stride1_g_loads_a = gmem_access_map[MemAccess('global', np.float32, + f32_stride1_g_loads_a = mem_access_map[MemAccess('global', np.float32, + stride=1, + direction='load', + variable='a') + ].eval_with_dict(params) + f32_stride1_g_stores_a = mem_access_map[MemAccess('global', np.float32, stride=1, - direction='load', + direction='store', variable='a') ].eval_with_dict(params) - f32_stride1_g_stores_a = gmem_access_map[MemAccess('global', np.float32, - stride=1, - direction='store', - variable='a') - ].eval_with_dict(params) - - lmem_access_map = get_mem_access_poly('local', knl) - - f32_stride1_l_loads_x = lmem_access_map[MemAccess('local', np.float32, + f32_stride1_l_loads_x = mem_access_map[MemAccess('local', np.float32, + stride=1, + direction='load', + variable='x') + ].eval_with_dict(params) + f32_stride1_l_stores_x = 
mem_access_map[MemAccess('local', np.float32, stride=1, - direction='load', + direction='store', variable='x') ].eval_with_dict(params) - f32_stride1_l_stores_x = lmem_access_map[MemAccess('local', np.float32, - stride=1, - direction='store', - variable='x') - ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1027,50 +1026,48 @@ def get_mem_access_poly(knl, mtype, numpy_types=True): knl = preprocess_kernel(knl) subs_poly = ToCountMap() - if mtype == 'global': - subscript_counter = GlobalSubscriptCounter(knl) - elif mtype == 'local': - subscript_counter = LocalSubscriptCounter(knl) - else: - raise ValueError("get_mem_access_poly: mtype must be " - "'local' or 'global', received {0}" - .format(mtype)) + subs_counter_g = GlobalSubscriptCounter(knl) + subs_counter_l = LocalSubscriptCounter(knl) for insn in knl.instructions: - # count subscripts, distinguishing loads and stores - subs_expr = subscript_counter(insn.expression) + # count subscripts + subs_expr = subs_counter_g(insn.expression) \ + + subs_counter_l(insn.expression) + + # distinguish loads and stores for key in subs_expr.dict: subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride, direction='load', variable=key.variable) ] = subs_expr.dict.pop(key) - if mtype == 'global': # for now, don't count writes to local mem - subs_assignee = subscript_counter(insn.assignee) - for key in subs_assignee.dict: - subs_assignee.dict[MemAccess(key.mtype, key.dtype, - stride=key.stride, direction='store', - variable=key.variable) - ] = subs_assignee.dict.pop(key) + subs_assignee_g = subs_counter_g(insn.assignee) + for key in subs_assignee_g.dict: + subs_assignee_g.dict[MemAccess(key.mtype, key.dtype, + stride=key.stride, direction='store', + variable=key.variable) + ] = subs_assignee_g.dict.pop(key) + # for now, don't count writes to local mem insn_inames = knl.insn_inames(insn) # use count excluding local index tags for uniform accesses for key in subs_expr.dict: poly = ToCountMap({key: 
subs_expr.dict[key]}) - if mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: + if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: subs_poly = subs_poly \ + poly*get_insn_count(knl, insn_inames, True) else: subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + #currently not counting stride of local mem access - if mtype == 'global': # for now, don't count writes to local mem - for key in subs_assignee.dict: - poly = ToCountMap({key: subs_assignee.dict[key]}) - if isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly \ - + poly*get_insn_count(knl, insn_inames, True) - else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + for key in subs_assignee_g.dict: + poly = ToCountMap({key: subs_assignee_g.dict[key]}) + if isinstance(key.stride, int) and key.stride == 0: + subs_poly = subs_poly \ + + poly*get_insn_count(knl, insn_inames, True) + else: + subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + # for now, don't count writes to local mem result = subs_poly.dict @@ -1109,7 +1106,7 @@ def sum_mem_access_to_bytes(m): # (first create loopy kernel and specify array data types) - mem_access_map = get_mem_access_poly('global', knl) + mem_access_map = get_mem_access_poly(knl) byte_totals_map = sum_mem_access_to_bytes(mem_access_map) params = {'n': 512, 'm': 256, 'l': 128} @@ -1138,67 +1135,6 @@ def sum_mem_access_to_bytes(m): return result # }}} - -# {{{ sum_mem_access_across_vars - -def sum_mem_access_across_vars(m): - """Remove variable name divisions in mapping returned by :func:`get_mem_access_poly` - - :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
- - :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** - - - The **variable** attribute in the keys of the returned mapping is set to None - - - The :class:`islpy.PwQPolynomial` holds the aggregate counts for - memory accesses across all variables (in terms of the - :class:`loopy.LoopKernel` *inames*). - - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - gmem_access_map = get_mem_access_poly('global', knl) - gmem_acrossvars = sum_mem_access_across_vars(gmem_access_map) - - f32_stride1_g_loads = gmem_acrossvars[MemAccess('global', np.float32, - stride=1, - direction='load') # do not specify variable - ].eval_with_dict(params) - f32_stride1_g_stores = gmem_acrossvars[MemAccess('global', np.float32, - stride=1, - direction='store') # do not specify variable - ].eval_with_dict(params) - - lmem_access_map = get_mem_access_poly('local', knl) - lmem_acrossvars = sum_mem_access_across_vars(lmem_access_map) - - f32_stride1_l_loads = lmem_acrossvars[MemAccess('local', np.float32, - stride=1, - direction='load') # do not specify variable - ].eval_with_dict(params) - f32_stride1_l_stores = lmem_acrossvars[MemAccess('local', np.float32, - stride=1, - direction='store') # do not specify variable - ].eval_with_dict(params) - - # (now use these counts to predict performance) - - """ - - result = {} - for mem_access, v in m.items(): - new_key = MemAccess(mem_access.mtype, mem_access.dtype, mem_access.stride, mem_access.direction) - if new_key in result: - result[new_key] += m[mem_access] - else: - result[new_key] = m[mem_access] - - return result - -# }}} - # {{{ reduce_mem_access_poly_fields def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, diff --git a/test/test_statistics.py b/test/test_statistics.py index 305a6cb9d..7ad06f3eb 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -236,7 +236,7 @@ def 
test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ -265,22 +265,6 @@ def test_gmem_access_counter_basic(): assert f32s == n*m*l assert f64s == n*m - poly_b = lp.sum_mem_access_to_bytes(poly) - s0load = poly_b[('global', 0, 'load')].eval_with_dict(params) - s0store = poly_b[('global', 0, 'store')].eval_with_dict(params) - assert s0load == 4*f32l + 8*f64l - assert s0store == 4*f32s + 8*f64s - - poly_c = lp.sum_mem_access_across_vars(poly) - f32lall = poly_c[lp.MemAccess('global', np.float32, - stride=0, direction='load') - ].eval_with_dict(params) - f64lall = poly_c[lp.MemAccess('global', np.float64, - stride=0, direction='load') - ].eval_with_dict(params) - assert f32lall == 3*n*m*l - assert f64lall == 2*n*m - def test_gmem_access_counter_reduction(): @@ -292,7 +276,7 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ -329,7 +313,7 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ -366,7 +350,7 @@ def test_gmem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ -416,7 +400,7 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ 
-462,7 +446,7 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = lp.get_mem_access_poly(knl, 'global') # noqa + poly = lp.get_mem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -515,7 +499,7 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = lp.get_mem_access_poly(knl, 'global') # noqa + poly = lp.get_mem_access_poly(knl) # noqa n = 512 m = 256 l = 128 @@ -566,7 +550,7 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = lp.get_mem_access_poly(knl, 'global') + poly = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 @@ -670,10 +654,7 @@ def test_all_counters_parallel_matmul(): l = 128 params = {'n': n, 'm': m, 'l': l} - #barrier_count = get_barrier_poly(knl).eval_with_dict(params) - #assert barrier_count == 2*m/16 sync_poly = lp.get_synchronization_poly(knl) - #assert len(sync_poly) == 1 #TODO why? 
assert len(sync_poly) == 2 assert sync_poly["kernel_launch"].eval_with_dict(params) == 1 assert sync_poly["barrier_local"].eval_with_dict(params) == 2*m/16 @@ -694,7 +675,7 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - subscript_map = lp.get_mem_access_poly(knl, 'global') + subscript_map = lp.get_mem_access_poly(knl) f32coal = subscript_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') @@ -711,9 +692,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_subs_map = lp.get_mem_access_poly(knl, 'local') + local_subs_map = lp.get_mem_access_poly(knl) - # TODO currently considering all local mem access stride-1 local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) @@ -752,6 +732,57 @@ def test_gather_access_footprint_2(): print(key, count(knl, footprint)) +def test_summations_and_filters(): + + knl = lp.make_kernel( + "[n,m,l] -> {[i,k,j]: 0<=i 1: exec(sys.argv[1]) -- GitLab From 1b1180c3ee46e7e701ddc97043f87f6dfcde15be Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 27 Oct 2016 04:23:37 -0500 Subject: [PATCH 24/55] added reduce_op_poly_fields and filter_op_poly_fields --- loopy/__init__.py | 6 +- loopy/statistics.py | 144 ++++++++++++++++++++++++++++++++++++++-- test/test_statistics.py | 36 +++++++--- 3 files changed, 168 insertions(+), 18 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 15fe458e6..05ccddb54 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,7 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, sum_mem_access_to_bytes, reduce_mem_access_poly_fields, - filter_mem_access_poly_fields, sum_polys, eval_and_sum_polys, + filter_mem_access_poly_fields, reduce_op_poly_fields, + filter_op_poly_fields, sum_polys, eval_and_sum_polys, 
get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -225,7 +226,8 @@ __all__ = [ "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields", - "filter_mem_access_poly_fields", "sum_polys", "eval_and_sum_polys", + "filter_mem_access_poly_fields", "reduce_op_poly_fields", + "filter_op_poly_fields", "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index 7c5efb3d7..5de9b6b0e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -49,6 +49,10 @@ __doc__ = """ .. autofunction:: sum_mem_access_to_bytes .. autofunction:: reduce_mem_access_poly_fields +.. autofunction:: filter_mem_access_poly_fields + +.. autofunction:: reduce_op_poly_fields +.. autofunction:: filter_op_poly_fields .. autofunction:: get_synchronization_poly @@ -128,18 +132,29 @@ class Op: """ - def __init__(self, dtype, name): + def __init__(self, dtype=None, name=None): self.name = name - from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) + if dtype is None: + self.dtype = dtype + else: + from loopy.types import to_loopy_type + self.dtype = to_loopy_type(dtype) def __eq__(self, other): return isinstance(other, Op) and ( - other.dtype == self.dtype and - other.name == self.name ) + (self.dtype is None or other.dtype is None or + self.dtype == other.dtype) and + (self.name is None or other.name is None or + self.name == other.name)) def __hash__(self): - return hash(str(self.dtype)+self.name) + dtype = self.dtype + name = self.name + if dtype is None: + dtype = 'None' + if name is None: + name = 'None' + return hash(str(dtype)+name) class MemAccess: @@ -172,7 +187,7 @@ class MemAccess: """ - #TODO currently counting all lmem access as stride-1 + #TODO currently counting all 
lmem access as stride None def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype self.stride = stride @@ -1135,6 +1150,7 @@ def sum_mem_access_to_bytes(m): return result # }}} + # {{{ reduce_mem_access_poly_fields def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, @@ -1263,6 +1279,7 @@ def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None, """ + from loopy.types import to_loopy_type if dtypes is not None: dtypes_lp = [to_loopy_type(d) for d in dtypes] @@ -1286,6 +1303,119 @@ def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None, # }}} +# {{{ reduce_op_poly_fields + +def reduce_op_poly_fields(m, dtype=True, name=True): + """Take map returned from :func:`get_op_poly`, remove specified Op fields from keys, and combine counts + + :parameter m: A mapping of **{** :class:`loopy.Op` **:** + :class:`islpy.PwQPolynomial` **}**. + + :parameter dtype: A :class:`boolean` specifying whether keys in returned + map will include the data type. + + :parameter name: A :class:`boolean` specifying whether keys in returned + map will include the name of the operation. + + :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}** + + - The :class:`islpy.PwQPolynomial` holds the counts (in terms of + the :class:`loopy.LoopKernel` *inames*) for arithmetic ops + categorized by the fields not set to False. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + op_map = get_op_poly(knl) + reduced_op_map = reduce_op_fields(op_map, name=False) + + all_f32_ops = reduced_op_map[Op(dtype=np.float32)].eval_with_dict(params) + all_f64_ops = reduced_op_map[Op(dtype=np.float64)].eval_with_dict(params) + + reduced_op_map = reduce_op_fields(op_map, dtype=False) + + all_add_ops = reduced_op_map[Op(name='add')].eval_with_dict(params) + all_mul_ops = reduced_op_map[Op(name='mul')].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result = {} + for k, v in m.items(): + new_key = Op() + if dtype == True: + new_key.dtype = k.dtype + if name == True: + new_key.name = k.name + + if new_key in result: + result[new_key] += m[k] + else: + result[new_key] = m[k] + + return result + +# }}} + +# {{{ filter_op_poly_fields + +def filter_op_poly_fields(m, dtypes=None, names=None): + """Take map returned from :func:`get_op_poly` and remove items without specified Op fields + + :parameter m: A mapping of **{** :class:`loopy.Op` **:** + :class:`islpy.PwQPolynomial` **}**. + + :parameter dtypes: A list of :class:`loopy.LoopyType` (or + :class:`numpy.dtype`) that specifies the data type + operated on. + + :parameter names: A list of :class:`string` that specifies the kind of + arithmetic operation as *add*, *sub*, *mul*, *div*, + *pow*, *shift*, *bw* (bitwise), etc. + + :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}** + + - The :class:`islpy.PwQPolynomial` holds the counts (in terms of + the :class:`loopy.LoopKernel` *inames*) for arithmetic ops + matching the fields passed as parameters. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + op_map = lp.get_op_poly(knl) + filtered_map = lp.filter_op_poly_fields(op_map, names=['add', 'sub']) + tot_addsub = lp.eval_and_sum_polys(filtered_map, params) + + # (now use these counts to predict performance) + + """ + + from loopy.types import to_loopy_type + if dtypes is not None: + dtypes_lp = [to_loopy_type(d) for d in dtypes] + + result = {} + + for k, v in m.items(): + if (dtypes is None or k.dtype in dtypes_lp) and \ + (names is None or k.name in names): + + new_key = Op(k.dtype, k.name) + + if new_key in result: + result[new_key] += m[k] + else: + result[new_key] = m[k] + + return result + +# }}} + def sum_polys(m): total = isl.PwQPolynomial('{ 0 }') for k, v in m.items(): diff --git a/test/test_statistics.py b/test/test_statistics.py index 7ad06f3eb..a853e8c30 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -61,14 +61,6 @@ def test_op_counter_basic(): assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 - - poly_dtype = lp.sum_ops_to_dtypes(poly) - f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) - f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params) - i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params) - assert f32 == f32add + f32mul + f32div - assert f64 == f64mul - assert i32 == i32add def test_op_counter_reduction(): @@ -739,7 +731,7 @@ def test_summations_and_filters(): [ """ c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] - e[i, k] = g[i,k]*h[i,k+1] + e[i, k+1] = -g[i,k]*h[i,k+1] """ ], name="basic", assumptions="n,m,l >= 1") @@ -782,6 +774,32 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m + poly_dtype = lp.sum_ops_to_dtypes(lp.get_op_poly(knl)) + f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) + f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params) + 
i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params) + assert f32 == n*m*l*3 + assert f64 == n*m + assert i32 == n*m*2 + + addsub_all = lp.eval_and_sum_polys( + lp.filter_op_poly_fields( + lp.get_op_poly(knl), names=['add', 'sub']), + params) + f32ops_all = lp.eval_and_sum_polys( + lp.filter_op_poly_fields( + lp.get_op_poly(knl), dtypes=[np.float32]), + params) + assert addsub_all == n*m*l + n*m*2 + assert f32ops_all == n*m*l*3 + + ops_nodtype = lp.reduce_op_poly_fields(lp.get_op_poly(knl), dtype=False) + ops_noname = lp.reduce_op_poly_fields(lp.get_op_poly(knl), name=False) + mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) + f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) + assert mul_all == n*m*l + n*m + assert f64ops_all == n*m + if __name__ == "__main__": if len(sys.argv) > 1: -- GitLab From c892d77f8c818fcbe2335e56106260462320d282 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sun, 30 Oct 2016 22:32:05 -0500 Subject: [PATCH 25/55] changed mem_access and op filter functions to single ToCountMap member function --- loopy/__init__.py | 8 +- loopy/statistics.py | 219 +++++++++++++--------------------------- test/test_statistics.py | 42 ++++---- 3 files changed, 94 insertions(+), 175 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 05ccddb54..80c266ba4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,8 +116,8 @@ from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, sum_mem_access_to_bytes, reduce_mem_access_poly_fields, - filter_mem_access_poly_fields, reduce_op_poly_fields, - filter_op_poly_fields, sum_polys, eval_and_sum_polys, + reduce_op_poly_fields, + sum_polys, eval_and_sum_polys, get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -226,8 +226,8 @@ __all__ = [ 
"sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields", - "filter_mem_access_poly_fields", "reduce_op_poly_fields", - "filter_op_poly_fields", "sum_polys", "eval_and_sum_polys", + "reduce_op_poly_fields", + "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index 5de9b6b0e..b5e37d2d0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -36,10 +36,13 @@ from loopy.kernel.data import MultiAssignmentBase from loopy.diagnostic import warn, LoopyError +#TODO does this work for class functions? __doc__ = """ .. currentmodule:: loopy +.. autofunction:: filter + .. autofunction:: get_op_poly .. autofunction:: get_lmem_access_poly @@ -49,10 +52,8 @@ __doc__ = """ .. autofunction:: sum_mem_access_to_bytes .. autofunction:: reduce_mem_access_poly_fields -.. autofunction:: filter_mem_access_poly_fields .. autofunction:: reduce_op_poly_fields -.. autofunction:: filter_op_poly_fields .. autofunction:: get_synchronization_poly @@ -107,6 +108,59 @@ class ToCountMap: def __repr__(self): return repr(self.dict) + def items(self): + return self.dict.items() + + def filter(self, **kwargs): + """Remove items without specified key fields + + :parameter **kwargs: Keyword arguments matching fields in the keys of + the :class:`ToCountMap`, each given a list of + allowable values for that key field. 
+ + :return: A :class:`ToCountMap` containing the subset of the items in + the oriinal :class:`ToCountMap` that match the field values + passed + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_poly(knl) + filtered_map = mem_map.filter(directions=['load'], + variables=['a','g']) + tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params) + + # (now use these counts to predict performance) + + """ + + new_map = ToCountMap() + + from loopy.types import to_loopy_type + if 'dtype' in kwargs.keys(): + kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] + + # for each item in self.dict + for self_key, self_val in self.dict.items(): + try: + # check to see if key attribute values match all filters + for arg_field, allowable_vals in kwargs.items(): + attr_val = getattr(self_key, arg_field) + # see if the value is in the filter list + if attr_val not in allowable_vals: + print("DEBUG: "+str(attr_val)+" not in ", allowable_vals, ", removing.") + break + else: # loop terminated without break or error + new_map.dict[self_key] = self_val + except(AttributeError): + # the field passed is not a field of this key + print("DEBUG: "+arg_field+" not in ", self_key, ", removing.") + continue + + return new_map + # }}} @@ -884,7 +938,7 @@ def get_op_poly(knl, numpy_types=True): (Op(op.dtype.numpy_dtype, op.name), count) for op, count in six.iteritems(result)) - return result + return ToCountMap(result) # }}} @@ -927,29 +981,25 @@ def sum_ops_to_dtypes(op_poly_dict): return result - +#TODO test depricated functions? def get_lmem_access_poly(knl): """Count the number of local memory accesses in a loopy kernel. """ from warnings import warn - warn("get_lmem_access_poly is deprecated. " - "Instead, use get_mem_access_poly and then pass the result to " - "filter_mem_access_poly_fields with mtypes=['local'] option.", + warn("get_lmem_access_poly is deprecated. 
Use get_mem_access_poly and " + "filter the result with the mtype=['local'] option.", DeprecationWarning, stacklevel=2) - return filter_mem_access_poly_fields( - get_mem_access_poly(knl), mtypes=['local']) + return get_mem_access_poly(knl).filter(mtypes=['local']) def get_DRAM_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. """ from warnings import warn - warn("get_DRAM_access_poly is deprecated. " - "Instead, use get_mem_access_poly and then pass the result to " - "filter_mem_access_poly_fields with mtypes=['global'] option.", + warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and " + "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return filter_mem_access_poly_fields( - get_mem_access_poly(knl), mtypes=['global']) + return get_mem_access_poly(knl).filter(mtypes=['global']) # {{{ get_gmem_access_poly @@ -957,13 +1007,10 @@ def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. """ from warnings import warn - warn("get_gmem_access_poly is deprecated. " - "Instead, use get_mem_access_poly and then pass the result to " - "filter_mem_access_poly_fields with mtypes=['global'] option.", + warn("get_DRAM_access_poly is deprecated. 
Use get_mem_access_poly and " + "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return filter_mem_access_poly_fields( - get_mem_access_poly(knl), mtypes=['global']) - + return get_mem_access_poly(knl).filter(mtypes=['global']) # }}} @@ -1091,10 +1138,10 @@ def get_mem_access_poly(knl, numpy_types=True): stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable) - , count) + , count) for mem_access, count in six.iteritems(result)) - return result + return ToCountMap(result) # {{{ sum_mem_access_to_bytes @@ -1231,78 +1278,6 @@ def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, # }}} -# {{{ filter_mem_access_poly_fields - -def filter_mem_access_poly_fields(m, mtypes=None, dtypes=None, strides=None, - directions=None, variables=None): - """Take map returned from :func:`get_mem_access_poly` and remove items without specified MemAccess fields - - :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. - - :parameter mtypes: A list of :class:`string` that specifies the memory type - accessed as **global** or **local** - - :parameter dtypes: A list of :class:`loopy.LoopyType` (or - :class:`numpy.dtype`) that specifies the data type - accessed. - - :parameter strides: A list of :class:`int` specifies stride of the memory - access. A stride of 0 indicates a uniform access (i.e. - all threads access the same item). - - :parameter directions: A list of :class:`string` that specifies the - direction of memory access as **load** or **store**. - - :parameter variables: A list of :class:`string` that specifies the variable - name of the data accessed. - - - :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** - - - The :class:`islpy.PwQPolynomial` holds the counts (in terms of - the :class:`loopy.LoopKernel` *inames*) for memory accesses - matching the fields passed as parameters. 
- - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - mem_map = lp.get_mem_access_poly(knl) - filtered_map = lp.filter_mem_access_poly_fields(mem_map, - directions=['load'], - variables=['a','g']) - tot = lp.eval_and_sum_polys(filtered_map, params) - - # (now use these counts to predict performance) - - """ - - from loopy.types import to_loopy_type - if dtypes is not None: - dtypes_lp = [to_loopy_type(d) for d in dtypes] - - result = {} - - for k, v in m.items(): - if (mtypes is None or k.mtype in mtypes) and \ - (dtypes is None or k.dtype in dtypes_lp) and \ - (strides is None or k.stride in strides) and \ - (directions is None or k.direction in directions) and \ - (variables is None or k.variable in variables): - - new_key = MemAccess(k.mtype, k.dtype, k.stride, k.direction, k.variable) - - if new_key in result: - result[new_key] += m[k] - else: - result[new_key] = m[k] - - return result - -# }}} - # {{{ reduce_op_poly_fields def reduce_op_poly_fields(m, dtype=True, name=True): @@ -1360,62 +1335,6 @@ def reduce_op_poly_fields(m, dtype=True, name=True): # }}} -# {{{ filter_op_poly_fields - -def filter_op_poly_fields(m, dtypes=None, names=None): - """Take map returned from :func:`get_op_poly` and remove items without specified Op fields - - :parameter m: A mapping of **{** :class:`loopy.Op` **:** - :class:`islpy.PwQPolynomial` **}**. - - :parameter dtypes: A list of :class:`loopy.LoopyType` (or - :class:`numpy.dtype`) that specifies the data type - operated on. - - :parameter names: A list of :class:`string` that specifies the kind of - arithmetic operation as *add*, *sub*, *mul*, *div*, - *pow*, *shift*, *bw* (bitwise), etc. - - :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}** - - - The :class:`islpy.PwQPolynomial` holds the counts (in terms of - the :class:`loopy.LoopKernel` *inames*) for arithmetic ops - matching the fields passed as parameters. 
- - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - op_map = lp.get_op_poly(knl) - filtered_map = lp.filter_op_poly_fields(op_map, names=['add', 'sub']) - tot_addsub = lp.eval_and_sum_polys(filtered_map, params) - - # (now use these counts to predict performance) - - """ - - from loopy.types import to_loopy_type - if dtypes is not None: - dtypes_lp = [to_loopy_type(d) for d in dtypes] - - result = {} - - for k, v in m.items(): - if (dtypes is None or k.dtype in dtypes_lp) and \ - (names is None or k.name in names): - - new_key = Op(k.dtype, k.name) - - if new_key in result: - result[new_key] += m[k] - else: - result[new_key] = m[k] - - return result - -# }}} - def sum_polys(m): total = isl.PwQPolynomial('{ 0 }') for k, v in m.items(): diff --git a/test/test_statistics.py b/test/test_statistics.py index a853e8c30..eb51fbfd0 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -371,7 +371,7 @@ def test_gmem_access_counter_specialops(): assert f32 == n*m*l assert f64 == n*m - filtered_map = lp.filter_mem_access_poly_fields(poly, directions=['load'], variables=['a','g']) + filtered_map = poly.filter(direction=['load'], variable=['a','g']) tot = lp.eval_and_sum_polys(filtered_map, params) assert tot == n*m*l + n*m @@ -744,29 +744,27 @@ def test_summations_and_filters(): l = 128 params = {'n': n, 'm': m, 'l': l} + mem_map = lp.get_mem_access_poly(knl) + loads_a = lp.eval_and_sum_polys( - lp.filter_mem_access_poly_fields( - lp.get_mem_access_poly(knl), - directions=['load'], variables=['a']), + mem_map.filter(direction=['load'], variable=['a']), params) assert loads_a == 2*n*m*l global_stores = lp.eval_and_sum_polys( - lp.filter_mem_access_poly_fields( - lp.get_mem_access_poly(knl), - mtypes=['global'], directions=['store']), + mem_map.filter(mtype=['global'], direction=['store']), params) assert global_stores == n*m*l + n*m - bytes_map = 
lp.sum_mem_access_to_bytes(lp.get_mem_access_poly(knl)) + bytes_map = lp.sum_mem_access_to_bytes(mem_map) s0load = bytes_map[('global', 0, 'load')].eval_with_dict(params) s0store = bytes_map[('global', 0, 'store')].eval_with_dict(params) assert s0load == 4*n*m*l*3 + 8*n*m*2 assert s0store == 4*n*m*l + 8*n*m # ignore stride and variable names in this map - reduced_map = lp.reduce_mem_access_poly_fields(lp.get_mem_access_poly(knl), - stride=False, variable=False) + reduced_map = lp.reduce_mem_access_poly_fields(mem_map, stride=False, + variable=False) f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load') ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') @@ -774,7 +772,9 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m - poly_dtype = lp.sum_ops_to_dtypes(lp.get_op_poly(knl)) + op_map = lp.get_op_poly(knl) + + poly_dtype = lp.sum_ops_to_dtypes(op_map) f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params) i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params) @@ -782,25 +782,25 @@ def test_summations_and_filters(): assert f64 == n*m assert i32 == n*m*2 - addsub_all = lp.eval_and_sum_polys( - lp.filter_op_poly_fields( - lp.get_op_poly(knl), names=['add', 'sub']), - params) - f32ops_all = lp.eval_and_sum_polys( - lp.filter_op_poly_fields( - lp.get_op_poly(knl), dtypes=[np.float32]), - params) + addsub_all = lp.eval_and_sum_polys(op_map.filter(name=['add', 'sub']), + params) + f32ops_all = lp.eval_and_sum_polys(op_map.filter(dtype=[np.float32]), + params) assert addsub_all == n*m*l + n*m*2 assert f32ops_all == n*m*l*3 - ops_nodtype = lp.reduce_op_poly_fields(lp.get_op_poly(knl), dtype=False) - ops_noname = lp.reduce_op_poly_fields(lp.get_op_poly(knl), name=False) + non_field = lp.eval_and_sum_polys(op_map.filter(xxx=[np.float32]), params) + assert non_field == 0 + + 
ops_nodtype = lp.reduce_op_poly_fields(op_map, dtype=False) + ops_noname = lp.reduce_op_poly_fields(op_map, name=False) mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) assert mul_all == n*m*l + n*m assert f64ops_all == n*m + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From a699f3232afd915b59f63a70bbeb982c43bc511c Mon Sep 17 00:00:00 2001 From: James Stevens Date: Mon, 31 Oct 2016 00:38:47 -0500 Subject: [PATCH 26/55] replaced reduce_mem_access and reduce_op functions with ToCountMap group_by member function --- loopy/__init__.py | 8 +- loopy/statistics.py | 238 +++++++++++++++------------------------- test/test_statistics.py | 22 ++-- 3 files changed, 99 insertions(+), 169 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 80c266ba4..8b10edf19 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -115,9 +115,7 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, - sum_mem_access_to_bytes, reduce_mem_access_poly_fields, - reduce_op_poly_fields, - sum_polys, eval_and_sum_polys, + sum_mem_access_to_bytes, sum_polys, eval_and_sum_polys, get_synchronization_poly, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -225,9 +223,7 @@ __all__ = [ "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", - "sum_mem_access_to_bytes", "reduce_mem_access_poly_fields", - "reduce_op_poly_fields", - "sum_polys", "eval_and_sum_polys", + "sum_mem_access_to_bytes", "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", "gather_access_footprints", 
"gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index b5e37d2d0..d251e249b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -41,7 +41,8 @@ __doc__ = """ .. currentmodule:: loopy -.. autofunction:: filter +.. autofunction:: filter_by +.. autofunction:: group_by .. autofunction:: get_op_poly @@ -91,7 +92,7 @@ class ToCountMap: if isinstance(other, isl.PwQPolynomial): return ToCountMap(dict( (index, self.dict[index]*other) - for index in self.dict.keys())) + for index in self.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {0} {1}." @@ -105,13 +106,19 @@ class ToCountMap: except KeyError: return isl.PwQPolynomial('{ 0 }') + def __setitem__(self, index, value): + self.dict[index] = value + def __repr__(self): return repr(self.dict) def items(self): return self.dict.items() - def filter(self, **kwargs): + def keys(self): + return self.dict.keys() + + def filter_by(self, **kwargs): """Remove items without specified key fields :parameter **kwargs: Keyword arguments matching fields in the keys of @@ -128,7 +135,7 @@ class ToCountMap: params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_poly(knl) - filtered_map = mem_map.filter(directions=['load'], + filtered_map = mem_map.filter_by(directions=['load'], variables=['a','g']) tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params) @@ -136,30 +143,92 @@ class ToCountMap: """ - new_map = ToCountMap() + result_map = ToCountMap() from loopy.types import to_loopy_type if 'dtype' in kwargs.keys(): kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] # for each item in self.dict - for self_key, self_val in self.dict.items(): + for self_key, self_val in self.items(): try: # check to see if key attribute values match all filters for arg_field, allowable_vals in kwargs.items(): attr_val = getattr(self_key, arg_field) # see if the value is in the filter list if attr_val not in allowable_vals: - print("DEBUG: 
"+str(attr_val)+" not in ", allowable_vals, ", removing.") break else: # loop terminated without break or error - new_map.dict[self_key] = self_val + result_map.dict[self_key] = self_val except(AttributeError): # the field passed is not a field of this key - print("DEBUG: "+arg_field+" not in ", self_key, ", removing.") continue - return new_map + return result_map + + def group_by(self, *args): + """Group map items together, distinguishing by only the key fields passed in args + + :parameter args: Zero or more :class:`string` fields of map keys + + :return: A :class:`ToCountMap` containing the same total counts + grouped together by new keys that only contain the fields + specified in the arguments passed. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_poly(knl) + grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction') + + all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + all_f32_global_stores = grouped_mem_map[MemAccess(mtype='global', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + all_f32_local_loads = grouped_mem_map[MemAccess(mtype='local', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + all_f32_local_stores = grouped_mem_map[MemAccess(mtype='local', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + # make sure all item keys have same type + if self.dict: + key_type = type(list(self.keys())[0]) + if not all(isinstance(x, key_type) for x in self.keys()): + raise ValueError("ToCountMap: group_by() function may only " + "be used on ToCountMaps with uniform keys") + else: + return result_map + + # for each item in self.dict + for self_key, self_val in self.items(): + new_key = key_type() + + # set all specified 
fields + for field in args: + setattr(new_key, field, getattr(self_key, field)) + + if new_key in result_map.keys(): + result_map[new_key] += self_val + else: + result_map[new_key] = self_val + + return result_map # }}} @@ -241,7 +310,6 @@ class MemAccess: """ - #TODO currently counting all lmem access as stride None def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype self.stride = stride @@ -253,6 +321,11 @@ class MemAccess: from loopy.types import to_loopy_type self.dtype = to_loopy_type(dtype) + #TODO currently counting all lmem access as stride None + if (mtype == 'local') and (stride is not None): + raise NotImplementedError("MemAccess: stride must be None when " + "mtype is 'local'") + def __eq__(self, other): return isinstance(other, MemAccess) and ( (self.mtype is None or other.mtype is None or @@ -989,7 +1062,7 @@ def get_lmem_access_poly(knl): warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and " "filter the result with the mtype=['local'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter(mtypes=['local']) + return get_mem_access_poly(knl).filter_by(mtypes=['local']) def get_DRAM_access_poly(knl): @@ -999,7 +1072,7 @@ def get_DRAM_access_poly(knl): warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter(mtypes=['global']) + return get_mem_access_poly(knl).filter_by(mtypes=['global']) # {{{ get_gmem_access_poly @@ -1010,7 +1083,7 @@ def get_gmem_access_poly(knl): warn("get_DRAM_access_poly is deprecated. 
Use get_mem_access_poly and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter(mtypes=['global']) + return get_mem_access_poly(knl).filter_by(mtypes=['global']) # }}} @@ -1198,143 +1271,6 @@ def sum_mem_access_to_bytes(m): # }}} -# {{{ reduce_mem_access_poly_fields - -def reduce_mem_access_poly_fields(m, mtype=True, dtype=True, stride=True, - direction=True, variable=True): - """Take map returned from :func:`get_mem_access_poly`, remove specified MemAccess fields from keys, and combine counts - - :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. - - :parameter mtype: A :class:`boolean` specifying whether keys in returned - map will include the memory type. - - :parameter dtype: A :class:`boolean` specifying whether keys in returned - map will include the data type. - - :parameter stride: A :class:`boolean` specifying whether keys in returned - map will include the stride. - - :parameter direction: A :class:`boolean` specifying whether keys in - returned map will include the direction. - - :parameter variable: A :class:`boolean` specifying whether keys in returned - map will include the variable name. - - - :return: A mapping of **{(** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}** - - - The :class:`islpy.PwQPolynomial` holds the counts (in terms of - the :class:`loopy.LoopKernel` *inames*) for memory accesses - categorized by the fields not set to False. 
- - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - mem_map = get_mem_access_poly(knl) - reduced_mem_map = reduce_mem_access_poly_fields(mem_map, stride=False, - variable=False) - - all_f32_global_loads = reduced_mem_map[MemAccess('global', np.float32, - direction='load') - ].eval_with_dict(params) - all_f32_global_stores = reduced_mem_map[MemAccess('global', np.float32, - direction='store') - ].eval_with_dict(params) - all_f32_local_loads = reduced_mem_map[MemAccess('local', np.float32, - direction='load') - ].eval_with_dict(params) - all_f32_local_stores = reduced_mem_map[MemAccess('local', np.float32, - direction='store') - ].eval_with_dict(params) - - # (now use these counts to predict performance) - - """ - - result = {} - for k, v in m.items(): - new_key = MemAccess() - if mtype == True: - new_key.mtype = k.mtype - if dtype == True: - new_key.dtype = k.dtype - if stride == True: - new_key.stride = k.stride - if direction == True: - new_key.direction = k.direction - if variable == True: - new_key.variable = k.variable - - if new_key in result: - result[new_key] += m[k] - else: - result[new_key] = m[k] - - return result - -# }}} - -# {{{ reduce_op_poly_fields - -def reduce_op_poly_fields(m, dtype=True, name=True): - """Take map returned from :func:`get_op_poly`, remove specified Op fields from keys, and combine counts - - :parameter m: A mapping of **{** :class:`loopy.Op` **:** - :class:`islpy.PwQPolynomial` **}**. - - :parameter dtype: A :class:`boolean` specifying whether keys in returned - map will include the data type. - - :parameter name: A :class:`boolean` specifying whether keys in returned - map will include the name of the operation. 
- - :return: A mapping of **{(** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}** - - - The :class:`islpy.PwQPolynomial` holds the counts (in terms of - the :class:`loopy.LoopKernel` *inames*) for arithmetic ops - categorized by the fields not set to False. - - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - op_map = get_op_poly(knl) - reduced_op_map = reduce_op_fields(op_map, name=False) - - all_f32_ops = reduced_op_map[Op(dtype=np.float32)].eval_with_dict(params) - all_f64_ops = reduced_op_map[Op(dtype=np.float64)].eval_with_dict(params) - - reduced_op_map = reduce_op_fields(op_map, dtype=False) - - all_add_ops = reduced_op_map[Op(name='add')].eval_with_dict(params) - all_mul_ops = reduced_op_map[Op(name='mul')].eval_with_dict(params) - - # (now use these counts to predict performance) - - """ - - result = {} - for k, v in m.items(): - new_key = Op() - if dtype == True: - new_key.dtype = k.dtype - if name == True: - new_key.name = k.name - - if new_key in result: - result[new_key] += m[k] - else: - result[new_key] = m[k] - - return result - -# }}} - def sum_polys(m): total = isl.PwQPolynomial('{ 0 }') for k, v in m.items(): diff --git a/test/test_statistics.py b/test/test_statistics.py index eb51fbfd0..3657721e4 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -311,8 +311,7 @@ def test_gmem_access_counter_logic(): l = 128 params = {'n': n, 'm': m, 'l': l} - reduced_map = lp.reduce_mem_access_poly_fields(poly, stride=False, - variable=False) + reduced_map = poly.group_by('mtype', 'dtype', 'direction') f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), direction='load') @@ -371,7 +370,7 @@ def test_gmem_access_counter_specialops(): assert f32 == n*m*l assert f64 == n*m - filtered_map = poly.filter(direction=['load'], variable=['a','g']) + filtered_map = poly.filter_by(direction=['load'], variable=['a','g']) tot = 
lp.eval_and_sum_polys(filtered_map, params) assert tot == n*m*l + n*m @@ -747,12 +746,12 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_poly(knl) loads_a = lp.eval_and_sum_polys( - mem_map.filter(direction=['load'], variable=['a']), + mem_map.filter_by(direction=['load'], variable=['a']), params) assert loads_a == 2*n*m*l global_stores = lp.eval_and_sum_polys( - mem_map.filter(mtype=['global'], direction=['store']), + mem_map.filter_by(mtype=['global'], direction=['store']), params) assert global_stores == n*m*l + n*m @@ -763,8 +762,7 @@ def test_summations_and_filters(): assert s0store == 4*n*m*l + 8*n*m # ignore stride and variable names in this map - reduced_map = lp.reduce_mem_access_poly_fields(mem_map, stride=False, - variable=False) + reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load') ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') @@ -782,18 +780,18 @@ def test_summations_and_filters(): assert f64 == n*m assert i32 == n*m*2 - addsub_all = lp.eval_and_sum_polys(op_map.filter(name=['add', 'sub']), + addsub_all = lp.eval_and_sum_polys(op_map.filter_by(name=['add', 'sub']), params) - f32ops_all = lp.eval_and_sum_polys(op_map.filter(dtype=[np.float32]), + f32ops_all = lp.eval_and_sum_polys(op_map.filter_by(dtype=[np.float32]), params) assert addsub_all == n*m*l + n*m*2 assert f32ops_all == n*m*l*3 - non_field = lp.eval_and_sum_polys(op_map.filter(xxx=[np.float32]), params) + non_field = lp.eval_and_sum_polys(op_map.filter_by(xxx=[np.float32]), params) assert non_field == 0 - ops_nodtype = lp.reduce_op_poly_fields(op_map, dtype=False) - ops_noname = lp.reduce_op_poly_fields(op_map, name=False) + ops_nodtype = op_map.group_by('name') + ops_noname = op_map.group_by('dtype') mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params) f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params) 
assert mul_all == n*m*l + n*m -- GitLab From 8c7194c113c364343e9ec4de698b6ab1ca104196 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 1 Nov 2016 00:27:32 -0500 Subject: [PATCH 27/55] added to_bytes, sum, and eval_and_sum member functions to ToCountMap, removed previous/redundant functions --- loopy/__init__.py | 11 +- loopy/statistics.py | 314 ++++++++++++++++++++-------------------- test/test_statistics.py | 55 ++++--- 3 files changed, 187 insertions(+), 193 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 8b10edf19..f505759ac 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -113,11 +113,9 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, - get_op_poly, sum_ops_to_dtypes, get_lmem_access_poly, - get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_poly, - sum_mem_access_to_bytes, sum_polys, eval_and_sum_polys, - get_synchronization_poly, gather_access_footprints, - gather_access_footprint_bytes) + get_op_poly, get_lmem_access_poly, get_DRAM_access_poly, + get_gmem_access_poly, get_mem_access_poly, get_synchronization_poly, + gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -221,9 +219,8 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", - "sum_ops_to_dtypes", "get_lmem_access_poly", "get_DRAM_access_poly", + "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_poly", - "sum_mem_access_to_bytes", "sum_polys", "eval_and_sum_polys", "get_synchronization_poly", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index d251e249b..6efb61da1 100755 --- 
a/loopy/statistics.py +++ b/loopy/statistics.py @@ -43,6 +43,9 @@ __doc__ = """ .. autofunction:: filter_by .. autofunction:: group_by +.. autofunction:: to_bytes +.. autofunciton:: sum +.. autofunction:: eval_and_sum .. autofunction:: get_op_poly @@ -51,11 +54,6 @@ __doc__ = """ .. autofunction:: get_gmem_access_poly .. autofunction:: get_mem_access_poly -.. autofunction:: sum_mem_access_to_bytes -.. autofunction:: reduce_mem_access_poly_fields - -.. autofunction:: reduce_op_poly_fields - .. autofunction:: get_synchronization_poly .. autofunction:: gather_access_footprints @@ -112,12 +110,18 @@ class ToCountMap: def __repr__(self): return repr(self.dict) + def __len__(self): + return len(self.dict) + def items(self): return self.dict.items() def keys(self): return self.dict.keys() + def copy(self): + return ToCountMap(dict(self.dict)) + def filter_by(self, **kwargs): """Remove items without specified key fields @@ -135,9 +139,9 @@ class ToCountMap: params = {'n': 512, 'm': 256, 'l': 128} mem_map = lp.get_mem_access_poly(knl) - filtered_map = mem_map.filter_by(directions=['load'], - variables=['a','g']) - tot_loads_a_g = lp.eval_and_sum_polys(filtered_map, params) + filtered_map = mem_map.filter_by(direction=['load'], + variable=['a','g']) + tot_loads_a_g = filtered_map.eval_and_sum(params) # (now use these counts to predict performance) @@ -200,6 +204,13 @@ class ToCountMap: direction='store') ].eval_with_dict(params) + op_map = get_op_poly(knl) + ops_by_dtype = op_map.group_by('dtype') + + f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params) + f64ops = ops_by_dtype[Op(dtype=np.float64)].eval_with_dict(params) + i32ops = ops_by_dtype[Op(dtype=np.int32)].eval_with_dict(params) + # (now use these counts to predict performance) """ @@ -230,6 +241,83 @@ class ToCountMap: return result_map + def to_bytes(self): + """Convert counts to bytes using data type in map key + + :return: A :class:`ToCountMap` mapping each original key to a + 
:class:`islpy.PwQPolynomial` with counts in bytes rather than + instances. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + bytes_map = get_mem_access_poly(knl).to_bytes() + params = {'n': 512, 'm': 256, 'l': 128} + + s1_global_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['load']).eval_and_sum(params) + s2_global_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['load']).eval_and_sum(params) + s1_global_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['store']).eval_and_sum(params) + s2_global_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['store']).eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result = self.copy() + + for key, val in self.items(): + bytes_processed = int(key.dtype.itemsize) * val + result[key] = bytes_processed + + return result + + + def sum(self): + """Add all counts in ToCountMap + + :return: A :class:`islpy.PwQPolynomial` containing the sum of counts + + """ + total = isl.PwQPolynomial('{ 0 }') + for k, v in self.items(): + if not isinstance(v, isl.PwQPolynomial): + raise ValueError("ToCountMap: sum() encountered type {0} but " + "may only be used on PwQPolynomials." 
+ .format(type(v))) + total += v + return total + + + def eval_and_sum(self, params): + """Add all counts in ToCountMap and evaluate with provided parameters + + :return: An :class:`integer` containing the sum of all counts in the + :class:`ToCountMap` evaluated with the parameters provided + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_poly(knl) + filtered_map = mem_map.filter_by(direction=['load'], + variable=['a','g']) + tot_loads_a_g = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + return self.sum().eval_with_dict(params) + # }}} @@ -358,7 +446,6 @@ class MemAccess: return hash(mtype+str(dtype)+str(stride)+direction+variable) - # {{{ ExpressionOpCounter class ExpressionOpCounter(CombineMapper): @@ -385,7 +472,8 @@ class ExpressionOpCounter(CombineMapper): def map_call(self, expr): return ToCountMap( - {Op(self.type_inf(expr), 'func:'+str(expr.function)): 1} + {Op(dtype=self.type_inf(expr), + name='func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) # def map_call_with_kwargs(self, expr): # implemented in CombineMapper @@ -398,20 +486,21 @@ class ExpressionOpCounter(CombineMapper): def map_sum(self, expr): assert expr.children return ToCountMap( - {Op(self.type_inf(expr), 'add'): len(expr.children)-1} + {Op(dtype=self.type_inf(expr), + name='add'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(self.type_inf(expr), 'mul'): 1}) + return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(self.type_inf(expr), 'mul'): -1}) + ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1}) def map_quotient(self, expr, *args): - return 
ToCountMap({Op(self.type_inf(expr), 'div'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -419,25 +508,25 @@ class ExpressionOpCounter(CombineMapper): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(self.type_inf(expr), 'pow'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(self.type_inf(expr), 'shift'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(self.type_inf(expr), 'bw'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap( - {Op(self.type_inf(expr), 'bw'): len(expr.children)-1} - ) + sum(self.rec(child) for child in expr.children) + return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): + len(expr.children)-1} + ) + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or map_bitwise_and = map_bitwise_or @@ -466,8 +555,8 @@ class ExpressionOpCounter(CombineMapper): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op( - self.type_inf(expr), 'maxmin'): len(expr.children)-1} + return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): + len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) map_max = map_min @@ -524,7 +613,8 @@ class LocalSubscriptCounter(CombineMapper): #print("is local? 
", array.is_local) if array.is_local: return ToCountMap( - {MemAccess('local', self.type_inf(expr)): 1} + {MemAccess(mtype='local', + dtype=self.type_inf(expr)): 1} ) + self.rec(expr.index) return self.rec(expr.index) @@ -661,8 +751,9 @@ class GlobalSubscriptCounter(CombineMapper): if not local_id_found: # count as uniform access - return ToCountMap({MemAccess('global', self.type_inf(expr), - stride=0, variable=name): 1} + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -706,12 +797,13 @@ class GlobalSubscriptCounter(CombineMapper): #TODO temporary fix that needs changing: if min_tag_axis != 0: print("... min tag axis (%d) is not zero! ..." % (min_tag_axis)) - return ToCountMap({MemAccess('global', self.type_inf(expr), - stride=sys.maxsize, variable=name): 1} + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name): 1} ) + self.rec(expr.index) - return ToCountMap({MemAccess('global', self.type_inf(expr), - stride=total_stride, variable=name): 1} + return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), + stride=total_stride, variable=name): 1} ) + self.rec(expr.index) def map_sum(self, expr): @@ -1004,55 +1096,16 @@ def get_op_poly(knl, numpy_types=True): insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) op_poly = op_poly + ops*count(knl, domain) - result = op_poly.dict if numpy_types: - result = dict( - (Op(op.dtype.numpy_dtype, op.name), count) - for op, count in six.iteritems(result)) - - return ToCountMap(result) -# }}} - - -def sum_ops_to_dtypes(op_poly_dict): - """Sum the mapping returned by :func:`get_op_poly` to a mapping that ignores arithmetic op type - - :parameter op_poly_dict: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. 
+ op_poly.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + count) + for op, count in six.iteritems(op_poly.dict)) - :return: A mapping of **{** :class:`loopy.LoopyType` **:** :class:`islpy.PwQPolynomial` **}** + return op_poly - - The :class:`loopy.LoopyType` specifies the data type operated on - - - The :class:`islpy.PwQPolynomial` holds the number of arithmetic - operations on the data type specified (in terms of the - :class:`loopy.LoopKernel` *inames*). - - Example usage:: - - # (first create loopy kernel and specify array data types) - - op_map = get_op_poly(knl) - op_map_by_dtype = sum_ops_to_dtypes(op_map) - params = {'n': 512, 'm': 256, 'l': 128} - - f32ops = op_map_by_dtype[to_loopy_type(np.float32)].eval_with_dict(params) - f64ops = op_map_by_dtype[to_loopy_type(np.float64)].eval_with_dict(params) - i32ops = op_map_by_dtype[to_loopy_type(np.int32)].eval_with_dict(params) - - # (now use these counts to predict performance) - - """ - - result = {} - for op, v in op_poly_dict.items(): - new_key = op.dtype - if new_key in result: - result[new_key] += v - else: - result[new_key] = v +# }}} - return result #TODO test depricated functions? def get_lmem_access_poly(knl): @@ -1062,7 +1115,7 @@ def get_lmem_access_poly(knl): warn("get_lmem_access_poly is deprecated. Use get_mem_access_poly and " "filter the result with the mtype=['local'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtypes=['local']) + return get_mem_access_poly(knl).filter_by(mtype=['local']) def get_DRAM_access_poly(knl): @@ -1072,7 +1125,8 @@ def get_DRAM_access_poly(knl): warn("get_DRAM_access_poly is deprecated. 
Use get_mem_access_poly and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtypes=['global']) + return get_mem_access_poly(knl).filter_by(mtype=['global']) + # {{{ get_gmem_access_poly @@ -1083,10 +1137,11 @@ def get_gmem_access_poly(knl): warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtypes=['global']) + return get_mem_access_poly(knl).filter_by(mtype=['global']) # }}} + def get_mem_access_poly(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. @@ -1114,22 +1169,26 @@ def get_mem_access_poly(knl, numpy_types=True): params = {'n': 512, 'm': 256, 'l': 128} mem_access_map = get_mem_access_poly(knl) - f32_stride1_g_loads_a = mem_access_map[MemAccess('global', np.float32, + f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global', + dtype=np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) - f32_stride1_g_stores_a = mem_access_map[MemAccess('global', np.float32, + f32_stride1_g_stores_a = mem_access_map[MemAccess(mtype='global', + dtype=np.float32, stride=1, direction='store', variable='a') ].eval_with_dict(params) - f32_stride1_l_loads_x = mem_access_map[MemAccess('local', np.float32, + f32_stride1_l_loads_x = mem_access_map[MemAccess(mtype='local', + dtype=np.float32, stride=1, direction='load', variable='x') ].eval_with_dict(params) - f32_stride1_l_stores_x = mem_access_map[MemAccess('local', np.float32, + f32_stride1_l_stores_x = mem_access_map[MemAccess(mtype='local', + dtype=np.float32, stride=1, direction='store', variable='x') @@ -1171,14 +1230,16 @@ def get_mem_access_poly(knl, numpy_types=True): # distinguish loads and stores for key in subs_expr.dict: - subs_expr.dict[MemAccess(key.mtype, key.dtype, stride=key.stride, - direction='load', 
variable=key.variable) + subs_expr.dict[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, direction='load', + variable=key.variable) ] = subs_expr.dict.pop(key) subs_assignee_g = subs_counter_g(insn.assignee) for key in subs_assignee_g.dict: - subs_assignee_g.dict[MemAccess(key.mtype, key.dtype, - stride=key.stride, direction='store', + subs_assignee_g.dict[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, + direction='store', variable=key.variable) ] = subs_assignee_g.dict.pop(key) # for now, don't count writes to local mem @@ -1204,82 +1265,18 @@ def get_mem_access_poly(knl, numpy_types=True): subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) # for now, don't count writes to local mem - result = subs_poly.dict + #result = subs_poly.dict if numpy_types: - result = dict((MemAccess(mem_access.mtype, mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable) - , count) - for mem_access, count in six.iteritems(result)) - - return ToCountMap(result) - -# {{{ sum_mem_access_to_bytes - -def sum_mem_access_to_bytes(m): - """Convert counts returned by :func:`get_mem_access_poly` to bytes and sum across data types and variables - - :parameter m: A mapping of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
- - :return: A mapping of **{(** :class:`string`**,** :class:`int` **,** :class:`string` **)** - **:** :class:`islpy.PwQPolynomial` **}** - - - The first string in the key specifies the memory type as *global* or *local* + subs_poly.dict = dict((MemAccess(mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable) + , count) + for mem_access, count in six.iteritems(subs_poly.dict)) - - The integer in the key specifies the *stride* - - - The second string in the key specifies the direction as *load* or *store* - - - The :class:`islpy.PwQPolynomial` holds the aggregate transfer - size in bytes for memory accesses of all data types with the - characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). - - Example usage:: - - # (first create loopy kernel and specify array data types) - - mem_access_map = get_mem_access_poly(knl) - byte_totals_map = sum_mem_access_to_bytes(mem_access_map) - params = {'n': 512, 'm': 256, 'l': 128} - - stride1_global_bytes_loaded = byte_totals_map[('global', 1, 'load') - ].eval_with_dict(params) - stride2_global_bytes_loaded = byte_totals_map[('global', 2, 'load') - ].eval_with_dict(params) - stride1_global_bytes_stored = byte_totals_map[('global', 1, 'store') - ].eval_with_dict(params) - stride2_global_bytes_stored = byte_totals_map[('global', 2, 'store') - ].eval_with_dict(params) - - # (now use thess counts to predict performance) - - """ - - result = {} - for mem_access, v in m.items(): - new_key = (mem_access.mtype, mem_access.stride, mem_access.direction) - bytes_transferred = int(mem_access.dtype.itemsize) * v - if new_key in result: - result[new_key] += bytes_transferred - else: - result[new_key] = bytes_transferred - - return result - -# }}} - -def sum_polys(m): - total = isl.PwQPolynomial('{ 0 }') - for k, v in m.items(): - total += v - return total - - -def eval_and_sum_polys(m, params): - return 
sum_polys(m).eval_with_dict(params) + return subs_poly # {{{ get_synchronization_poly @@ -1356,7 +1353,8 @@ def get_synchronization_poly(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - return result.dict + #return result.dict #TODO is this okay? + return result # }}} diff --git a/test/test_statistics.py b/test/test_statistics.py index 3657721e4..7332d4ca5 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -82,8 +82,8 @@ def test_op_counter_reduction(): f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l - poly_dtype = lp.sum_ops_to_dtypes(poly) - f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) + poly_dtype = poly.group_by('dtype') + f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) assert f32 == f32add + f32mul @@ -286,11 +286,12 @@ def test_gmem_access_counter_reduction(): ].eval_with_dict(params) assert f32s == n*l - poly_b = lp.sum_mem_access_to_bytes(poly) - s0load = poly_b[('global', 0, 'load')].eval_with_dict(params) - s0store = poly_b[('global', 0, 'store')].eval_with_dict(params) - assert s0load == 4*f32l - assert s0store == 4*f32s + ld_bytes = poly.filter_by(mtype=['global'], direction=['load'] + ).to_bytes().eval_and_sum(params) + st_bytes = poly.filter_by(mtype=['global'], direction=['store'] + ).to_bytes().eval_and_sum(params) + assert ld_bytes == 4*f32l + assert st_bytes == 4*f32s def test_gmem_access_counter_logic(): @@ -371,7 +372,8 @@ def test_gmem_access_counter_specialops(): assert f64 == n*m filtered_map = poly.filter_by(direction=['load'], variable=['a','g']) - tot = lp.eval_and_sum_polys(filtered_map, params) + #tot = lp.eval_and_sum_polys(filtered_map, params) + tot = filtered_map.eval_and_sum(params) assert tot == n*m*l + n*m def test_gmem_access_counter_bitwise(): @@ -745,21 +747,18 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_poly(knl) - loads_a = lp.eval_and_sum_polys( - 
mem_map.filter_by(direction=['load'], variable=['a']), - params) + loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params) assert loads_a == 2*n*m*l - global_stores = lp.eval_and_sum_polys( - mem_map.filter_by(mtype=['global'], direction=['store']), - params) + global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params) assert global_stores == n*m*l + n*m - bytes_map = lp.sum_mem_access_to_bytes(mem_map) - s0load = bytes_map[('global', 0, 'load')].eval_with_dict(params) - s0store = bytes_map[('global', 0, 'store')].eval_with_dict(params) - assert s0load == 4*n*m*l*3 + 8*n*m*2 - assert s0store == 4*n*m*l + 8*n*m + ld_bytes = poly.filter_by(mtype=['global'], direction=['load'] + ).to_bytes().eval_and_sum(params) + st_bytes = poly.filter_by(mtype=['global'], direction=['store'] + ).to_bytes().eval_and_sum(params) + assert ld_bytes == 4*n*m*l*3 + 8*n*m*2 + assert st_bytes == 4*n*m*l + 8*n*m # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -771,23 +770,23 @@ def test_summations_and_filters(): assert f64lall == 2*n*m op_map = lp.get_op_poly(knl) + #for k, v in op_map.items(): + # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) - poly_dtype = lp.sum_ops_to_dtypes(op_map) - f32 = poly_dtype[to_loopy_type(np.float32)].eval_with_dict(params) - f64 = poly_dtype[to_loopy_type(np.float64)].eval_with_dict(params) - i32 = poly_dtype[to_loopy_type(np.int32)].eval_with_dict(params) + poly_dtype = op_map.group_by('dtype') + f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) + f64 = poly_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) + i32 = poly_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) assert f32 == n*m*l*3 assert f64 == n*m assert i32 == n*m*2 - addsub_all = lp.eval_and_sum_polys(op_map.filter_by(name=['add', 'sub']), - params) - f32ops_all = lp.eval_and_sum_polys(op_map.filter_by(dtype=[np.float32]), - params) 
+ addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params) + f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params) assert addsub_all == n*m*l + n*m*2 assert f32ops_all == n*m*l*3 - non_field = lp.eval_and_sum_polys(op_map.filter_by(xxx=[np.float32]), params) + non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params) assert non_field == 0 ops_nodtype = op_map.group_by('name') -- GitLab From 79017fe6aed2cf34ccd513bdc6e33893aeb456cd Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 1 Nov 2016 00:45:31 -0500 Subject: [PATCH 28/55] renamed variables in stats test --- loopy/statistics.py | 7 +- test/test_statistics.py | 225 ++++++++++++++++++++-------------------- 2 files changed, 117 insertions(+), 115 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6efb61da1..5f5408770 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -409,11 +409,16 @@ class MemAccess: from loopy.types import to_loopy_type self.dtype = to_loopy_type(dtype) - #TODO currently counting all lmem access as stride None + #TODO currently giving all lmem access stride=None if (mtype == 'local') and (stride is not None): raise NotImplementedError("MemAccess: stride must be None when " "mtype is 'local'") + #TODO currently giving all lmem access variable=None + if (mtype == 'local') and (variable is not None): + raise NotImplementedError("MemAccess: variable must be None when " + "mtype is 'local'") + def __eq__(self, other): return isinstance(other, MemAccess) and ( (self.mtype is None or other.mtype is None or diff --git a/test/test_statistics.py b/test/test_statistics.py index 7332d4ca5..685406fee 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -48,16 +48,16 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} 
- f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params) - f64mul = poly[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*l assert f64mul == n*m assert i32add == n*m*2 @@ -73,17 +73,17 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = poly[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) assert f32add == f32mul == n*m*l - poly_dtype = poly.group_by('dtype') - f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) + op_map_dtype = op_map.group_by('dtype') + f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) assert f32 == f32add + f32mul @@ -99,15 +99,15 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f64add = 
poly[lp.Op(np.float64, 'add')].eval_with_dict(params) - f64div = poly[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? assert f64add == n*m @@ -129,19 +129,19 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32mul = poly[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = poly[lp.Op(np.float32, 'div')].eval_with_dict(params) - f32add = poly[lp.Op(np.float32, 'add')].eval_with_dict(params) - f64pow = poly[lp.Op(np.float64, 'pow')].eval_with_dict(params) - f64add = poly[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = poly[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsq = poly[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = poly[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) assert 
f32div == 2*n*m*l assert f32mul == f32add == n*m*l assert f64add == 3*n*m @@ -165,17 +165,17 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - poly = lp.get_op_poly(knl) + op_map = lp.get_op_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32add = poly[lp.Op(np.int32, 'add')].eval_with_dict(params) - i32bw = poly[lp.Op(np.int32, 'bw')].eval_with_dict(params) - i64bw = poly[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = poly[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = poly[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = poly[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) + i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) assert i32add == n*m+n*m*l assert i32bw == 2*n*m*l assert i64bw == 2*n*m @@ -204,9 +204,9 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - poly = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) - flops = poly.eval_with_dict(value_dict) + flops = op_map.eval_with_dict(value_dict) if expect_fallback: assert flops == 144 @@ -228,30 +228,30 @@ def test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32l = poly[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', 
variable='a') ].eval_with_dict(params) - f32l += poly[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64l = poly[lp.MemAccess('global', np.float64, + f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64l += poly[lp.MemAccess('global', np.float64, + f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32l == 3*n*m*l assert f64l == 2*n*m - f32s = poly[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - f64s = poly[lp.MemAccess('global', np.dtype(np.float64), + f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f32s == n*m*l @@ -268,27 +268,27 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32l = poly[lp.MemAccess('global', np.float32, + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32l += poly[lp.MemAccess('global', np.float32, + f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) assert f32l == 2*n*m*l - f32s = poly[lp.MemAccess('global', np.dtype(np.float32), + f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) assert f32s == n*l - ld_bytes = poly.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=['global'], 
direction=['load'] ).to_bytes().eval_and_sum(params) - st_bytes = poly.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*f32l assert st_bytes == 4*f32s @@ -306,13 +306,13 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - reduced_map = poly.group_by('mtype', 'dtype', 'direction') + reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), direction='load') @@ -342,36 +342,36 @@ def test_gmem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f32 = poly[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - f32 += poly[lp.MemAccess('global', np.float32, + f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - f64 = poly[lp.MemAccess('global', np.dtype(np.float64), + f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64 += poly[lp.MemAccess('global', np.dtype(np.float64), + f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h') ].eval_with_dict(params) assert f32 == 2*n*m*l assert f64 == 2*n*m - f32 = poly[lp.MemAccess('global', np.float32, + f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c') 
].eval_with_dict(params) - f64 = poly[lp.MemAccess('global', np.float64, + f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert f32 == n*m*l assert f64 == n*m - filtered_map = poly.filter_by(direction=['load'], variable=['a','g']) + filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g']) #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) assert tot == n*m*l + n*m @@ -393,29 +393,29 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - i32 = poly[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) - i32 += poly[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - i32 += poly[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g') ].eval_with_dict(params) - i32 += poly[lp.MemAccess('global', np.dtype(np.int32), + i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) assert i32 == 4*n*m+2*n*m*l - i32 = poly[lp.MemAccess('global', np.int32, + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') ].eval_with_dict(params) - i32 += poly[lp.MemAccess('global', np.int32, + i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) assert i32 == n*m+n*m*l @@ -439,25 +439,25 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - poly = lp.get_mem_access_poly(knl) # 
noqa + mem_map = lp.get_mem_access_poly(knl) # noqa n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64uniform = poly[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g') ].eval_with_dict(params) - f64uniform += poly[lp.MemAccess('global', np.float64, + f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h') ].eval_with_dict(params) - f32uniform = poly[lp.MemAccess('global', np.float32, + f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x') ].eval_with_dict(params) - f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a') ].eval_with_dict(params) - f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) @@ -465,10 +465,10 @@ def test_gmem_access_counter_mixed(): assert f32uniform == n*m*l/threads assert f32nonconsec == 3*n*m*l - f64uniform = poly[lp.MemAccess('global', np.float64, + f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[lp.MemAccess('global', np.float32, + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) @@ -492,35 +492,35 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - poly = lp.get_mem_access_poly(knl) # noqa + mem_map = lp.get_mem_access_poly(knl) # noqa n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - f64nonconsec = poly[lp.MemAccess('global', np.float64, + f64nonconsec = 
mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g') ].eval_with_dict(params) - f64nonconsec += poly[lp.MemAccess('global', np.float64, + f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h') ].eval_with_dict(params) - f32nonconsec = poly[lp.MemAccess('global', np.dtype(np.float32), + f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', variable='a') ].eval_with_dict(params) - f32nonconsec += poly[lp.MemAccess('global', np.dtype(np.float32), + f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('l'), direction='load', variable='b') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l - f64nonconsec = poly[lp.MemAccess('global', np.float64, + f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e') ].eval_with_dict(params) - f32nonconsec = poly[lp.MemAccess('global', np.float32, + f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('l'), direction='store', variable='c') ].eval_with_dict(params) @@ -543,34 +543,34 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - poly = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - #for k in poly: - # print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", poly[k]) + #for k in mem_map: + # print(k.mtype, k.dtype, type(k.dtype), k.stride, k.direction, k.variable, " :\n", mem_map[k]) - f64consec = poly[lp.MemAccess('global', np.float64, + f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g') ].eval_with_dict(params) - f64consec += 
poly[lp.MemAccess('global', np.float64, + f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h') ].eval_with_dict(params) - f32consec = poly[lp.MemAccess('global', np.float32, + f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) - f32consec += poly[lp.MemAccess('global', np.dtype(np.float32), + f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') ].eval_with_dict(params) assert f64consec == 2*n*m assert f32consec == 3*n*m*l - f64consec = poly[lp.MemAccess('global', np.float64, + f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e') ].eval_with_dict(params) - f32consec = poly[lp.MemAccess('global', np.float32, + f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f64consec == n*m @@ -591,13 +591,13 @@ def test_barrier_counter_nobarriers(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - sync_poly = lp.get_synchronization_poly(knl) + sync_map = lp.get_synchronization_poly(knl) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - assert len(sync_poly) == 1 - assert sync_poly["kernel_launch"].eval_with_dict(params) == 1 + assert len(sync_map) == 1 + assert sync_map["kernel_launch"].eval_with_dict(params) == 1 def test_barrier_counter_barriers(): @@ -617,13 +617,13 @@ def test_barrier_counter_barriers(): ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") - poly = lp.get_synchronization_poly(knl) - print(poly) + map = lp.get_synchronization_poly(knl) + print(map) n = 512 m = 256 l = 128 params = {'n': n, 'm': m, 'l': l} - barrier_count = poly["barrier_local"].eval_with_dict(params) + barrier_count = map["barrier_local"].eval_with_dict(params) assert 
barrier_count == 50*10*2 @@ -647,10 +647,10 @@ def test_all_counters_parallel_matmul(): l = 128 params = {'n': n, 'm': m, 'l': l} - sync_poly = lp.get_synchronization_poly(knl) - assert len(sync_poly) == 2 - assert sync_poly["kernel_launch"].eval_with_dict(params) == 1 - assert sync_poly["barrier_local"].eval_with_dict(params) == 2*m/16 + sync_map = lp.get_synchronization_poly(knl) + assert len(sync_map) == 2 + assert sync_map["kernel_launch"].eval_with_dict(params) == 1 + assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16 op_map = lp.get_op_poly(knl) f32mul = op_map[ @@ -668,30 +668,28 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - subscript_map = lp.get_mem_access_poly(knl) + op_map = lp.get_mem_access_poly(knl) - f32coal = subscript_map[lp.MemAccess('global', np.float32, + f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += subscript_map[lp.MemAccess('global', np.float32, + f32coal += op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a') ].eval_with_dict(params) assert f32coal == n*m+m*l - f32coal = subscript_map[lp.MemAccess('global', np.float32, + f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) assert f32coal == n*l - local_subs_map = lp.get_mem_access_poly(knl) - - local_subs_l = local_subs_map[lp.MemAccess('local', np.dtype(np.float32), + local_mem_map = lp.get_mem_access_poly(knl).filter_by(mtype=['local']) + local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) - - assert local_subs_l == n*m*l*2 + assert local_mem_l == n*m*l*2 def test_gather_access_footprint(): knl = lp.make_kernel( @@ -739,7 +737,6 @@ def test_summations_and_filters(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - poly = lp.get_mem_access_poly(knl) n = 
512 m = 256 l = 128 @@ -753,9 +750,9 @@ def test_summations_and_filters(): global_stores = mem_map.filter_by(mtype=['global'], direction=['store']).eval_and_sum(params) assert global_stores == n*m*l + n*m - ld_bytes = poly.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) - st_bytes = poly.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] ).to_bytes().eval_and_sum(params) assert ld_bytes == 4*n*m*l*3 + 8*n*m*2 assert st_bytes == 4*n*m*l + 8*n*m @@ -773,10 +770,10 @@ def test_summations_and_filters(): #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) - poly_dtype = op_map.group_by('dtype') - f32 = poly_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) - f64 = poly_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) - i32 = poly_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) + op_map_dtype = op_map.group_by('dtype') + f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) + f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params) + i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params) assert f32 == n*m*l*3 assert f64 == n*m assert i32 == n*m*2 -- GitLab From bd7d74be15c7c8a85a4755f7ab71725c9fe8effb Mon Sep 17 00:00:00 2001 From: James Stevens Date: Tue, 1 Nov 2016 12:12:15 -0500 Subject: [PATCH 29/55] changed get_xxx_poly functions to get_xxx_map --- loopy/__init__.py | 13 ++--- loopy/statistics.py | 104 ++++++++++++++++++++++++---------------- test/test_statistics.py | 44 ++++++++--------- 3 files changed, 91 insertions(+), 70 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index f505759ac..a2e403b40 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -113,8 +113,9 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import 
generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, - get_op_poly, get_lmem_access_poly, get_DRAM_access_poly, - get_gmem_access_poly, get_mem_access_poly, get_synchronization_poly, + get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly, + get_gmem_access_poly, get_mem_access_map, + get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -219,10 +220,10 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", - "get_lmem_access_poly", "get_DRAM_access_poly", - "get_gmem_access_poly", "get_mem_access_poly", - "get_synchronization_poly", "gather_access_footprints", - "gather_access_footprint_bytes", + "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly", + "get_gmem_access_poly", "get_mem_access_map", + "get_synchronization_poly", "get_synchronization_map", + "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 5f5408770..b664e1f90 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -48,13 +48,15 @@ __doc__ = """ .. autofunction:: eval_and_sum .. autofunction:: get_op_poly +.. autofunction:: get_op_map .. autofunction:: get_lmem_access_poly .. autofunction:: get_DRAM_access_poly .. autofunction:: get_gmem_access_poly -.. autofunction:: get_mem_access_poly +.. autofunction:: get_mem_access_map .. autofunction:: get_synchronization_poly +.. autofunction:: get_synchronization_map .. autofunction:: gather_access_footprints .. 
autofunction:: gather_access_footprint_bytes @@ -138,7 +140,7 @@ class ToCountMap: # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) @@ -184,7 +186,7 @@ class ToCountMap: # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - mem_map = get_mem_access_poly(knl) + mem_map = get_mem_access_map(knl) grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction') all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global', @@ -204,7 +206,7 @@ class ToCountMap: direction='store') ].eval_with_dict(params) - op_map = get_op_poly(knl) + op_map = get_op_map(knl) ops_by_dtype = op_map.group_by('dtype') f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params) @@ -252,7 +254,7 @@ class ToCountMap: # (first create loopy kernel and specify array data types) - bytes_map = get_mem_access_poly(knl).to_bytes() + bytes_map = get_mem_access_map(knl).to_bytes() params = {'n': 512, 'm': 256, 'l': 128} s1_global_ld_byt = bytes_map.filter_by( @@ -308,7 +310,7 @@ class ToCountMap: # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) filtered_map = mem_map.filter_by(direction=['load'], variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) @@ -1056,6 +1058,18 @@ def count(kernel, set): def get_op_poly(knl, numpy_types=True): + """Count the number of operations in a loopy kernel. + """ + from warnings import warn + warn("get_op_poly is deprecated. 
Use get_op_map instead.", + DeprecationWarning, stacklevel=2) + return get_op_map(knl, numpy_types) + +# }}} + + +def get_op_map(knl, numpy_types=True): + """Count the number of operations in a loopy kernel. :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. @@ -1077,10 +1091,10 @@ def get_op_poly(knl, numpy_types=True): # (first create loopy kernel and specify array data types) - poly = get_op_poly(knl) + map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = poly[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = poly[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = map[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) + f32mul = map[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) # (now use these counts to predict performance) @@ -1090,7 +1104,7 @@ def get_op_poly(knl, numpy_types=True): knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - op_poly = ToCountMap() + op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: # how many times is this instruction executed? @@ -1100,16 +1114,14 @@ def get_op_poly(knl, numpy_types=True): domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_poly = op_poly + ops*count(knl, domain) + op_map = op_map + ops*count(knl, domain) if numpy_types: - op_poly.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + op_map.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), count) - for op, count in six.iteritems(op_poly.dict)) + for op, count in six.iteritems(op_map.dict)) - return op_poly - -# }}} + return op_map #TODO test depricated functions? @@ -1117,20 +1129,20 @@ def get_lmem_access_poly(knl): """Count the number of local memory accesses in a loopy kernel. """ from warnings import warn - warn("get_lmem_access_poly is deprecated. 
Use get_mem_access_poly and " + warn("get_lmem_access_poly is deprecated. Use get_mem_access_map and " "filter the result with the mtype=['local'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtype=['local']) + return get_mem_access_map(knl).filter_by(mtype=['local']) def get_DRAM_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. """ from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and " + warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtype=['global']) + return get_mem_access_map(knl).filter_by(mtype=['global']) # {{{ get_gmem_access_poly @@ -1139,15 +1151,15 @@ def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. """ from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_mem_access_poly and " + warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " "filter the result with the mtype=['global'] option.", DeprecationWarning, stacklevel=2) - return get_mem_access_poly(knl).filter_by(mtype=['global']) + return get_mem_access_map(knl).filter_by(mtype=['global']) # }}} -def get_mem_access_poly(knl, numpy_types=True): +def get_mem_access_map(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. 
:parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be @@ -1172,7 +1184,7 @@ def get_mem_access_poly(knl, numpy_types=True): # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - mem_access_map = get_mem_access_poly(knl) + mem_access_map = get_mem_access_map(knl) f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global', dtype=np.float32, @@ -1224,7 +1236,7 @@ def get_mem_access_poly(knl, numpy_types=True): knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_poly = ToCountMap() + subs_map = ToCountMap() subs_counter_g = GlobalSubscriptCounter(knl) subs_counter_l = LocalSubscriptCounter(knl) @@ -1253,40 +1265,50 @@ def get_mem_access_poly(knl, numpy_types=True): # use count excluding local index tags for uniform accesses for key in subs_expr.dict: - poly = ToCountMap({key: subs_expr.dict[key]}) + map = ToCountMap({key: subs_expr.dict[key]}) if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly \ - + poly*get_insn_count(knl, insn_inames, True) + subs_map = subs_map \ + + map*get_insn_count(knl, insn_inames, True) else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + subs_map = subs_map + map*get_insn_count(knl, insn_inames) #currently not counting stride of local mem access for key in subs_assignee_g.dict: - poly = ToCountMap({key: subs_assignee_g.dict[key]}) + map = ToCountMap({key: subs_assignee_g.dict[key]}) if isinstance(key.stride, int) and key.stride == 0: - subs_poly = subs_poly \ - + poly*get_insn_count(knl, insn_inames, True) + subs_map = subs_map \ + + map*get_insn_count(knl, insn_inames, True) else: - subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames) + subs_map = subs_map + map*get_insn_count(knl, insn_inames) # for now, don't count writes to local mem - #result = subs_poly.dict - if numpy_types: - subs_poly.dict = dict((MemAccess(mtype=mem_access.mtype, + subs_map.dict 
= dict((MemAccess(mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable) , count) - for mem_access, count in six.iteritems(subs_poly.dict)) + for mem_access, count in six.iteritems(subs_map.dict)) - return subs_poly + return subs_map # {{{ get_synchronization_poly def get_synchronization_poly(knl): + """Count the number of synchronization events each thread encounters in a + loopy kernel. + """ + from warnings import warn + warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.", + DeprecationWarning, stacklevel=2) + return get_synchronization_map(knl) + +# }}} + + +def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. @@ -1304,9 +1326,9 @@ def get_synchronization_poly(knl): # (first create loopy kernel and specify array data types) - sync_poly = get_synchronization_poly(knl) + sync_map = get_synchronization_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - barrier_count = sync_poly['barrier_local'].eval_with_dict(params) + barrier_count = sync_map['barrier_local'].eval_with_dict(params) # (now use this count to predict performance) @@ -1361,8 +1383,6 @@ def get_synchronization_poly(knl): #return result.dict #TODO is this okay? 
return result -# }}} - # {{{ gather_access_footprints diff --git a/test/test_statistics.py b/test/test_statistics.py index 685406fee..3f03fa955 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -48,7 +48,7 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 @@ -73,7 +73,7 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 @@ -99,7 +99,7 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 @@ -129,7 +129,7 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 @@ -165,7 +165,7 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) n = 512 m = 256 l = 128 @@ -204,7 +204,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - op_map = lp.get_op_poly(knl)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -228,7 +228,7 @@ def test_gmem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -268,7 +268,7 @@ def test_gmem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = 
lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -306,7 +306,7 @@ def test_gmem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -342,7 +342,7 @@ def test_gmem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -393,7 +393,7 @@ def test_gmem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -439,7 +439,7 @@ def test_gmem_access_counter_mixed(): knl = lp.split_iname(knl, "j", threads) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_poly(knl) # noqa + mem_map = lp.get_mem_access_map(knl) # noqa n = 512 m = 256 l = 128 @@ -492,7 +492,7 @@ def test_gmem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_poly(knl) # noqa + mem_map = lp.get_mem_access_map(knl) # noqa n = 512 m = 256 l = 128 @@ -543,7 +543,7 @@ def test_gmem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) n = 512 m = 256 l = 128 @@ -591,7 +591,7 @@ def test_barrier_counter_nobarriers(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - sync_map = lp.get_synchronization_poly(knl) + sync_map = lp.get_synchronization_map(knl) n = 
512 m = 256 l = 128 @@ -617,7 +617,7 @@ def test_barrier_counter_barriers(): ) knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32)) knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0") - map = lp.get_synchronization_poly(knl) + map = lp.get_synchronization_map(knl) print(map) n = 512 m = 256 @@ -647,12 +647,12 @@ def test_all_counters_parallel_matmul(): l = 128 params = {'n': n, 'm': m, 'l': l} - sync_map = lp.get_synchronization_poly(knl) + sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16 - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) f32mul = op_map[ lp.Op(np.float32, 'mul') ].eval_with_dict(params) @@ -668,7 +668,7 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - op_map = lp.get_mem_access_poly(knl) + op_map = lp.get_mem_access_map(knl) f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') @@ -685,7 +685,7 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_mem_map = lp.get_mem_access_poly(knl).filter_by(mtype=['local']) + local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) @@ -742,7 +742,7 @@ def test_summations_and_filters(): l = 128 params = {'n': n, 'm': m, 'l': l} - mem_map = lp.get_mem_access_poly(knl) + mem_map = lp.get_mem_access_map(knl) loads_a = mem_map.filter_by(direction=['load'], variable=['a']).eval_and_sum(params) assert loads_a == 2*n*m*l @@ -766,7 +766,7 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m - op_map = lp.get_op_poly(knl) + op_map = lp.get_op_map(knl) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) -- GitLab From d5fa573fc2dd12858817088795aa3b3c853f947b 
Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 2 Nov 2016 14:16:49 -0500 Subject: [PATCH 30/55] doc fixes/improvements --- loopy/__init__.py | 12 +-- loopy/statistics.py | 238 ++++++++++++++++++++++++-------------------- 2 files changed, 138 insertions(+), 112 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a2e403b40..c59c7bf8a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -112,9 +112,9 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.preprocess import (preprocess_kernel, realize_reduction, infer_unknown_types) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (stringify_stats_mapping, Op, MemAccess, - get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly, - get_gmem_access_poly, get_mem_access_map, +from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op, + MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, + get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( @@ -219,9 +219,9 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "stringify_stats_mapping", "Op", "MemAccess", "get_op_poly", - "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly", - "get_gmem_access_poly", "get_mem_access_map", + "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess", + "get_op_poly", "get_op_map", "get_lmem_access_poly", + "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", "get_synchronization_poly", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/statistics.py b/loopy/statistics.py index b664e1f90..ee0d867e6 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -41,11 +41,9 @@ __doc__ = """ .. currentmodule:: loopy -.. autofunction:: filter_by -.. 
autofunction:: group_by -.. autofunction:: to_bytes -.. autofunciton:: sum -.. autofunction:: eval_and_sum +.. autoclass:: ToCountMap +.. autoclass:: Op +.. autoclass:: MemAccess .. autofunction:: get_op_poly .. autofunction:: get_op_map @@ -67,7 +65,15 @@ __doc__ = """ # {{{ ToCountMap class ToCountMap: - """Maps any type of key to an arithmetic type.""" + """Maps any type of key to an arithmetic type. + + .. automethod:: filter_by + .. automethod:: group_by + .. automethod:: to_bytes + .. automethod:: sum + .. automethod:: eval_and_sum + + """ def __init__(self, init_dict=None): if init_dict is None: @@ -125,15 +131,15 @@ class ToCountMap: return ToCountMap(dict(self.dict)) def filter_by(self, **kwargs): - """Remove items without specified key fields + """Remove items without specified key fields. - :parameter **kwargs: Keyword arguments matching fields in the keys of + :parameter \*\*kwargs: Keyword arguments matching fields in the keys of the :class:`ToCountMap`, each given a list of allowable values for that key field. :return: A :class:`ToCountMap` containing the subset of the items in - the oriinal :class:`ToCountMap` that match the field values - passed + the original :class:`ToCountMap` that match the field values + passed. Example usage:: @@ -173,9 +179,10 @@ class ToCountMap: return result_map def group_by(self, *args): - """Group map items together, distinguishing by only the key fields passed in args + """Group map items together, distinguishing by only the key fields + passed in args. - :parameter args: Zero or more :class:`string` fields of map keys + :parameter \*args: Zero or more :class:`str` fields of map keys. 
:return: A :class:`ToCountMap` containing the same total counts grouped together by new keys that only contain the fields @@ -187,31 +194,31 @@ class ToCountMap: params = {'n': 512, 'm': 256, 'l': 128} mem_map = get_mem_access_map(knl) - grouped_mem_map = mem_map.group_by('mtype', 'dtype', 'direction') - - all_f32_global_loads = grouped_mem_map[MemAccess(mtype='global', - dtype=np.float32, - direction='load') - ].eval_with_dict(params) - all_f32_global_stores = grouped_mem_map[MemAccess(mtype='global', - dtype=np.float32, - direction='store') - ].eval_with_dict(params) - all_f32_local_loads = grouped_mem_map[MemAccess(mtype='local', - dtype=np.float32, - direction='load') - ].eval_with_dict(params) - all_f32_local_stores = grouped_mem_map[MemAccess(mtype='local', - dtype=np.float32, - direction='store') - ].eval_with_dict(params) + grouped_map = mem_map.group_by('mtype', 'dtype', 'direction') + + f32_global_ld = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_global_st = grouped_map[MemAccess(mtype='global', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) + f32_local_ld = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='load') + ].eval_with_dict(params) + f32_local_st = grouped_map[MemAccess(mtype='local', + dtype=np.float32, + direction='store') + ].eval_with_dict(params) op_map = get_op_map(knl) - ops_by_dtype = op_map.group_by('dtype') + ops_dtype = op_map.group_by('dtype') - f32ops = ops_by_dtype[Op(dtype=np.float32)].eval_with_dict(params) - f64ops = ops_by_dtype[Op(dtype=np.float64)].eval_with_dict(params) - i32ops = ops_by_dtype[Op(dtype=np.int32)].eval_with_dict(params) + f32ops = ops_dtype[Op(dtype=np.float32)].eval_with_dict(params) + f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) + i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params) # (now use these counts to predict performance) @@ -244,7 +251,7 @@ class ToCountMap: return 
result_map def to_bytes(self): - """Convert counts to bytes using data type in map key + """Convert counts to bytes using data type in map key. :return: A :class:`ToCountMap` mapping each original key to a :class:`islpy.PwQPolynomial` with counts in bytes rather than @@ -257,18 +264,18 @@ class ToCountMap: bytes_map = get_mem_access_map(knl).to_bytes() params = {'n': 512, 'm': 256, 'l': 128} - s1_global_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], - direction=['load']).eval_and_sum(params) - s2_global_ld_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], - direction=['load']).eval_and_sum(params) - s1_global_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[1], - direction=['store']).eval_and_sum(params) - s2_global_st_byt = bytes_map.filter_by( - mtype=['global'], stride=[2], - direction=['store']).eval_and_sum(params) + s1_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['load']).eval_and_sum(params) + s2_g_ld_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['load']).eval_and_sum(params) + s1_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[1], + direction=['store']).eval_and_sum(params) + s2_g_st_byt = bytes_map.filter_by( + mtype=['global'], stride=[2], + direction=['store']).eval_and_sum(params) # (now use these counts to predict performance) @@ -284,9 +291,9 @@ class ToCountMap: def sum(self): - """Add all counts in ToCountMap + """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` containing the sum of counts + :return: A :class:`islpy.PwQPolynomial` containing the sum of counts. """ total = isl.PwQPolynomial('{ 0 }') @@ -300,10 +307,11 @@ class ToCountMap: def eval_and_sum(self, params): - """Add all counts in ToCountMap and evaluate with provided parameters + """Add all counts in :class:`ToCountMap` and evaluate with provided + parameter dict. 
- :return: An :class:`integer` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided + :return: An :class:`int` containing the sum of all counts in the + :class:`ToCountMap` evaluated with the parameters provided. Example usage:: @@ -331,7 +339,7 @@ def stringify_stats_mapping(m): class Op: - """An arithmetic operation + """An arithmetic operation. .. attribute:: dtype @@ -340,7 +348,7 @@ class Op: .. attribute:: name - A :class:`string` that specifies the kind of arithmetic operation as + A :class:`str` that specifies the kind of arithmetic operation as *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. """ @@ -371,11 +379,11 @@ class Op: class MemAccess: - """A memory access + """A memory access. .. attribute:: mtype - A :class:`string` that specifies the memory type accessed as **global** + A :class:`str` that specifies the memory type accessed as **global** or **local** .. attribute:: dtype @@ -385,17 +393,17 @@ class MemAccess: .. attribute:: stride - A :class:`int` specifies stride of the memory access. A stride of 0 + An :class:`int` that specifies stride of the memory access. A stride of 0 indicates a uniform access (i.e. all threads access the same item). .. attribute:: direction - A :class:`string` that specifies the direction of memory access as + A :class:`str` that specifies the direction of memory access as **load** or **store**. .. attribute:: variable - A :class:`string` that specifies the variable name of the data + A :class:`str` that specifies the variable name of the data accessed. """ @@ -1059,6 +1067,9 @@ def count(kernel, set): def get_op_poly(knl, numpy_types=True): """Count the number of operations in a loopy kernel. + + get_op_poly is deprecated. Use get_op_map instead. + """ from warnings import warn warn("get_op_poly is deprecated. 
Use get_op_map instead.", @@ -1074,27 +1085,27 @@ def get_op_map(knl, numpy_types=True): :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :parameter numpy_types: A :class:`boolean` specifying whether the types + :parameter numpy_types: A :class:`bool` specifying whether the types in the returned mapping should be numpy types - instead of :class:'loopy.LoopyType`. + instead of :class:`loopy.LoopyType`. - :return: A mapping of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. + :return: A mapping of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. - - The :class:`loopy.Op` specifies an arithmetic operation with - specific characteristics. + - The :class:`Op` specifies the characteristics of the arithmetic + operation. - The :class:`islpy.PwQPolynomial` holds the number of operations of the kind specified in the key (in terms of the - :class:`loopy.LoopKernel` *parameter inames*). + :class:`loopy.LoopKernel` parameter *inames*). Example usage:: # (first create loopy kernel and specify array data types) - map = get_op_map(knl) + op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = map[Op(np.dtype(np.float32), 'add')].eval_with_dict(params) - f32mul = map[Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params) # (now use these counts to predict performance) @@ -1124,9 +1135,13 @@ def get_op_map(knl, numpy_types=True): return op_map -#TODO test depricated functions? +#TODO test deprecated functions? def get_lmem_access_poly(knl): """Count the number of local memory accesses in a loopy kernel. + + get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['local'] option. + """ from warnings import warn warn("get_lmem_access_poly is deprecated. 
Use get_mem_access_map and " @@ -1137,6 +1152,10 @@ def get_lmem_access_poly(knl): def get_DRAM_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + """ from warnings import warn warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " @@ -1149,6 +1168,10 @@ def get_DRAM_access_poly(knl): def get_gmem_access_poly(knl): """Count the number of global memory accesses in a loopy kernel. + + get_gmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + """ from warnings import warn warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " @@ -1162,18 +1185,18 @@ def get_gmem_access_poly(knl): def get_mem_access_map(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be + :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :parameter numpy_types: A :class:`boolean` specifying whether the types + :parameter numpy_types: A :class:`bool` specifying whether the types in the returned mapping should be numpy types - instead of :class:'loopy.LoopyType`. + instead of :class:`loopy.LoopyType`. - :return: A mapping of **{** :class:`loopy.MemAccess` **:** + :return: A mapping of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. - - The :class:`loopy.MemAccess` specifies the type of memory - access. + - The :class:`MemAccess` specifies the characteristics of the + memory access. 
- The :class:`islpy.PwQPolynomial` holds the number of memory accesses with the characteristics specified in the key (in terms @@ -1184,32 +1207,32 @@ def get_mem_access_map(knl, numpy_types=True): # (first create loopy kernel and specify array data types) params = {'n': 512, 'm': 256, 'l': 128} - mem_access_map = get_mem_access_map(knl) - - f32_stride1_g_loads_a = mem_access_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='load', - variable='a') - ].eval_with_dict(params) - f32_stride1_g_stores_a = mem_access_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='store', - variable='a') - ].eval_with_dict(params) - f32_stride1_l_loads_x = mem_access_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='load', - variable='x') - ].eval_with_dict(params) - f32_stride1_l_stores_x = mem_access_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='store', - variable='x') - ].eval_with_dict(params) + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='load', + variable='a') + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess(mtype='global', + dtype=np.float32, + stride=1, + direction='store', + variable='a') + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='load', + variable='x') + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess(mtype='local', + dtype=np.float32, + stride=1, + direction='store', + variable='x') + ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1299,6 +1322,9 @@ def get_mem_access_map(knl, numpy_types=True): def get_synchronization_poly(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. + + get_synchronization_poly is deprecated. Use get_synchronization_map instead. 
+ """ from warnings import warn warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.", @@ -1316,8 +1342,8 @@ def get_synchronization_map(knl): :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of such events - per thread. + :class:`islpy.PwQPolynomial` holding the number of events per + thread. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1328,7 +1354,7 @@ def get_synchronization_map(knl): sync_map = get_synchronization_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - barrier_count = sync_map['barrier_local'].eval_with_dict(params) + barrier_ct = sync_map['barrier_local'].eval_with_dict(params) # (now use this count to predict performance) @@ -1380,7 +1406,7 @@ def get_synchronization_map(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - #return result.dict #TODO is this okay? + #return result.dict #TODO is this change okay? return result @@ -1392,7 +1418,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): of each the array *var_name* are read/written (where *direction* is either ``read`` or ``write``. - :arg ignore_uncountable: If *True*, an error will be raised for + :arg ignore_uncountable: If *False*, an error will be raised for accesses on which the footprint cannot be determined (e.g. 
data-dependent or nonlinear indices) """ -- GitLab From e137bf70bef6344b22d8040c434a03a7c35e441a Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 2 Nov 2016 15:50:50 -0500 Subject: [PATCH 31/55] added __str__ functions to Op and MemAccess --- loopy/statistics.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index ee0d867e6..ec10722e1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -377,6 +377,9 @@ class Op: name = 'None' return hash(str(dtype)+name) + def __str__(self): + return "Op("+str(self.dtype)+", "+self.name+")" + class MemAccess: """A memory access. @@ -460,6 +463,10 @@ class MemAccess: variable = 'None' return hash(mtype+str(dtype)+str(stride)+direction+variable) + def __str__(self): + return "MemAccess("+self.mtype+", "+str(self.dtype)+", "+ \ + str(self.stride)+", "+self.direction+", "+self.variable+")" + # {{{ ExpressionOpCounter @@ -1089,7 +1096,8 @@ def get_op_map(knl, numpy_types=True): in the returned mapping should be numpy types instead of :class:`loopy.LoopyType`. - :return: A mapping of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** + :class:`islpy.PwQPolynomial` **}**. - The :class:`Op` specifies the characteristics of the arithmetic operation. @@ -1192,7 +1200,7 @@ def get_mem_access_map(knl, numpy_types=True): in the returned mapping should be numpy types instead of :class:`loopy.LoopyType`. - :return: A mapping of **{** :class:`MemAccess` **:** + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
- The :class:`MemAccess` specifies the characteristics of the -- GitLab From 3124e4b3994c7aed6ac77b5e34fa9dd683fde981 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 2 Nov 2016 20:23:33 -0500 Subject: [PATCH 32/55] updated tutorial so that doctests past, still need to update with recently added ToCountMap member functions --- doc/tutorial.rst | 302 +++++++++++++++++++++----------------------- loopy/statistics.py | 12 +- 2 files changed, 151 insertions(+), 163 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 87daa9fc4..c633e55de 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -176,7 +176,7 @@ by passing :attr:`loopy.Options.write_cl`. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -250,7 +250,7 @@ call :func:`loopy.generate_code`: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -365,7 +365,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ 
((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -414,7 +414,7 @@ Now the intended code is generated and our test passes. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -557,12 +557,14 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict a, int const n) + { for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; - ... + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; + } Notice how loopy has automatically generated guard conditionals to make sure the bounds on the old iname are obeyed. @@ -701,8 +703,9 @@ Let's try this out on our vector fill kernel by creating workgroups of size >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - ... 
- __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[128 * gid(0) + lid(0)] = 0.0f; @@ -1182,7 +1185,7 @@ When we ask to see the code, the issue becomes apparent: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out) { float a_fetch[16]; @@ -1207,26 +1210,30 @@ Obtaining Performance Statistics .. {{{ -Operations, array access, and barriers can all be counted, which may facilitate -performance prediction and optimization of a :mod:`loopy` kernel. +Arithmetic operations, array accesses, and synchronization operations can all +be counted, which may facilitate performance prediction and optimization of a +:mod:`loopy` kernel. .. note:: The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples - above, you may need to reset these before continuing: + above, you may want to reset these before continuing. We will temporarily + suppress warnings to keep the output clean: .. doctest:: - >>> from warnings import resetwarnings + >>> from warnings import resetwarnings, filterwarnings >>> resetwarnings() + >>> filterwarnings('ignore', category=Warning) Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_poly` provides information on the number and type of operations -being performed in a kernel. 
To demonstrate this, we'll create an example kernel -that performs several operations on arrays containing different types of data: +:func:`loopy.get_op_map` provides information on the number and type of +arithmetic operations being performed in a kernel. To demonstrate this, we'll +create an example kernel that performs several operations on arrays containing +different types of data: .. doctest:: @@ -1244,37 +1251,36 @@ information provided. Now we will count the operations: .. doctest:: - >>> from loopy.statistics import get_op_poly - >>> op_map = get_op_poly(knl) + >>> op_map = lp.get_op_map(knl) -:func:`loopy.get_op_poly` returns a mapping of **{(** :class:`numpy.dtype` **,** -:class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**. The -:class:`islpy.PwQPolynomial` holds the number of operations for the type specified -in the key (in terms of the :class:`loopy.LoopKernel` *inames*). We'll print this -map now: +:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. The +:class:`islpy.PwQPolynomial` holds the number of operations for the kind of +operation specified in the key (in terms of the :class:`loopy.LoopKernel` +*inames*). We'll print this map now: .. 
doctest:: >>> print(lp.stringify_stats_mapping(op_map)) - (dtype('float32'), 'add') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'div') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'mul') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'mul') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - (dtype('int32'), 'add') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } -We can evaluate these polynomials using :func:`islpy.eval_with_dict`: +One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(param_dict) - >>> f32div = op_map[(np.dtype(np.float32), 'div')].eval_with_dict(param_dict) - >>> f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(param_dict) - >>> f64add = op_map[(np.dtype(np.float64), 'add')].eval_with_dict(param_dict) - >>> f64mul = op_map[(np.dtype(np.float64), 'mul')].eval_with_dict(param_dict) - >>> i32add = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1284,174 +1290,156 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: 65536 65536 -Counting array accesses -~~~~~~~~~~~~~~~~~~~~~~~ +Counting memory accesses +~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_gmem_access_poly` provides information on the number and type of -array loads and stores being performed in a kernel. To demonstrate this, we'll -continue using the kernel from the previous example: +:func:`loopy.get_mem_access_map` provides information on the number and +characteristics of memory accesses performed in a kernel. To demonstrate this, +we'll continue using the kernel from the previous example: .. 
doctest:: - >>> from loopy.statistics import get_gmem_access_poly - >>> load_store_map = get_gmem_access_poly(knl) - >>> print(lp.stringify_stats_mapping(load_store_map)) - (dtype('float32'), 'uniform', 'load') : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float32'), 'uniform', 'store') : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'load') : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } - (dtype('float64'), 'uniform', 'store') : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + >>> mem_map = lp.get_mem_access_map(knl) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } -:func:`loopy.get_gmem_access_poly` returns a mapping of **{(** -:class:`numpy.dtype` **,** :class:`string` **,** :class:`string` **)** -**:** :class:`islpy.PwQPolynomial` **}**. +:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. +:class:`loopy.MemAccess` attributes include: -- The :class:`numpy.dtype` specifies the type of the data being accessed. +- mtype: A :class:`str` that specifies the memory type accessed as **global** + or **local** -- The first string in the map key specifies the DRAM access type as *consecutive*, - *nonconsecutive*, or *uniform*. 
*Consecutive* memory accesses occur when - consecutive threads access consecutive array elements in memory, *nonconsecutive* - accesses occur when consecutive threads access nonconsecutive array elements in - memory, and *uniform* accesses occur when consecutive threads access the *same* - element in memory. +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type accessed. -- The second string in the map key specifies the DRAM access type as a *load*, or a - *store*. +- stride: An :class:`int` that specifies stride of the memory access. A stride + of 0 indicates a uniform access (i.e. all threads access the same item). -- The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses with the - characteristics specified in the key (in terms of the :class:`loopy.LoopKernel` - *inames*). +- direction: A :class:`str` that specifies the direction of memory access as + **load** or **store**. + +- variable: A :class:`str` that specifies the variable name of the data + accessed. We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "uniform", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "uniform", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "uniform", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "uniform", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... 
(f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -Since we have not tagged any of the inames or parallelized the kernel across threads -(which would have produced iname tags), :func:`loopy.get_gmem_access_poly` considers -the array accesses *uniform*. Now we'll parallelize the kernel and count the array -accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated -this time, so we'll print the mapping manually to make it more legible: +Since we have not tagged any of the inames or parallelized the kernel across +threads (which would have produced iname tags), :func:`loopy.get_mem_access_map` +considers the memory accesses *uniform*, so the *stride* of each access is 0. +Now we'll parallelize the kernel and count the array accesses again. The +resulting :class:`islpy.PwQPolynomial` will be more complicated this time. .. doctest:: >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") - >>> load_store_map = get_gmem_access_poly(knl_consec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'consecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float32'), 'consecutive', 'store') : - [n, m, l] -> { ... } - - (dtype('float64'), 'consecutive', 'load') : - [n, m, l] -> { ... 
} + >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - (dtype('float64'), 'consecutive', 'store') : - [n, m, l] -> { ... } - - With this parallelization, consecutive threads will access consecutive array elements in memory. The polynomials are a bit more complicated now due to the -parallelization, but when we evaluate them, we see that the total number of array -accesses has not changed: +parallelization, but when we evaluate them, we see that the total number of +array accesses has not changed: .. doctest:: - >>> f64ld = load_store_map[(np.dtype(np.float64), "consecutive", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') ... 
].eval_with_dict(param_dict) - >>> f64st = load_store_map[(np.dtype(np.float64), "consecutive", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[(np.dtype(np.float32), "consecutive", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[(np.dtype(np.float32), "consecutive", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 ~~~~~~~~~~~ -To produce *nonconsecutive* array accesses, we'll switch the inner and outer tags in -our parallelization of the kernel: +To produce *nonconsecutive* array accesses with stride greater than 1, we'll +switch the inner and outer tags in our parallelization of the kernel: .. doctest:: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") - >>> load_store_map = get_gmem_access_poly(knl_nonconsec) - >>> for key in sorted(load_store_map.keys(), key=lambda k: str(k)): - ... print("%s :\n%s\n" % (key, load_store_map[key])) - (dtype('float32'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float32'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... 
} + >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - (dtype('float64'), 'nonconsecutive', 'load') : - [n, m, l] -> { ... } - - (dtype('float64'), 'nonconsecutive', 'store') : - [n, m, l] -> { ... } - - -With this parallelization, consecutive threads will access *nonconsecutive* array -elements in memory. The total number of array accesses has not changed: +With this parallelization, consecutive threads will access *nonconsecutive* +array elements in memory. The total number of array accesses still has not +changed: .. doctest:: - >>> f64ld = load_store_map[ - ... 
(np.dtype(np.float64), "nonconsecutive", "load") + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') ... ].eval_with_dict(param_dict) - >>> f64st = load_store_map[ - ... (np.dtype(np.float64), "nonconsecutive", "store") + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') ... ].eval_with_dict(param_dict) - >>> f32ld = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "load") + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') ... ].eval_with_dict(param_dict) - >>> f32st = load_store_map[ - ... (np.dtype(np.float32), "nonconsecutive", "store") + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') ... ].eval_with_dict(param_dict) - >>> print("f32 load: %i\nf32 store: %i\nf64 load: %i\nf64 store: %i" % - ... (f32ld, f32st, f64ld, f64st)) - f32 load: 1572864 - f32 store: 524288 - f64 load: 131072 - f64 store: 65536 + >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_synchronization_poly` counts the number of synchronization +:func:`loopy.get_synchronization_map` counts the number of synchronization events per **thread** in a kernel. First, we'll call this function on the kernel from the previous example: .. doctest:: - >>> from loopy.statistics import get_synchronization_poly - >>> barrier_poly = get_synchronization_poly(knl) - >>> print(lp.stringify_stats_mapping(barrier_poly)) + >>> sync_map = lp.get_synchronization_map(knl) + >>> print(lp.stringify_stats_mapping(sync_map)) kernel_launch : { 1 } @@ -1459,7 +1447,7 @@ We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. 
doctest:: - >>> launch_count = barrier_poly["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1485,7 +1473,7 @@ Now to make things more interesting, we'll create a kernel with barriers: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e) + __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e) { __local int c[50 * 10 * 99]; @@ -1499,24 +1487,24 @@ Now to make things more interesting, we'll create a kernel with barriers: } } - -In this kernel, when a thread performs the second instruction it uses data produced -by *different* threads during the first instruction. Because of this, barriers are -required for correct execution, so loopy inserts them. Now we'll count the barriers -using :func:`loopy.get_barrier_poly`: +In this kernel, when a thread performs the second instruction it uses data +produced by *different* threads during the first instruction. Because of this, +barriers are required for correct execution, so loopy inserts them. Now we'll +count the barriers using :func:`loopy.get_synchronization_map`: .. doctest:: - >>> sync_map = lp.get_synchronization_poly(knl) + >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) barrier_local : { 1000 } kernel_launch : { 1 } -Based on the kernel code printed above, we would expect each thread to encounter -50x10x2 barriers, which matches the result from :func:`loopy.get_barrier_poly`. In -this case, the number of barriers does not depend on any inames, so we can pass an -empty dictionary to :func:`islpy.eval_with_dict`. 
+Based on the kernel code printed above, we would expect each thread to +encounter 50x10x2 barriers, which matches the result from +:func:`loopy.get_synchronization_map`. In this case, the number of barriers +does not depend on any inames, so we can pass an empty dictionary to +:func:`islpy.eval_with_dict`. .. }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index ec10722e1..468a274d7 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1043,22 +1043,22 @@ def count(kernel, set): if not (is_subset and is_superset): if is_subset: - from loopy.diagnostic import warn - warn(kernel, "count_overestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_overestimate", "Barvinok wrappers are not installed. " "Counting routines have overestimated the " "number of integer points in your loop " "domain.") elif is_superset: - from loopy.diagnostic import warn - warn(kernel, "count_underestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_underestimate", "Barvinok wrappers are not installed. " "Counting routines have underestimated the " "number of integer points in your loop " "domain.") else: - from loopy.diagnostic import warn - warn(kernel, "count_misestimate", + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "count_misestimate", "Barvinok wrappers are not installed. 
" "Counting routines have misestimated the " "number of integer points in your loop " -- GitLab From 3d202b489764545795e60dade112140f579947c5 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 00:20:48 -0500 Subject: [PATCH 33/55] fixed ToCountMap.__str__ to handle None values --- loopy/statistics.py | 53 +++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 468a274d7..4d3f8831a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -369,16 +369,18 @@ class Op: self.name == other.name)) def __hash__(self): - dtype = self.dtype - name = self.name - if dtype is None: - dtype = 'None' - if name is None: - name = 'None' - return hash(str(dtype)+name) + return hash(str(self)) def __str__(self): - return "Op("+str(self.dtype)+", "+self.name+")" + if self.dtype is None: + dtype = 'None' + else: + dtype = str(self.dtype) + if self.name is None: + name = 'None' + else: + name = self.name + return "Op("+dtype+", "+name+")" class MemAccess: @@ -446,26 +448,31 @@ class MemAccess: self.variable == other.variable)) def __hash__(self): - mtype = self.mtype - dtype = self.dtype - stride = self.stride - direction = self.direction - variable = self.variable - if mtype is None: + return hash(str(self)) + + def __str__(self): + if self.mtype is None: mtype = 'None' - if dtype is None: + else: + mtype = self.mtype + if self.dtype is None: dtype = 'None' - if stride is None: + else: + dtype = str(self.dtype) + if self.stride is None: stride = 'None' - if direction is None: + else: + stride = str(self.stride) + if self.direction is None: direction = 'None' - if variable is None: + else: + direction = self.direction + if self.variable is None: variable = 'None' - return hash(mtype+str(dtype)+str(stride)+direction+variable) - - def __str__(self): - return "MemAccess("+self.mtype+", "+str(self.dtype)+", "+ \ - str(self.stride)+", "+self.direction+", 
"+self.variable+")" + else: + variable = self.variable + return "MemAccess("+mtype+", "+dtype+", "+stride+", "+direction+", " \ + +variable+")" # {{{ ExpressionOpCounter -- GitLab From ef374b52ca2a1f9a8025992e3040362992c48cc6 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 00:21:52 -0500 Subject: [PATCH 34/55] added info and examples about the new filter, group, to_bytes, and summation functions for ToCountMap --- doc/tutorial.rst | 133 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 103 insertions(+), 30 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 03cceb522..eb80fa448 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1231,10 +1231,10 @@ be counted, which may facilitate performance prediction and optimization of a Counting operations ~~~~~~~~~~~~~~~~~~~ -:func:`loopy.get_op_map` provides information on the number and type of -arithmetic operations being performed in a kernel. To demonstrate this, we'll -create an example kernel that performs several operations on arrays containing -different types of data: +:func:`loopy.get_op_map` provides information on the characteristics and +quantity of arithmetic operations being performed in a kernel. To demonstrate +this, we'll create an example kernel that performs several operations on arrays +containing different types of data: .. doctest:: @@ -1253,15 +1253,6 @@ information provided. Now we will count the operations: .. doctest:: >>> op_map = lp.get_op_map(knl) - -:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** -:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. The -:class:`islpy.PwQPolynomial` holds the number of operations for the kind of -operation specified in the key(in terms of the :class:`loopy.LoopKernel` -*inames*). We'll print this map now: - -.. 
doctest:: - >>> print(lp.stringify_stats_mapping(op_map)) Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } @@ -1271,6 +1262,20 @@ operation specified in the key(in terms of the :class:`loopy.LoopKernel` Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } +:func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** +:class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A +:class:`loopy.ToCountMap` holds a dictionary mapping any type of key to an +arithmetic type. In this case, the :class:`islpy.PwQPolynomial` holds the +number of operations matching the characteristics of the :class:`loopy.Op` +specified in the key (in terms of the :class:`loopy.LoopKernel` +*inames*). :class:`loopy.Op` attributes include: + +- dtype: A :class:`loopy.LoopyType` or :class:`numpy.dtype` that specifies the + data type operated on. + +- name: A :class:`str` that specifies the kind of arithmetic operation as + *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. doctest:: @@ -1291,6 +1296,39 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: 65536 65536 +:class:`loopy.ToCountMap` provides member functions that facilitate filtering, +grouping, and evaluating subsets of the counts. Suppose we want to know the +total number of 32-bit operations of any kind. We can easily count these +using functions :func:`loopy.ToCountMap.filter_by` and +:func:`loopy.ToCountMap.eval_and_sum`: + +.. 
doctest:: + + >>> filtered_op_map = op_map.filter_by(dtype=[np.float32]) + >>> f32op_count = filtered_op_map.eval_and_sum(param_dict) + >>> print(f32op_count) + 1572864 + +We could accomplish the same goal using :func:`loopy.ToCountMap.group_by`, +which produces a :class:`loopy.ToCountMap` that contains the same counts grouped +together into keys containing only the specified fields: + +.. doctest:: + + >>> op_map_dtype = op_map.group_by('dtype') + >>> print(lp.stringify_stats_mapping(op_map_dtype)) + Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + + >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) + ... ].eval_with_dict(param_dict) + >>> print(f32op_count) + 1572864 + +See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for +more information on these functions. + Counting memory accesses ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1334,20 +1372,53 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % - ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + ... 
(f32ld_a, f32st_c, f64ld_g, f64st_e)) f32 ld a: 1048576 f32 st c: 524288 f64 ld g: 65536 f64 st e: 65536 +:class:`loopy.ToCountMap` also makes it easy to determine the total amount +of data moved in bytes. Suppose we want to know the total amount of global +memory data loaded and stored. We can produce a map with just this information +using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: + +.. doctest:: + + >>> bytes_map = mem_map.to_bytes() + >>> print(lp.stringify_stats_mapping(bytes_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + + >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] + ... ).group_by('direction') + >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) + MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 } + MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 } + + >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') + ... ].eval_with_dict(param_dict) + >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') + ... 
].eval_with_dict(param_dict) + >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) + bytes loaded: 7340032 + bytes stored: 2621440 + +One can see how these functions might be useful in computing, for example, +achieved memory bandwidth in byte/sec or performance in FLOP/sec. + ~~~~~~~~~~~ Since we have not tagged any of the inames or parallelized the kernel across @@ -1358,7 +1429,8 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. .. doctest:: - >>> knl_consec = lp.split_iname(knl, "k", 128, outer_tag="l.1", inner_tag="l.0") + >>> knl_consec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } @@ -1377,15 +1449,15 @@ array accesses has not changed: .. doctest:: >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % - ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) f32 ld a: 1048576 f32 st c: 524288 f64 ld g: 65536 @@ -1398,7 +1470,8 @@ switch the inner and outer tags in our parallelization of the kernel: .. 
doctest:: - >>> knl_nonconsec = lp.split_iname(knl, "k", 128, outer_tag="l.0", inner_tag="l.1") + >>> knl_nonconsec = lp.split_iname(knl, "k", 128, + ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } @@ -1416,15 +1489,15 @@ changed: .. doctest:: >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') - ... ].eval_with_dict(param_dict) + ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % - ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) + ... 
(f32ld_a, f32st_c, f64ld_g, f64st_e)) f32 ld a: 1048576 f32 st c: 524288 f64 ld g: 65536 -- GitLab From fc54488bc742c3f5407d214d4436a705b4339978 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 12:27:51 -0500 Subject: [PATCH 35/55] fixed a broken test, renamed some tests --- test/test_numa_diff.py | 2 +- test/test_statistics.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index e4f303f78..33ac31f8d 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -228,7 +228,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): print(lp.stringify_stats_mapping(op_poly)) print("MEM") - gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv)) + gmem_poly = lp.get_mem_access_poly(hsv).to_bytes() print(lp.stringify_stats_mapping(gmem_poly)) hsv = lp.set_options(hsv, cl_build_options=[ diff --git a/test/test_statistics.py b/test/test_statistics.py index f768ef9bd..cdeef2b0e 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -214,7 +214,7 @@ def test_op_counter_triangular_domain(): assert flops == 78 -def test_gmem_access_counter_basic(): +def test_mem_access_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i Date: Thu, 3 Nov 2016 12:47:43 -0500 Subject: [PATCH 36/55] actually fixing tests this time --- test/test_numa_diff.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 33ac31f8d..dfdd7f63e 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -224,12 +224,12 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): if 1: print("OPS") - op_poly = lp.get_op_poly(hsv) - print(lp.stringify_stats_mapping(op_poly)) + op_map = lp.get_op_map(hsv) + print(lp.stringify_stats_mapping(op_map)) print("MEM") - gmem_poly = 
lp.get_mem_access_poly(hsv).to_bytes() - print(lp.stringify_stats_mapping(gmem_poly)) + gmem_map = lp.get_mem_access_map(hsv).to_bytes() + print(lp.stringify_stats_mapping(gmem_map)) hsv = lp.set_options(hsv, cl_build_options=[ "-cl-denorms-are-zero", -- GitLab From 012954b0929d22823a397805afaaf075fd64bf62 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 13:20:24 -0500 Subject: [PATCH 37/55] making python 2 compatible --- loopy/statistics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 4d3f8831a..af9fffc51 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -228,7 +228,8 @@ class ToCountMap: # make sure all item keys have same type if self.dict: - key_type = type(list(self.keys())[0]) + first_key = list(self.keys())[0] + key_type = type(first_key) if not all(isinstance(x, key_type) for x in self.keys()): raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") @@ -237,7 +238,7 @@ class ToCountMap: # for each item in self.dict for self_key, self_val in self.items(): - new_key = key_type() + new_key = first_key.__class__() # set all specified fields for field in args: -- GitLab From 4e5be04bfc152fd07a63c68578d3fe341d504aca Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 14:28:12 -0500 Subject: [PATCH 38/55] restrict->__restrict__ --- doc/tutorial.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index eb80fa448..ee737ea83 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -176,7 +176,7 @@ by passing :attr:`loopy.Options.write_cl`. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -250,7 +250,7 @@ call :func:`loopy.generate_code`: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -365,7 +365,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -414,7 +414,7 @@ Now the intended code is generated and our test passes. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -559,7 +559,7 @@ relation to loop nesting. For example, it's perfectly possible to request #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict a, int const n) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) { for (int i_inner = 0; i_inner <= 15; ++i_inner) for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) @@ -705,7 +705,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) + __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[128 * gid(0) + lid(0)] = 0.0f; @@ -1186,7 +1186,7 @@ When we ask to see the code, the issue becomes apparent: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out) + __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { 
float a_fetch[16]; @@ -1547,7 +1547,7 @@ Now to make things more interesting, we'll create a kernel with barriers: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *restrict a, __global int *restrict e) + __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e) { __local int c[50 * 10 * 99]; -- GitLab From 652688b26637240a2cce0fc8d08539137e85673c Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 22:38:42 -0500 Subject: [PATCH 39/55] make classes inherit from object --- loopy/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index af9fffc51..699a86044 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -64,7 +64,7 @@ __doc__ = """ # {{{ ToCountMap -class ToCountMap: +class ToCountMap(object): """Maps any type of key to an arithmetic type. .. automethod:: filter_by @@ -339,7 +339,7 @@ def stringify_stats_mapping(m): return result -class Op: +class Op(object): """An arithmetic operation. .. attribute:: dtype @@ -384,7 +384,7 @@ class Op: return "Op("+dtype+", "+name+")" -class MemAccess: +class MemAccess(object): """A memory access. .. 
attribute:: mtype -- GitLab From ad255d0862a7ab67e2d74e95c2bae56cd747e682 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 22:41:15 -0500 Subject: [PATCH 40/55] changed back to constructor call of unknown type --- loopy/statistics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 699a86044..fbca99779 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -228,8 +228,7 @@ class ToCountMap(object): # make sure all item keys have same type if self.dict: - first_key = list(self.keys())[0] - key_type = type(first_key) + key_type = type(list(self.keys())[0]) if not all(isinstance(x, key_type) for x in self.keys()): raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") @@ -238,7 +237,7 @@ class ToCountMap(object): # for each item in self.dict for self_key, self_val in self.items(): - new_key = first_key.__class__() + new_key = key_type() # set all specified fields for field in args: -- GitLab From a17533d073cca343daab893c2dccb5b6c4f3a92d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 22:54:44 -0500 Subject: [PATCH 41/55] renamed ToCountMap.dict to ToCountMap.count_map --- loopy/statistics.py | 87 +++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index fbca99779..9736fa0d2 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -78,12 +78,12 @@ class ToCountMap(object): def __init__(self, init_dict=None): if init_dict is None: init_dict = {} - self.dict = init_dict + self.count_map = init_dict def __add__(self, other): - result = self.dict.copy() - for k, v in six.iteritems(other.dict): - result[k] = self.dict.get(k, 0) + v + result = self.count_map.copy() + for k, v in six.iteritems(other.count_map): + result[k] = self.count_map.get(k, 0) + v return ToCountMap(result) def __radd__(self, other): @@ 
-97,7 +97,7 @@ class ToCountMap(object): def __mul__(self, other): if isinstance(other, isl.PwQPolynomial): return ToCountMap(dict( - (index, self.dict[index]*other) + (index, self.count_map[index]*other) for index in self.keys())) else: raise ValueError("ToCountMap: Attempted to multiply " @@ -108,27 +108,30 @@ class ToCountMap(object): def __getitem__(self, index): try: - return self.dict[index] + return self.count_map[index] except KeyError: return isl.PwQPolynomial('{ 0 }') def __setitem__(self, index, value): - self.dict[index] = value + self.count_map[index] = value def __repr__(self): - return repr(self.dict) + return repr(self.count_map) def __len__(self): - return len(self.dict) + return len(self.count_map) def items(self): - return self.dict.items() + return self.count_map.items() def keys(self): - return self.dict.keys() + return self.count_map.keys() + + def pop(self, item): + return self.count_map.pop(item) def copy(self): - return ToCountMap(dict(self.dict)) + return ToCountMap(dict(self.count_map)) def filter_by(self, **kwargs): """Remove items without specified key fields. 
@@ -161,7 +164,7 @@ class ToCountMap(object): if 'dtype' in kwargs.keys(): kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] - # for each item in self.dict + # for each item in self.count_map for self_key, self_val in self.items(): try: # check to see if key attribute values match all filters @@ -171,7 +174,7 @@ class ToCountMap(object): if attr_val not in allowable_vals: break else: # loop terminated without break or error - result_map.dict[self_key] = self_val + result_map[self_key] = self_val except(AttributeError): # the field passed is not a field of this key continue @@ -227,7 +230,7 @@ class ToCountMap(object): result_map = ToCountMap() # make sure all item keys have same type - if self.dict: + if self.count_map: key_type = type(list(self.keys())[0]) if not all(isinstance(x, key_type) for x in self.keys()): raise ValueError("ToCountMap: group_by() function may only " @@ -235,7 +238,7 @@ class ToCountMap(object): else: return result_map - # for each item in self.dict + # for each item in self.count_map for self_key, self_val in self.items(): new_key = key_type() @@ -1143,9 +1146,9 @@ def get_op_map(knl, numpy_types=True): op_map = op_map + ops*count(knl, domain) if numpy_types: - op_map.dict = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), - count) - for op, count in six.iteritems(op_map.dict)) + op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + count) + for op, count in six.iteritems(op_map.count_map)) return op_map @@ -1284,26 +1287,26 @@ def get_mem_access_map(knl, numpy_types=True): + subs_counter_l(insn.expression) # distinguish loads and stores - for key in subs_expr.dict: - subs_expr.dict[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, direction='load', - variable=key.variable) - ] = subs_expr.dict.pop(key) + for key in subs_expr.count_map: + subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, direction='load', + variable=key.variable) + ] = subs_expr.pop(key) 
subs_assignee_g = subs_counter_g(insn.assignee) - for key in subs_assignee_g.dict: - subs_assignee_g.dict[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, - direction='store', - variable=key.variable) - ] = subs_assignee_g.dict.pop(key) + for key in subs_assignee_g.count_map: + subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype, + stride=key.stride, + direction='store', + variable=key.variable) + ] = subs_assignee_g.pop(key) # for now, don't count writes to local mem insn_inames = knl.insn_inames(insn) # use count excluding local index tags for uniform accesses - for key in subs_expr.dict: - map = ToCountMap({key: subs_expr.dict[key]}) + for key in subs_expr.count_map: + map = ToCountMap({key: subs_expr[key]}) if key.mtype == 'global' and isinstance(key.stride, int) and key.stride == 0: subs_map = subs_map \ + map*get_insn_count(knl, insn_inames, True) @@ -1311,8 +1314,8 @@ def get_mem_access_map(knl, numpy_types=True): subs_map = subs_map + map*get_insn_count(knl, insn_inames) #currently not counting stride of local mem access - for key in subs_assignee_g.dict: - map = ToCountMap({key: subs_assignee_g.dict[key]}) + for key in subs_assignee_g.count_map: + map = ToCountMap({key: subs_assignee_g[key]}) if isinstance(key.stride, int) and key.stride == 0: subs_map = subs_map \ + map*get_insn_count(knl, insn_inames, True) @@ -1321,13 +1324,13 @@ def get_mem_access_map(knl, numpy_types=True): # for now, don't count writes to local mem if numpy_types: - subs_map.dict = dict((MemAccess(mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable) - , count) - for mem_access, count in six.iteritems(subs_map.dict)) + subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable) + , count) + for mem_access, count in 
six.iteritems(subs_map.count_map)) return subs_map @@ -1421,7 +1424,7 @@ def get_synchronization_map(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - #return result.dict #TODO is this change okay? + #return result.count_map #TODO is this change okay? return result -- GitLab From 47f55a384754ba27dbbafb9d7dbadc353efc9b38 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 23:28:53 -0500 Subject: [PATCH 42/55] undoing improper changes to tutorial expected output --- doc/tutorial.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ee737ea83..71547e695 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -557,14 +557,12 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - #define gid(N) ((int) get_group_id(N)) - - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) - { - for (int i_inner = 0; i_inner <= 15; ++i_inner) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; - } + ... + for (int i_inner = 0; i_inner <= 15; ++i_inner) + if (-1 + -1 * i_inner + n >= 0) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; + ... Notice how loopy has automatically generated guard conditionals to make sure the bounds on the old iname are obeyed. @@ -703,8 +701,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) - #define gid(N) ((int) get_group_id(N)) - + ... 
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) -- GitLab From 88799f8d2aa4d4d3be75916ca53c05a8aba2ee05 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Thu, 3 Nov 2016 23:46:51 -0500 Subject: [PATCH 43/55] fixed missing spaces in tutorial --- doc/tutorial.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 71547e695..172b3a3bc 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -558,10 +558,10 @@ relation to loop nesting. For example, it's perfectly possible to request >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= 15; ++i_inner) + if (-1 + -1 * i_inner + n >= 0) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make -- GitLab From fe66563a2367066f6ea306b64951afc983e5104d Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 4 Nov 2016 00:13:00 -0500 Subject: [PATCH 44/55] added ToCountMap.filter_by_func --- loopy/statistics.py | 35 +++++++++++++++++++++++++++++++++++ test/test_statistics.py | 6 +++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 9736fa0d2..e7f6f7953 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -68,6 +68,7 @@ class ToCountMap(object): """Maps any type of key to an arithmetic type. .. automethod:: filter_by + .. automethod:: filter_by_func .. automethod:: group_by .. automethod:: to_bytes .. 
automethod:: sum @@ -181,6 +182,40 @@ class ToCountMap(object): return result_map + def filter_by_func(self, func): + """Keep items that pass a test. + + :parameter func: A function that takes a map key a parameter and + returns a :class:`bool`. + + :return: A :class:`ToCountMap` containing the subset of the items in + the original :class:`ToCountMap` for which func(key) is true. + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = lp.get_mem_access_map(knl) + def filter_func(key): + return key.stride > 1 and key.stride <= 4: + + filtered_map = mem_map.filter_by_func(filter_func) + tot = filtered_map.eval_and_sum(params) + + # (now use these counts to predict performance) + + """ + + result_map = ToCountMap() + + # for each item in self.count_map, call func on the key + for self_key, self_val in self.items(): + if func(self_key): + result_map[self_key] = self_val + + return result_map + def group_by(self, *args): """Group map items together, distinguishing by only the key fields passed in args. 
diff --git a/test/test_statistics.py b/test/test_statistics.py index cdeef2b0e..05d857667 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -793,7 +793,11 @@ def test_summations_and_filters(): assert mul_all == n*m*l + n*m assert f64ops_all == n*m - + def func_filter(key): + return (key.stride < 1) and (to_loopy_type(key.dtype) == to_loopy_type(np.float64)) and \ + (key.direction == 'load') + s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) + assert s1f64l == 2*n*m if __name__ == "__main__": if len(sys.argv) > 1: -- GitLab From 6dbbc1a002fb1ba8cd87249c06654d6fec330512 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 4 Nov 2016 00:40:42 -0500 Subject: [PATCH 45/55] removed unnecessary parens --- test/test_statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 05d857667..fb502045c 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -794,8 +794,8 @@ def test_summations_and_filters(): assert f64ops_all == n*m def func_filter(key): - return (key.stride < 1) and (to_loopy_type(key.dtype) == to_loopy_type(np.float64)) and \ - (key.direction == 'load') + return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ + key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) assert s1f64l == 2*n*m -- GitLab From 8b122d0aa11b201bbc5c474b69769fd3dc69caaa Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 4 Nov 2016 00:41:29 -0500 Subject: [PATCH 46/55] added filter_by_func example to tutorial --- doc/tutorial.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 172b3a3bc..e7a87505f 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1410,7 +1410,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') ... 
].eval_with_dict(param_dict) >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) - bytes loaded: 7340032 + bytes loaded: 7340032 bytes stored: 2621440 One can see how these functions might be useful in computing, for example, @@ -1500,6 +1500,20 @@ changed: f64 ld g: 65536 f64 st e: 65536 +We can also filter using an arbitrary test function using +:func:`loopy.ToCountMap.filter_by_func`. This is useful when the filter +criteria are more complicated than a simple list of allowable values: + +.. doctest:: + + >>> def f(key): + ... from loopy.types import to_loopy_type + ... return key.dtype == to_loopy_type(np.float32) and \ + ... key.stride > 1 + >>> count = mem_map.filter_by_func(f).eval_and_sum(param_dict) + >>> print(count) + 2097152 + Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- GitLab From 0a7bbfa5fb77fa2df2d8656da9fc2adbfb4b2f83 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Sat, 5 Nov 2016 18:38:52 -0500 Subject: [PATCH 47/55] no longer specifying exact PwQPolynomial in some doctests --- doc/tutorial.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 844387c1d..8ee322a23 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1435,12 +1435,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... 
outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... 
} With this parallelization, consecutive threads will access consecutive array @@ -1476,12 +1476,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { (2 * n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (256 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { (n * m * l * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * l * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } - MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { (n * m * floor((127 + m)/128)) : n > 0 and 0 < m <= 127 and l > 0; (128 * n * floor((127 + m)/128)) : n > 0 and m >= 128 and l > 0 } + MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... 
} + MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... } With this parallelization, consecutive threads will access *nonconsecutive* -- GitLab From 04cb86b45d3962f29f994b835eb911931b5cfdfc Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 9 Nov 2016 17:17:58 -0600 Subject: [PATCH 48/55] removed fixed TODO --- loopy/statistics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5bed272a1..eac4ceafb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -36,7 +36,6 @@ from loopy.kernel.data import MultiAssignmentBase from loopy.diagnostic import warn_with_kernel, LoopyError -#TODO does this work for class functions? __doc__ = """ .. currentmodule:: loopy -- GitLab From 86beb578cbb419466d4416a40f2f0251cbfd9b48 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 9 Nov 2016 18:29:43 -0600 Subject: [PATCH 49/55] removed unnecessary code from Mappers --- loopy/statistics.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index eac4ceafb..d28d2c14a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -530,25 +530,15 @@ class ExpressionOpCounter(CombineMapper): map_tagged_variable = map_constant map_variable = map_constant - #def map_wildcard(self, expr): - # return 0,0 - - #def map_function_symbol(self, expr): - # return 0,0 - def map_call(self, expr): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function)): 1} ) + self.rec(expr.parameters) - # def map_call_with_kwargs(self, expr): # implemented in CombineMapper - - def map_subscript(self, expr): # implemented in CombineMapper + def map_subscript(self, expr): return self.rec(expr.index) - # def map_lookup(self, expr): # implemented in CombineMapper - def map_sum(self, expr): assert expr.children return ToCountMap( @@ -675,8 +665,6 @@ class LocalSubscriptCounter(CombineMapper): if name in self.knl.temporary_variables: array 
= self.knl.temporary_variables[name] - #print("array: ", array) - #print("is local? ", array.is_local) if array.is_local: return ToCountMap( {MemAccess(mtype='local', @@ -736,7 +724,6 @@ class LocalSubscriptCounter(CombineMapper): + self.rec(expr.else_) map_min = map_bitwise_or - map_max = map_min def map_common_subexpression(self, expr): raise NotImplementedError("LocalSubscriptCounter encountered " @@ -760,8 +747,6 @@ class LocalSubscriptCounter(CombineMapper): # }}} - - # {{{ GlobalSubscriptCounter class GlobalSubscriptCounter(CombineMapper): @@ -923,7 +908,6 @@ class GlobalSubscriptCounter(CombineMapper): + self.rec(expr.else_) map_min = map_bitwise_or - map_max = map_min def map_common_subexpression(self, expr): raise NotImplementedError("GlobalSubscriptCounter encountered " -- GitLab From 59f1355e46dd548e8c3db20bbeec33e4a4d12600 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 9 Nov 2016 18:42:50 -0600 Subject: [PATCH 50/55] removing deprecated functions from reference --- loopy/statistics.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d28d2c14a..0bc91451e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -44,15 +44,8 @@ __doc__ = """ .. autoclass:: Op .. autoclass:: MemAccess -.. autofunction:: get_op_poly .. autofunction:: get_op_map - -.. autofunction:: get_lmem_access_poly -.. autofunction:: get_DRAM_access_poly -.. autofunction:: get_gmem_access_poly .. autofunction:: get_mem_access_map - -.. autofunction:: get_synchronization_poly .. autofunction:: get_synchronization_map ..
autofunction:: gather_access_footprints -- GitLab From 16d3301e68ce5264833b7162673e4710ee898cce Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 9 Nov 2016 21:08:12 -0600 Subject: [PATCH 51/55] refactored map_subscript in LocalSubscriptCounter to eliminate unnecessary recursive call --- loopy/statistics.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0bc91451e..d5e4c43c9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -654,17 +654,13 @@ class LocalSubscriptCounter(CombineMapper): return self.rec(expr.parameters) def map_subscript(self, expr): + sub_map = ToCountMap() name = expr.aggregate.name # name of array - if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if array.is_local: - return ToCountMap( - {MemAccess(mtype='local', - dtype=self.type_inf(expr)): 1} - ) + self.rec(expr.index) - - return self.rec(expr.index) + sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 + return sub_map + self.rec(expr.index) def map_sum(self, expr): if expr.children: -- GitLab From b64d8af8de9fe309b8fa347fc83e1d06245aab19 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 18 Nov 2016 12:50:27 -0600 Subject: [PATCH 52/55] better handling of case where min_tag_axis != 0 --- loopy/statistics.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d5e4c43c9..6c9742e52 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -586,10 +586,10 @@ class ExpressionOpCounter(CombineMapper): def map_logical_not(self, expr): return self.rec(expr.child) - def map_logical_or(self, expr): - return sum(self.rec(child) for child in expr.children) + #def map_logical_or(self, expr): + # return sum(self.rec(child) for child in expr.children) - map_logical_and = map_logical_or + #map_logical_and = map_logical_or def map_if(self, expr):
warnings.warn("ExpressionOpCounter counting ops as " @@ -796,6 +796,18 @@ class GlobalSubscriptCounter(CombineMapper): variable=name): 1} ) + self.rec(expr.index) + if min_tag_axis != 0: + warn_with_kernel(knl, "unknown_gmem_stride", + "GlobalSubscriptCounter: " + "Memory access minimum tag axis %d != 0, " + "stride unknown, using sys.maxsize." + % (min_tag_axis)) + #TODO switch all warnings to loopy warnings warn_with_kernel + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name): 1} + ) + self.rec(expr.index) + # get local_id associated with minimum tag axis min_lid = None for iname in my_inames: @@ -807,8 +819,7 @@ class GlobalSubscriptCounter(CombineMapper): # found local_id associated with minimum tag axis - total_stride = None - extra_stride = 1 + total_stride = 0 # check coefficient of min_lid for each axis from loopy.symbolic import CoefficientCollector from loopy.kernel.array import FixedStrideArrayDimTag @@ -830,17 +841,7 @@ class GlobalSubscriptCounter(CombineMapper): else: continue - total_stride = stride*coeff_min_lid*extra_stride - #TODO is there a case where this^ does not execute, - # or executes more than once for two different axes? - - #TODO temporary fix that needs changing: - if min_tag_axis != 0: - print("... min tag axis (%d) is not zero! ..." 
% (min_tag_axis)) - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name): 1} - ) + self.rec(expr.index) + total_stride += stride*coeff_min_lid return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=total_stride, variable=name): 1} -- GitLab From f5415bc0147ac05754d493762531fe0fb7a57fc5 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 18 Nov 2016 13:16:14 -0600 Subject: [PATCH 53/55] switching warnings to warn_with_kernel --- loopy/statistics.py | 75 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6c9742e52..07916022e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -592,14 +592,16 @@ class ExpressionOpCounter(CombineMapper): #map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if-statement branches.") + warn_with_kernel(self.knl, "summing_if_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("ExpressionOpCounter counting ops as " - "sum of if_pos-statement branches.") + warn_with_kernel(self.knl, "summing_ifpos_branches_ops", + "ExpressionOpCounter counting ops as sum of " + "if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) \ + self.rec(expr.else_) @@ -701,14 +703,16 @@ class LocalSubscriptCounter(CombineMapper): map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("LocalSubscriptCounter counting LMEM accesses as " - "sum of if-statement branches.") + warn_with_kernel(self.knl, "summing_if_branches_lsubs", + "LocalSubscriptCounter counting LMEM accesses as sum " + "of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ + self.rec(expr.else_) 
def map_if_positive(self, expr): - warnings.warn("LocalSubscriptCounter counting LMEM accesses as " - "sum of if_pos-statement branches.") + warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", + "LocalSubscriptCounter counting LMEM accesses as sum " + "of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) \ + self.rec(expr.else_) @@ -797,12 +801,10 @@ class GlobalSubscriptCounter(CombineMapper): ) + self.rec(expr.index) if min_tag_axis != 0: - warn_with_kernel(knl, "unknown_gmem_stride", - "GlobalSubscriptCounter: " - "Memory access minimum tag axis %d != 0, " - "stride unknown, using sys.maxsize." - % (min_tag_axis)) - #TODO switch all warnings to loopy warnings warn_with_kernel + warn_with_kernel(self.knl, "unknown_gmem_stride", + "GlobalSubscriptCounter: Memory access minimum " + "tag axis %d != 0, stride unknown, using " + "sys.maxsize." % (min_tag_axis)) return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=sys.maxsize, variable=name): 1} @@ -886,14 +888,16 @@ class GlobalSubscriptCounter(CombineMapper): map_logical_and = map_logical_or def map_if(self, expr): - warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " - "sum of if-statement branches.") + warn_with_kernel(self.knl, "summing_if_branches_gsubs", + "GlobalSubscriptCounter counting GMEM accesses as " + "sum of if-statement branches.") return self.rec(expr.condition) + self.rec(expr.then) \ + self.rec(expr.else_) def map_if_positive(self, expr): - warnings.warn("GlobalSubscriptCounter counting GMEM accesses as " - "sum of if_pos-statement branches.") + warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", + "GlobalSubscriptCounter counting GMEM accesses as " + "sum of if_pos-statement branches.") return self.rec(expr.criterion) + self.rec(expr.then) \ + self.rec(expr.else_) @@ -1093,9 +1097,8 @@ def get_op_poly(knl, numpy_types=True): get_op_poly is deprecated. Use get_op_map instead. 
""" - from warnings import warn - warn("get_op_poly is deprecated. Use get_op_map instead.", - DeprecationWarning, stacklevel=2) + warn_with_kernel(knl, "depricated_get_op_poly", + "get_op_poly is deprecated. Use get_op_map instead.") return get_op_map(knl, numpy_types) # }}} @@ -1166,10 +1169,10 @@ def get_lmem_access_poly(knl): result with the mtype=['local'] option. """ - from warnings import warn - warn("get_lmem_access_poly is deprecated. Use get_mem_access_map and " - "filter the result with the mtype=['local'] option.", - DeprecationWarning, stacklevel=2) + warn_with_kernel(knl, "depricated_get_lmem_access_poly", + "get_lmem_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['local'] option.") return get_mem_access_map(knl).filter_by(mtype=['local']) @@ -1180,10 +1183,10 @@ def get_DRAM_access_poly(knl): result with the mtype=['global'] option. """ - from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " - "filter the result with the mtype=['global'] option.", - DeprecationWarning, stacklevel=2) + warn_with_kernel(knl, "depricated_get_DRAM_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") return get_mem_access_map(knl).filter_by(mtype=['global']) @@ -1196,10 +1199,10 @@ def get_gmem_access_poly(knl): result with the mtype=['global'] option. """ - from warnings import warn - warn("get_DRAM_access_poly is deprecated. Use get_mem_access_map and " - "filter the result with the mtype=['global'] option.", - DeprecationWarning, stacklevel=2) + warn_with_kernel(knl, "depricated_get_gmem_access_poly", + "get_DRAM_access_poly is deprecated. 
Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") return get_mem_access_map(knl).filter_by(mtype=['global']) # }}} @@ -1349,9 +1352,9 @@ def get_synchronization_poly(knl): get_synchronization_poly is deprecated. Use get_synchronization_map instead. """ - from warnings import warn - warn("get_synchronization_poly is deprecated. Use get_synchronization_map instead.", - DeprecationWarning, stacklevel=2) + warn_with_kernel(knl, "depricated_get_synchronization_poly", + "get_synchronization_poly is deprecated. Use " + "get_synchronization_map instead.") return get_synchronization_map(knl) # }}} -- GitLab From 064a318d894cc2a26e4d66492eb7ca898cf3aa8a Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 18 Nov 2016 13:39:55 -0600 Subject: [PATCH 54/55] removed map functions that are implemented in parent mapper --- loopy/statistics.py | 72 --------------------------------------------- 1 file changed, 72 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 07916022e..dae6b5bfb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -580,17 +580,6 @@ class ExpressionOpCounter(CombineMapper): map_bitwise_xor = map_bitwise_or map_bitwise_and = map_bitwise_or - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - def map_logical_not(self, expr): - return self.rec(expr.child) - - #def map_logical_or(self, expr): - # return sum(self.rec(child) for child in expr.children) - - #map_logical_and = map_logical_or - def map_if(self, expr): warn_with_kernel(self.knl, "summing_if_branches_ops", "ExpressionOpCounter counting ops as sum of " @@ -672,36 +661,9 @@ class LocalSubscriptCounter(CombineMapper): map_product = map_sum - def map_quotient(self, expr, *args): - return self.rec(expr.numerator) + self.rec(expr.denominator) - - map_floor_div = map_quotient - map_remainder = map_quotient - - def map_power(self, expr): - return self.rec(expr.base) + self.rec(expr.exponent) - - def 
map_left_shift(self, expr): - return self.rec(expr.shiftee)+self.rec(expr.shift) - - map_right_shift = map_left_shift - - def map_bitwise_not(self, expr): - return self.rec(expr.child) - - def map_bitwise_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_bitwise_xor = map_bitwise_or - map_bitwise_and = map_bitwise_or - def map_comparison(self, expr): return self.rec(expr.left)+self.rec(expr.right) - map_logical_not = map_bitwise_not - map_logical_or = map_bitwise_or - map_logical_and = map_logical_or - def map_if(self, expr): warn_with_kernel(self.knl, "summing_if_branches_lsubs", "LocalSubscriptCounter counting LMEM accesses as sum " @@ -716,8 +678,6 @@ class LocalSubscriptCounter(CombineMapper): return self.rec(expr.criterion) + self.rec(expr.then) \ + self.rec(expr.else_) - map_min = map_bitwise_or - def map_common_subexpression(self, expr): raise NotImplementedError("LocalSubscriptCounter encountered " "common_subexpression, " @@ -857,36 +817,6 @@ class GlobalSubscriptCounter(CombineMapper): map_product = map_sum - def map_quotient(self, expr, *args): - return self.rec(expr.numerator) + self.rec(expr.denominator) - - map_floor_div = map_quotient - map_remainder = map_quotient - - def map_power(self, expr): - return self.rec(expr.base) + self.rec(expr.exponent) - - def map_left_shift(self, expr): - return self.rec(expr.shiftee)+self.rec(expr.shift) - - map_right_shift = map_left_shift - - def map_bitwise_not(self, expr): - return self.rec(expr.child) - - def map_bitwise_or(self, expr): - return sum(self.rec(child) for child in expr.children) - - map_bitwise_xor = map_bitwise_or - map_bitwise_and = map_bitwise_or - - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - map_logical_not = map_bitwise_not - map_logical_or = map_bitwise_or - map_logical_and = map_logical_or - def map_if(self, expr): warn_with_kernel(self.knl, "summing_if_branches_gsubs", "GlobalSubscriptCounter counting GMEM accesses as 
" @@ -901,8 +831,6 @@ class GlobalSubscriptCounter(CombineMapper): return self.rec(expr.criterion) + self.rec(expr.then) \ + self.rec(expr.else_) - map_min = map_bitwise_or - def map_common_subexpression(self, expr): raise NotImplementedError("GlobalSubscriptCounter encountered " "common_subexpression, " -- GitLab From bc3512da0b115aaa1076e2d3d5b15ccf85f7511a Mon Sep 17 00:00:00 2001 From: James Stevens Date: Fri, 18 Nov 2016 13:48:44 -0600 Subject: [PATCH 55/55] updating TypeInferenceMapper inport statement --- loopy/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 9b676e69b..2ec5eb0d4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -629,7 +629,7 @@ class LocalSubscriptCounter(CombineMapper): def __init__(self, knl): self.knl = knl - from loopy.expression import TypeInferenceMapper + from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) def combine(self, values): -- GitLab