From 3e1faa27cd6984091c68ddb79d47eea1b8060b7c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:25:35 -0600 Subject: [PATCH 01/59] added truediv to guarded poly --- loopy/statistics.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index a2dcb6846..dd9f3854b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,6 +89,14 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ + def __truediv__(self, other): + if not isinstance(other, int): + raise ValueError("GuardedPwQPolynomial.__truediv__ only valid for " + "type int. Attempted to divide by %s" % (type(other))) + return GuardedPwQPolynomial( + self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), + self.valid_domain) + def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) -- GitLab From f89abc95bce233ba02aece3e39ecb6a82fed2ef8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:31:04 -0600 Subject: [PATCH 02/59] added count_granularity to Op --- loopy/statistics.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index dd9f3854b..7be64eb67 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -484,8 +484,9 @@ class Op(object): # FIXME: This could be done much more briefly by inheriting from Record. 
- def __init__(self, dtype=None, name=None): + def __init__(self, dtype=None, name=None, count_granularity='thread'): self.name = name + self.count_granularity = count_granularity if dtype is None: self.dtype = dtype else: @@ -497,13 +498,16 @@ class Op(object): (self.dtype is None or other.dtype is None or self.dtype == other.dtype) and (self.name is None or other.name is None or - self.name == other.name)) + self.name == other.name) and + (self.count_granularity is None or + other.count_granularity is None or + self.count_granularity == other.count_granularity)) def __hash__(self): return hash(str(self)) def __repr__(self): - return "Op(%s, %s)" % (self.dtype, self.name) + return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} -- GitLab From feee7c9628bf6929b8839597a0e51fc135128732 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:37:11 -0600 Subject: [PATCH 03/59] added count_granularity to MemAccess --- loopy/statistics.py | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7be64eb67..b023b7317 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -547,11 +547,13 @@ class MemAccess(object): # FIXME: This could be done much more briefly by inheriting from Record. 
def __init__(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None): + variable=None, count_granularity='thread'): self.mtype = mtype self.stride = stride self.direction = direction self.variable = variable + self.count_granularity = count_granularity + if dtype is None: self.dtype = dtype else: @@ -569,14 +571,16 @@ class MemAccess(object): "mtype is 'local'") def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None): + variable=None, count_granularity=None): return MemAccess( mtype=mtype if mtype is not None else self.mtype, dtype=dtype if dtype is not None else self.dtype, stride=stride if stride is not None else self.stride, direction=direction if direction is not None else self.direction, variable=variable if variable is not None else self.variable, - ) + count_granularity=count_granularity + if count_granularity is not None + else self.count_granularity) def __eq__(self, other): return isinstance(other, MemAccess) and ( @@ -589,34 +593,23 @@ class MemAccess(object): (self.direction is None or other.direction is None or self.direction == other.direction) and (self.variable is None or other.variable is None or - self.variable == other.variable)) + self.variable == other.variable) and + (self.count_granularity is None or + other.count_granularity is None or + self.count_granularity == other.count_granularity) + ) def __hash__(self): return hash(str(self)) def __repr__(self): - if self.mtype is None: - mtype = 'None' - else: - mtype = self.mtype - if self.dtype is None: - dtype = 'None' - else: - dtype = str(self.dtype) - if self.stride is None: - stride = 'None' - else: - stride = str(self.stride) - if self.direction is None: - direction = 'None' - else: - direction = self.direction - if self.variable is None: - variable = 'None' - else: - variable = self.variable - return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ - + direction + ", " + variable + ")" + return "MemAccess(%s, %s, %s, %s, %s, %s)" 
% ( + self.mtype, + self.dtype, + self.stride, + self.direction, + self.variable, + self.count_granularity) # }}} -- GitLab From f82702defa97bc2dc4ef2a6b65c9ab0644d2f998 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:48:00 -0600 Subject: [PATCH 04/59] setting count_granularity in MemAccessCounter --- loopy/statistics.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b023b7317..45907f579 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -877,7 +877,8 @@ class GlobalMemAccessCounter(MemAccessCounter): # count as uniform access return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, - variable=name): 1} + variable=name, + count_granularity='warp'): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -931,8 +932,15 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), - stride=total_stride, variable=name): 1} + count_granularity = 'thread' if total_stride is not 0 else 'warp' + + return ToCountMap({MemAccess( + mtype='global', + dtype=self.type_inf(expr), + stride=total_stride, + variable=name, + count_granularity=count_granularity + ): 1} ) + self.rec(expr.index) # }}} -- GitLab From 8b4750d5d24bce3f43071885e9c51a0238b420df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 14:59:40 -0600 Subject: [PATCH 05/59] get_mem_access_map() using count_granularity in counting --- loopy/statistics.py | 62 +++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 45907f579..f2989e1aa 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1326,13 +1326,37 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): cache_holder = CacheHolder() - @memoize_in(cache_holder, "insn_count") - def 
get_insn_count(knl, insn_id, uniform=False): + #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? + def get_insn_count(knl, insn_id, + disregard_local_axes=False, + count_granularity='thread'): insn = knl.id_to_insn[insn_id] - return count_insn_runs( - knl, insn, disregard_local_axes=uniform, + ct = count_insn_runs( + knl, insn, disregard_local_axes=disregard_local_axes, count_redundant_work=count_redundant_work) + if count_granularity == 'thread': + return ct + elif count_granularity == 'warp': + return ct/wsize + elif count_granularity == 'group': + from loopy.symbolic import aff_to_expr + _, local_size = knl.get_grid_size_upper_bounds() + group_threads = 1 + for size in local_size: + try: + s = aff_to_expr(size) + except AttributeError: + raise LoopyError("Cannot count insn with group granularity, " + "group size is not integer: %s" + % (local_size)) + group_threads *= s + return ct/group_threads + else: + raise ValueError("get_insn_count: count_granularity '%s' is" + "not allowed. count_granularity must be 'group', " + "'warp', or 'thread'." 
% (count_granularity)) + knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -1358,23 +1382,21 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): - is_uniform = (key.mtype == 'global' and - isinstance(key.stride, int) and - key.stride == 0) + access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, is_uniform)) + * get_insn_count(knl, insn.id, + count_granularity=key.count_granularity)) #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee_g.count_map): - is_uniform = (key.mtype == 'global' and - isinstance(key.stride, int) and - key.stride == 0) + access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, is_uniform)) + * get_insn_count(knl, insn.id, + count_granularity=key.count_granularity)) # for now, don't count writes to local mem elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1384,13 +1406,15 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): if numpy_types: # FIXME: Don't modify in-place - access_map.count_map = dict((MemAccess(mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable), - count) - for mem_access, count in six.iteritems(access_map.count_map)) + access_map.count_map = dict( + (MemAccess( + mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable + ), count) + for mem_access, count in six.iteritems(access_map.count_map)) return access_map -- GitLab From 62932b143e3f9f50d27d2d01c606225ac8c52c47 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 15:08:14 -0600 Subject: [PATCH 06/59] added wsize argument to get_mem_access_map() 
for count_granularity --- loopy/statistics.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f2989e1aa..71c16214d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1258,7 +1258,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): +def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, + wsize=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1321,6 +1322,13 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): """ from loopy.preprocess import preprocess_kernel, infer_unknown_types + if wsize is None: + wsize = 32 + warn_with_kernel(knl, "get_mem_access_map_assumes_warpsize", + "get_mem_access_map: No warp size passed, " + "assuming warp size is %d." + % (wsize)) + class CacheHolder(object): pass -- GitLab From 3c84a24103cab1e1e5d84864b5d656a22cc63e86 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 15:33:22 -0600 Subject: [PATCH 07/59] updated stats tests to use/test count_granularity --- test/test_statistics.py | 95 +++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index eeb4a5a28..8c3c16c0d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -232,7 +232,8 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + n = 512 m = 256 ell = 128 @@ -249,8 +250,8 @@ def test_mem_access_counter_basic(): f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', 
variable='h') ].eval_with_dict(params) - assert f32l == 3*n*m*ell - assert f64l == 2*n*m + assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform + assert f64l == 2*n*m/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') @@ -258,8 +259,8 @@ def test_mem_access_counter_basic(): f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert f32s == n*m*ell - assert f64s == n*m + assert f32s == n*m*ell/32 # /warpsize because these are considered uniform + assert f64s == n*m/32 # /warpsize because these are considered uniform def test_mem_access_counter_reduction(): @@ -272,7 +273,7 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -283,12 +284,12 @@ def test_mem_access_counter_reduction(): f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b') ].eval_with_dict(params) - assert f32l == 2*n*m*ell + assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c') ].eval_with_dict(params) - assert f32s == n*ell + assert f32s == n*ell/32 # /warpsize because these are considered uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -312,7 +313,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, 
count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -329,9 +330,9 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m - assert f64_g_l == n*m - assert f64_g_s == n*m + assert f32_g_l == 2*n*m/32 # /warpsize because these are considered uniform + assert f64_g_l == n*m/32 # /warpsize because these are considered uniform + assert f64_g_s == n*m/32 # /warpsize because these are considered uniform def test_mem_access_counter_specialops(): @@ -348,7 +349,7 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -365,8 +366,8 @@ def test_mem_access_counter_specialops(): f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert f32 == 2*n*m*ell - assert f64 == 2*n*m + assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert f64 == 2*n*m/32 # /warpsize because these are considered uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c') @@ -374,13 +375,13 @@ def test_mem_access_counter_specialops(): f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert f32 == n*m*ell - assert f64 == n*m + assert f32 == n*m*ell/32 # /warpsize because these are considered uniform + assert f64 == n*m/32 # /warpsize because these are considered uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == n*m*ell + n*m + assert tot == (n*m*ell + n*m)/32 # 
/warpsize for uniform def test_mem_access_counter_bitwise(): @@ -400,7 +401,7 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 m = 256 ell = 128 @@ -417,7 +418,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert i32 == 4*n*m+2*n*m*ell + assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') @@ -425,7 +426,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert i32 == n*m+n*m*ell + assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform def test_mem_access_counter_mixed(): @@ -446,7 +447,7 @@ def test_mem_access_counter_mixed(): knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) # noqa n = 512 m = 256 ell = 128 @@ -468,8 +469,8 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/bsize - assert f32uniform == n*m*ell/bsize + assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform + assert f32uniform == n*m*ell/32 # /warpsize for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -479,7 +480,7 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) - assert f64uniform == n*m*ell/bsize + assert f64uniform == n*m*ell/32 # 
/warpsize because these are considered uniform assert f32nonconsec == n*m*ell @@ -534,6 +535,34 @@ def test_mem_access_counter_nonconsec(): assert f64nonconsec == n*m assert f32nonconsec == n*m*ell + mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=64) + f64nonconsec = mem_map64[lp.MemAccess( + 'global', + np.float64, stride=Variable('m'), + direction='load', variable='g') + ].eval_with_dict(params) + f64nonconsec += mem_map64[lp.MemAccess( + 'global', + np.float64, stride=Variable('m'), + direction='load', variable='h') + ].eval_with_dict(params) + f32nonconsec = mem_map64[lp.MemAccess( + 'global', + np.dtype(np.float32), + stride=Variable('m')*Variable('ell'), + direction='load', + variable='a') + ].eval_with_dict(params) + f32nonconsec += mem_map64[lp.MemAccess( + 'global', + np.dtype(np.float32), + stride=Variable('m')*Variable('ell'), + direction='load', + variable='b') + ].eval_with_dict(params) + assert f64nonconsec == 2*n*m + assert f32nonconsec == 3*n*m*ell + def test_mem_access_counter_consec(): @@ -750,22 +779,22 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) loads_a = mem_map.filter_by(direction=['load'], variable=['a'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell + assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'] ).eval_and_sum(params) - assert global_stores == n*m*ell + n*m + assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == 4*n*m*ell*3 + 8*n*m*2 - assert st_bytes == 4*n*m*ell + 8*n*m + assert ld_bytes == 
(4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform + assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -773,8 +802,8 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell - assert f64lall == 2*n*m + assert f32lall == 3*n*m*ell/32 # /warpsize because these are considered uniform + assert f64lall == 2*n*m/32 # /warpsize because these are considered uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -807,7 +836,7 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m + assert s1f64l == 2*n*m/32 # /warpsize because these are considered uniform def test_strided_footprint(): -- GitLab From 70d17d3bfca757a2fb247c98d74dc9c16eb581c2 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 19:53:11 -0600 Subject: [PATCH 08/59] updated tutorial doctests for addition of count_granularity, still needs explanations --- doc/tutorial.rst | 118 +++++++++++++++++++++++------------------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7196dad86..8e05cf0f4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1552,12 +1552,12 @@ information provided. 
Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), add, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'thread')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1615,9 +1615,9 @@ together into keys containing only the specified fields: >>> op_map_dtype = op_map.group_by('dtype') >>> print(lp.stringify_stats_mapping(op_map_dtype)) - Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), None, None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), None, None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), None, None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) ... 
].eval_with_dict(param_dict) @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1669,20 +1669,20 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'warp') ... 
].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'warp') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'warp') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'warp') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) - f32 ld a: 1048576 - f32 st c: 524288 - f64 ld g: 65536 - f64 st e: 65536 + f32 ld a: 32768 + f32 st c: 16384 + f64 ld g: 2048 + f64 st e: 2048 :class:`loopy.ToCountMap` also makes it easy to determine the total amount of data moved in bytes. Suppose we want to know the total amount of global @@ -1693,26 +1693,26 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } + 
MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 + 3/8 * l) * n * m : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 + 1/8 * l) * n * m : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') ... ].eval_with_dict(param_dict) >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) - bytes loaded: 7340032 - bytes stored: 2621440 + bytes loaded: 229376 + bytes stored: 81920 One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1731,12 +1731,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... 
outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e, thread) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access consecutive array @@ -1746,13 +1746,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'thread') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'thread') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'thread') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'thread') ... 
].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,12 +1772,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e, thread) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access *nonconsecutive* @@ -1786,13 +1786,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'thread') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'thread') ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'thread') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'thread') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab From 152e17eabfdce30ab1163e32128598675eec708b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 20:09:53 -0600 Subject: [PATCH 09/59] changed default count granularity to None instead of thread, updated tests and everything else accordingly --- loopy/statistics.py | 84 +++++++++---- test/test_statistics.py | 256 +++++++++++++++++++++++++--------------- 2 files changed, 221 insertions(+), 119 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 71c16214d..0776eb1c3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -484,7 +484,7 @@ class Op(object): # FIXME: This could be done much more briefly by inheriting from Record. - def __init__(self, dtype=None, name=None, count_granularity='thread'): + def __init__(self, dtype=None, name=None, count_granularity=None): self.name = name self.count_granularity = count_granularity if dtype is None: @@ -547,7 +547,7 @@ class MemAccess(object): # FIXME: This could be done much more briefly by inheriting from Record. 
def __init__(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity='thread'): + variable=None, count_granularity=None): self.mtype = mtype self.stride = stride self.direction = direction @@ -571,7 +571,7 @@ class MemAccess(object): "mtype is 'local'") def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity=None): + variable=None, count_granularity=None): return MemAccess( mtype=mtype if mtype is not None else self.mtype, dtype=dtype if dtype is not None else self.dtype, @@ -692,7 +692,8 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function)): 1} + name='func:'+str(expr.function), + count_granularity='thread'): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -702,20 +703,27 @@ class ExpressionOpCounter(CounterBase): assert expr.children return ToCountMap( {Op(dtype=self.type_inf(expr), - name='add'): len(expr.children)-1} + name='add', + count_granularity='thread'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): 1}) + return sum(ToCountMap({Op(dtype=self.type_inf(expr), + name='mul', + count_granularity='thread'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), name='mul'): -1}) + ToCountMap({Op(dtype=self.type_inf(expr), + name='mul', + count_granularity='thread'): -1}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), name='div'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='div', + count_granularity='thread'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -723,23 +731,31 @@ class ExpressionOpCounter(CounterBase): map_remainder = 
map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='pow'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='pow', + count_granularity='thread'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='shift'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='shift', + count_granularity='thread'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): 1}) \ + return ToCountMap({Op(dtype=self.type_inf(expr), + name='bw', + count_granularity='thread'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='bw'): + return ToCountMap({Op(dtype=self.type_inf(expr), + name='bw', + count_granularity='thread'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -761,7 +777,9 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin'): + return ToCountMap({Op(dtype=self.type_inf(expr), + name='maxmin', + count_granularity='thread'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -802,7 +820,8 @@ class LocalMemAccessCounter(MemAccessCounter): array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): - sub_map[MemAccess(mtype='local', dtype=dtype)] = 1 + sub_map[MemAccess(mtype='local', dtype=dtype, + count_granularity='thread')] = 1 return sub_map def map_variable(self, expr): @@ -838,7 +857,8 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, - variable=name): 1} + variable=name, + count_granularity='thread'): 1} ) + 
self.rec(expr.index) def map_subscript(self, expr): @@ -888,7 +908,8 @@ class GlobalMemAccessCounter(MemAccessCounter): "sys.maxsize." % (min_tag_axis)) return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name): 1} + stride=sys.maxsize, variable=name, + count_granularity='thread'): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -1218,8 +1239,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add', count_granularity='thread')].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread')].eval_with_dict(params) # (now use these counts to predict performance) @@ -1247,7 +1268,10 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): % type(insn).__name__) if numpy_types: - op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), + op_map.count_map = dict((Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), count) for op, count in six.iteritems(op_map.count_map)) @@ -1296,25 +1320,29 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, dtype=np.float32, stride=1, direction='load', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess(mtype='global', dtype=np.float32, stride=1, direction='store', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='load', - variable='x') + variable='x', + count_granularity='thread') ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='store', - 
variable='x') + variable='x', + count_granularity='thread') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1343,7 +1371,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=disregard_local_axes, count_redundant_work=count_redundant_work) - if count_granularity == 'thread': + if count_granularity is None: + warn_with_kernel(knl, "get_insn_count_assumes_granularity", + "get_insn_count: No count granularity passed for " + "MemAccess, assuming thread granularity.") + return ct + elif count_granularity == 'thread': return ct elif count_granularity == 'warp': return ct/wsize @@ -1420,7 +1453,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, - variable=mem_access.variable + variable=mem_access.variable, + count_granularity=mem_access.count_granularity ), count) for mem_access, count in six.iteritems(access_map.count_map)) diff --git a/test/test_statistics.py b/test/test_statistics.py index 8c3c16c0d..a5132b94f 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -54,11 +54,13 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'thread') + ].eval_with_dict(params) + i32add = 
op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m assert i32add == n*m*2 @@ -79,8 +81,9 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'thread') + ].eval_with_dict(params) assert f32add == f32mul == n*m*ell op_map_dtype = op_map.group_by('dtype') @@ -108,10 +111,12 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'thread') + ].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? 
assert f64add == n*m @@ -138,14 +143,18 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add')].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', 'thread')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'thread') + ].eval_with_dict(params) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + ].eval_with_dict(params) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'thread') + ].eval_with_dict(params) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'thread') + ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell assert f64add == 3*n*m @@ -174,12 +183,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 
'shift')].eval_with_dict(params) + i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', 'thread')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'thread')].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'thread') + ].eval_with_dict(params) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') + ].eval_with_dict(params) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') + ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell assert i64bw == 2*n*m @@ -208,7 +220,10 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map( + knl, + count_redundant_work=True + )[lp.Op(np.float64, 'mul', 'thread')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -231,7 +246,7 @@ def test_mem_access_counter_basic(): name="basic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) n = 512 @@ -239,25 +254,31 @@ def test_mem_access_counter_basic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64l += 
mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform assert f64l == 2*n*m/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert f32s == n*m*ell/32 # /warpsize because these are considered uniform assert f64s == n*m/32 # /warpsize because these are considered uniform @@ -279,15 +300,18 @@ def test_mem_access_counter_reduction(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) assert f32s == n*ell/32 # /warpsize because these are considered uniform @@ -355,30 +379,37 @@ def test_mem_access_counter_specialops(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + 
count_granularity='warp') ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform assert f64 == 2*n*m/32 # /warpsize because these are considered uniform f32 = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert f32 == n*m*ell/32 # /warpsize because these are considered uniform assert f64 == n*m/32 # /warpsize because these are considered uniform - filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g']) + filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], + count_granularity='warp') #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) assert tot == (n*m*ell + n*m)/32 # /warpsize for uniform @@ -407,24 +438,30 @@ def test_mem_access_counter_bitwise(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='a') + stride=0, direction='load', variable='a', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - 
stride=0, direction='load', variable='b') + stride=0, direction='load', variable='b', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform i32 = mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='c') + stride=0, direction='store', variable='c', + count_granularity='warp') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform @@ -453,32 +490,39 @@ def test_mem_access_counter_mixed(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='g') + stride=0, direction='load', variable='g', + count_granularity='warp') ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='load', variable='h') + stride=0, direction='load', variable='h', + count_granularity='warp') ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, - stride=0, direction='load', variable='x') + stride=0, direction='load', variable='x', + count_granularity='warp') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', - variable='a') + stride=Variable('m'), direction='load', + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec 
+= mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m'), direction='load', - variable='b') + stride=Variable('m'), direction='load', + variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform assert f32uniform == n*m*ell/32 # /warpsize for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, - stride=0, direction='store', variable='e') + stride=0, direction='store', variable='e', + count_granularity='warp') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m'), direction='store', - variable='c') + stride=Variable('m'), direction='store', + variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64uniform == n*m*ell/32 # /warpsize because these are considered uniform assert f32nonconsec == n*m*ell @@ -506,31 +550,37 @@ def test_mem_access_counter_nonconsec(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', - variable='g') + stride=Variable('m'), direction='load', + variable='g', + count_granularity='thread') ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='load', - variable='h') + stride=Variable('m'), direction='load', + variable='h', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), - direction='load', variable='a') + stride=Variable('m')*Variable('ell'), + direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=Variable('m')*Variable('ell'), - direction='load', variable='b') + stride=Variable('m')*Variable('ell'), + direction='load', variable='b', + 
count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell f64nonconsec = mem_map[lp.MemAccess('global', np.float64, - stride=Variable('m'), direction='store', - variable='e') + stride=Variable('m'), direction='store', + variable='e', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, - stride=Variable('m')*Variable('ell'), - direction='store', variable='c') + stride=Variable('m')*Variable('ell'), + direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -539,26 +589,30 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), - direction='load', variable='g') + direction='load', variable='g', + count_granularity='thread') ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), - direction='load', variable='h') + direction='load', variable='h', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', - variable='a') + variable='a', + count_granularity='thread') ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', - variable='b') + variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -586,25 +640,31 @@ def test_mem_access_counter_consec(): params = {'n': n, 'm': m, 'ell': ell} f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='load', variable='g') + stride=1, direction='load', variable='g', + count_granularity='thread') ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, - stride=1, 
direction='load', variable='h') + stride=1, direction='load', variable='h', + count_granularity='thread') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') + stride=1, direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), - stride=1, direction='load', variable='b') + stride=1, direction='load', variable='b', + count_granularity='thread') ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, - stride=1, direction='store', variable='e') + stride=1, direction='store', variable='e', + count_granularity='thread') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c') + stride=1, direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -688,16 +748,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul') + lp.Op(np.float32, 'mul', 'thread') ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add') + lp.Op(np.float32, 'add', 'thread') ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add') + lp.Op(np.int32, 'add', 'thread') ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul') + lp.Op(np.dtype(np.int32), 'mul', 'thread') ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -705,17 +765,20 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_mem_access_map(knl, count_redundant_work=True) f32s1lb = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='b') + stride=1, direction='load', variable='b', + count_granularity='thread') ].eval_with_dict(params) f32s1la = 
op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') + stride=1, direction='load', variable='a', + count_granularity='thread') ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize f32coal = op_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c') + stride=1, direction='store', variable='c', + count_granularity='thread') ].eval_with_dict(params) assert f32coal == n*ell @@ -723,7 +786,8 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), - direction='load') + direction='load', + count_granularity='thread') ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -773,7 +837,7 @@ def test_summations_and_filters(): name="basic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, - dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) n = 512 m = 256 ell = 128 @@ -781,17 +845,21 @@ def test_summations_and_filters(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) - loads_a = mem_map.filter_by(direction=['load'], variable=['a'] + loads_a = mem_map.filter_by(direction=['load'], variable=['a'], + count_granularity=['warp'] ).eval_and_sum(params) assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform - global_stores = mem_map.filter_by(mtype=['global'], direction=['store'] + global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], + count_granularity=['warp'] ).eval_and_sum(params) assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform - ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] + ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], + count_granularity=['warp'] ).to_bytes().eval_and_sum(params) - st_bytes = 
mem_map.filter_by(mtype=['global'], direction=['store'] + st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], + count_granularity=['warp'] ).to_bytes().eval_and_sum(params) assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform -- GitLab From 88e505b077c86819027226118457622e92b2c625 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 21:38:43 -0600 Subject: [PATCH 10/59] flake8 fixes --- loopy/statistics.py | 11 ++++++----- test/test_statistics.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0776eb1c3..fd1e2039c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -27,7 +27,6 @@ import six import loopy as lp from islpy import dim_type import islpy as isl -from pytools import memoize_in from pymbolic.mapper import CombineMapper from functools import reduce from loopy.kernel.data import ( @@ -1239,8 +1238,10 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='thread')].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread')].eval_with_dict(params) + f32add = op_map[Op(np.float32, 'add', count_granularity='thread') + ].eval_with_dict(params) + f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread') + ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1360,8 +1361,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - cache_holder = CacheHolder() - + #cache_holder = CacheHolder() + #from pytools import memoize_in #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, disregard_local_axes=False, diff --git a/test/test_statistics.py b/test/test_statistics.py index a5132b94f..b3f4d2226 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -191,7 +191,7 @@ def test_op_counter_bitwise(): i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') ].eval_with_dict(params) i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') - ].eval_with_dict(params) + ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell assert i64bw == 2*n*m -- GitLab From c5cff697ae74e5127dd3b5358562c641d0d53896 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 12 Jan 2018 21:42:22 -0600 Subject: [PATCH 11/59] factoring out m in polynomial --- doc/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8e05cf0f4..c45e711f5 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1703,8 +1703,8 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 + 3/8 * l) * n * m : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 + 1/8 * l) * n * m : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 * m + 3/8 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 * m + 1/8 * m * l) * n : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... 
].eval_with_dict(param_dict) -- GitLab From bc1b9f71a8c46c78ed0c66d6682a2f5f481d0a55 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 13 Jan 2018 17:08:42 -0600 Subject: [PATCH 12/59] changed dependency url --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8f36d125..91e81ee51 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ setup(name="loo.py", }, dependency_links=[ - "hg+https://bitbucket.org/inducer/f2py#egg=f2py==0.3.1" + "git+https://github.com/pearu/f2py.git" ], scripts=["bin/loopy"], -- GitLab From 777eb04ad97768ff26d9567cfbbc095482dd329f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:02:18 -0600 Subject: [PATCH 13/59] renamed warp->subgroup --- loopy/statistics.py | 24 ++++---- test/test_statistics.py | 129 ++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 76 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index fd1e2039c..e3dc2f503 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -897,7 +897,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='warp'): 1} + count_granularity='subgroup'): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -952,7 +952,7 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'thread' if total_stride is not 0 else 'warp' + count_granularity = 'thread' if total_stride is not 0 else 'subgroup' return ToCountMap({MemAccess( mtype='global', @@ -1284,7 +1284,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): # {{{ get_mem_access_map def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - wsize=None): + subgroup_size=None): """Count the number of memory accesses in a loopy kernel. 
:arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1351,12 +1351,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - if wsize is None: - wsize = 32 - warn_with_kernel(knl, "get_mem_access_map_assumes_warpsize", - "get_mem_access_map: No warp size passed, " - "assuming warp size is %d." - % (wsize)) + if subgroup_size is None: + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", + "get_mem_access_map: No subgroup size passed, " + "assuming subgroup size is %d." + % (subgroup_size)) class CacheHolder(object): pass @@ -1379,8 +1379,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, return ct elif count_granularity == 'thread': return ct - elif count_granularity == 'warp': - return ct/wsize + elif count_granularity == 'subgroup': + return ct/subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1397,7 +1397,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, else: raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity must be 'group', " - "'warp', or 'thread'." % (count_granularity)) + "'subgroup', or 'thread'." 
% (count_granularity)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) diff --git a/test/test_statistics.py b/test/test_statistics.py index b3f4d2226..b93e26264 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -247,7 +247,7 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 @@ -255,33 +255,33 @@ def test_mem_access_counter_basic(): params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 3*n*m*ell/32 # /warpsize because these are considered uniform - assert f64l == 2*n*m/32 # /warpsize because these are considered uniform + assert f32l == 3*n*m*ell/32 # /subgroup_size because these are uniform + assert f64l == 2*n*m/32 # /subgroup_size because these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - 
count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*m*ell/32 # /warpsize because these are considered uniform - assert f64s == n*m/32 # /warpsize because these are considered uniform + assert f32s == n*m*ell/32 # /subgroup_size because these are uniform + assert f64s == n*m/32 # /subgroup_size because these are uniform def test_mem_access_counter_reduction(): @@ -294,26 +294,26 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert f32l == 2*n*m*ell/32 # /subgroup_size because these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*ell/32 # /warpsize because these are considered uniform + assert f32s == n*ell/32 # /subgroup_size because these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -337,7 +337,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + 
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 @@ -354,9 +354,9 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m/32 # /warpsize because these are considered uniform - assert f64_g_l == n*m/32 # /warpsize because these are considered uniform - assert f64_g_s == n*m/32 # /warpsize because these are considered uniform + assert f32_g_l == 2*n*m/32 # /subgroup_size because these are uniform + assert f64_g_l == n*m/32 # /subgroup_size because these are uniform + assert f64_g_s == n*m/32 # /subgroup_size because these are uniform def test_mem_access_counter_specialops(): @@ -373,46 +373,46 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == 2*n*m*ell/32 # /warpsize because these are considered uniform - assert f64 == 2*n*m/32 # /warpsize because 
these are considered uniform + assert f32 == 2*n*m*ell/32 # /subgroup_size because these are uniform + assert f64 == 2*n*m/32 # /subgroup_size because these are uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == n*m*ell/32 # /warpsize because these are considered uniform - assert f64 == n*m/32 # /warpsize because these are considered uniform + assert f32 == n*m*ell/32 # /subgroup_size because these are uniform + assert f64 == n*m/32 # /subgroup_size because these are uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity='warp') + count_granularity='subgroup') #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == (n*m*ell + n*m)/32 # /warpsize for uniform + assert tot == (n*m*ell + n*m)/32 # /subgroup_size for uniform def test_mem_access_counter_bitwise(): @@ -432,38 +432,38 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity='warp') + 
count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (4*n*m+2*n*m*ell)/32 # /warpsize for uniform + assert i32 == (4*n*m+2*n*m*ell)/32 # /subgroup_size for uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (n*m+n*m*ell)/32 # /warpsize because these are considered uniform + assert i32 == (n*m+n*m*ell)/32 # /subgroup_size because these are uniform def test_mem_access_counter_mixed(): @@ -484,22 +484,22 @@ def test_mem_access_counter_mixed(): knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) # noqa n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', @@ -511,20 +511,20 @@ def 
test_mem_access_counter_mixed(): variable='b', count_granularity='thread') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/32 # /warpsize for uniform - assert f32uniform == n*m*ell/32 # /warpsize for uniform + assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform + assert f32uniform == n*m*ell/32 # /subgroup_size for uniform assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='warp') + count_granularity='subgroup') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', count_granularity='thread') ].eval_with_dict(params) - assert f64uniform == n*m*ell/32 # /warpsize because these are considered uniform + assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform assert f32nonconsec == n*m*ell @@ -585,7 +585,8 @@ def test_mem_access_counter_nonconsec(): assert f64nonconsec == n*m assert f32nonconsec == n*m*ell - mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=64) + mem_map64 = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=64) f64nonconsec = mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), @@ -843,26 +844,26 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, wsize=32) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell/32 # /warpsize because these are considered uniform + assert loads_a == 2*n*m*ell/32 # /subgroup_size because these are uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['warp'] + count_granularity=['subgroup'] 
).eval_and_sum(params) - assert global_stores == (n*m*ell + n*m)/32 # /warpsize for uniform + assert global_stores == (n*m*ell + n*m)/32 # /subgroup_size for uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['warp'] + count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /warpsize for uniform - assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /warpsize for uniform + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /subgroup_size for uniform + assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /subgroup_size for uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -870,8 +871,8 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell/32 # /warpsize because these are considered uniform - assert f64lall == 2*n*m/32 # /warpsize because these are considered uniform + assert f32lall == 3*n*m*ell/32 # /subgroup_size because these are uniform + assert f64lall == 2*n*m/32 # /subgroup_size because these are uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -904,7 +905,7 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m/32 # /warpsize because these are considered uniform + assert s1f64l == 2*n*m/32 # /subgroup_size because these are uniform def test_strided_footprint(): -- GitLab From 80c6f8b4ad33f31cd5e748103e2c895601ad0f5a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:12:16 -0600 
Subject: [PATCH 14/59] renaming thread->workitem --- loopy/statistics.py | 62 +++++++++++------------ test/test_statistics.py | 107 ++++++++++++++++++++-------------------- 2 files changed, 85 insertions(+), 84 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index e3dc2f503..765c75a8f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -529,7 +529,7 @@ class MemAccess(object): .. attribute:: stride An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all threads access the same item). + indicates a uniform access (i.e. all work items access the same item). .. attribute:: direction @@ -692,7 +692,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - count_granularity='thread'): 1} + count_granularity='workitem'): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -703,7 +703,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity='thread'): len(expr.children)-1} + count_granularity='workitem'): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): @@ -711,18 +711,18 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='thread'): 1}) + count_granularity='workitem'): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='thread'): -1}) + count_granularity='workitem'): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -732,14 +732,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, 
expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -748,13 +748,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='thread'): 1}) \ + count_granularity='workitem'): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='thread'): + count_granularity='workitem'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -778,7 +778,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity='thread'): + count_granularity='workitem'): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -820,7 +820,7 @@ class LocalMemAccessCounter(MemAccessCounter): if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, - count_granularity='thread')] = 1 + count_granularity='workitem')] = 1 return sub_map def map_variable(self, expr): @@ -857,7 +857,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='thread'): 1} + count_granularity='workitem'): 1} ) + self.rec(expr.index) def map_subscript(self, expr): @@ -908,7 +908,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=sys.maxsize, variable=name, - count_granularity='thread'): 
1} + count_granularity='workitem'): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -952,7 +952,7 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'thread' if total_stride is not 0 else 'subgroup' + count_granularity = 'workitem' if total_stride is not 0 else 'subgroup' return ToCountMap({MemAccess( mtype='global', @@ -1238,9 +1238,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='thread') + f32add = op_map[Op(np.float32, 'add', count_granularity='workitem') ].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='thread') + f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1322,28 +1322,28 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_g_st_a = mem_map[MemAccess(mtype='global', dtype=np.float32, stride=1, direction='store', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='load', variable='x', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32_s1_l_st_x = mem_map[MemAccess(mtype='local', dtype=np.float32, stride=1, direction='store', variable='x', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) # (now use these counts to predict performance) @@ -1366,7 +1366,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, disregard_local_axes=False, - count_granularity='thread'): + count_granularity='workitem'): insn = knl.id_to_insn[insn_id] ct = count_insn_runs( knl, insn, disregard_local_axes=disregard_local_axes, @@ -1375,16 +1375,16 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " - "MemAccess, assuming thread granularity.") + "MemAccess, assuming workitem granularity.") return ct - elif count_granularity == 'thread': + elif count_granularity == 'workitem': return ct elif count_granularity == 'subgroup': return ct/subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() - group_threads = 1 + group_workitems = 1 for size in local_size: try: s = aff_to_expr(size) @@ -1392,12 +1392,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, raise LoopyError("Cannot count insn with group granularity, " "group size is not integer: %s" % (local_size)) - group_threads *= s - return ct/group_threads + group_workitems *= s + return ct/group_workitems else: raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity must be 'group', " - "'subgroup', or 'thread'." % (count_granularity)) + "'subgroup', or 'workitem'." % (count_granularity)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) @@ -1468,14 +1468,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl): - """Count the number of synchronization events each thread encounters in a + """Count the number of synchronization events each work item encounters in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. 
:return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per - thread. + work item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1684,7 +1684,7 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each thread encounters in a + """Count the number of synchronization events each work item encounters in a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. diff --git a/test/test_statistics.py b/test/test_statistics.py index b93e26264..f8735553f 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -54,12 +54,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'thread') + f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -81,8 +81,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'thread') + f32add = op_map[lp.Op(np.float32, 'add', 
'workitem')].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'workitem') ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -111,11 +111,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'thread') + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -143,17 +143,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', 'thread')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'thread') + f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', 'workitem')].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'workitem') ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'thread') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') ].eval_with_dict(params) - f64rsq = 
op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'thread') + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'workitem') ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'thread') + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'workitem') ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -183,14 +183,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', 'thread')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'thread')].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'thread') + i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', 'workitem')].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'workitem') + ].eval_with_dict(params) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'workitem') ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'thread') + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'workitem') ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'thread') + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'workitem') ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -223,7 +224,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', 'thread')] + )[lp.Op(np.float64, 'mul', 'workitem')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -504,12 +505,12 @@ def test_mem_access_counter_mixed(): f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') 
].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform assert f32uniform == n*m*ell/32 # /subgroup_size for uniform @@ -522,7 +523,7 @@ def test_mem_access_counter_mixed(): f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform assert f32nonconsec == n*m*ell @@ -552,22 +553,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -575,12 +576,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = 
mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -591,13 +592,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -605,7 +606,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -613,7 +614,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -642,30 +643,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', 
variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -749,16 +750,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', 'thread') + lp.Op(np.float32, 'mul', 'workitem') ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', 'thread') + lp.Op(np.float32, 'add', 'workitem') ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', 'thread') + lp.Op(np.int32, 'add', 'workitem') ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', 'thread') + lp.Op(np.dtype(np.int32), 'mul', 'workitem') ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -767,11 +768,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) f32s1la = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -779,7 +780,7 @@ def test_all_counters_parallel_matmul(): f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert f32coal == n*ell @@ -788,7 +789,7 @@ def 
test_all_counters_parallel_matmul(): count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity='thread') + count_granularity='workitem') ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 -- GitLab From 46f9acabeb46295fb9780843fae2806437461862 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:47:13 -0600 Subject: [PATCH 15/59] renaming thread->workitem in tutorial --- doc/tutorial.rst | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index c45e711f5..b94708ed3 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1552,12 +1552,12 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), div, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), mul, thread) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), mul, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), add, thread) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), add, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul, workitem) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add, workitem) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul, workitem) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add, workitem) : [m, l, 
n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add', 'thread')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', 'thread')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'thread')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', 'thread')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'thread')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', 'thread')].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'workitem')].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1657,7 +1657,7 @@ we'll continue using the kernel from the previous example: data type accessed. - stride: An :class:`int` that specifies stride of the memory access. A stride - of 0 indicates a uniform access (i.e. all threads access the same item). + of 0 indicates a uniform access (i.e. all work-items access the same item). - direction: A :class:`str` that specifies the direction of memory access as **load** or **store**. @@ -1720,7 +1720,7 @@ achieved memory bandwidth in byte/sec or performance in FLOP/sec. 
~~~~~~~~~~~ Since we have not tagged any of the inames or parallelized the kernel across -threads (which would have produced iname tags), :func:`loopy.get_mem_access_map` +work-items (which would have produced iname tags), :func:`loopy.get_mem_access_map` considers the memory accesses *uniform*, so the *stride* of each access is 0. Now we'll parallelize the kernel and count the array accesses again. The resulting :class:`islpy.PwQPolynomial` will be more complicated this time. @@ -1731,28 +1731,28 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e, workitem) : [m, l, n] -> { ... } -With this parallelization, consecutive threads will access consecutive array +With this parallelization, consecutive work-items will access consecutive array elements in memory. 
The polynomials are a bit more complicated now due to the parallelization, but when we evaluate them, we see that the total number of array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'thread') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'workitem') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'thread') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'thread') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'thread') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'workitem') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1772,27 +1772,27 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h, thread) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e, thread) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : [m, l, n] -> { ... 
} + MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h, workitem) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e, workitem) : [m, l, n] -> { ... } -With this parallelization, consecutive threads will access *nonconsecutive* +With this parallelization, consecutive work-items will access *nonconsecutive* array elements in memory. The total number of array accesses still has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'thread') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'workitem') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'thread') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'thread') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'workitem') ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'thread') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'workitem') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1819,7 +1819,7 @@ Counting synchronization events ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`loopy.get_synchronization_map` counts the number of synchronization -events per **thread** in a kernel. First, we'll call this function on the +events per **work-item** in a kernel. First, we'll call this function on the kernel from the previous example: .. 
doctest:: @@ -1877,8 +1877,8 @@ Now to make things more interesting, we'll create a kernel with barriers: } } -In this kernel, when a thread performs the second instruction it uses data -produced by *different* threads during the first instruction. Because of this, +In this kernel, when a work-item performs the second instruction it uses data +produced by *different* work-items during the first instruction. Because of this, barriers are required for correct execution, so loopy inserts them. Now we'll count the barriers using :func:`loopy.get_synchronization_map`: @@ -1890,7 +1890,7 @@ count the barriers using :func:`loopy.get_synchronization_map`: kernel_launch : { 1 } -Based on the kernel code printed above, we would expect each thread to +Based on the kernel code printed above, we would expect each work-item to encounter 50x10x2 barriers, which matches the result from :func:`loopy.get_synchronization_map`. In this case, the number of barriers does not depend on any inames, so we can pass an empty dictionary to -- GitLab From 9ba5d96d5f7c529b33694eed29745f04fff819b3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 03:48:34 -0600 Subject: [PATCH 16/59] renaming warp->subgroup in tutorial --- doc/tutorial.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index b94708ed3..7a2fb04fc 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } 
- MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1669,13 +1669,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'warp') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'subgroup') ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'warp') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'subgroup') ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'warp') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'subgroup') ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'warp') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'subgroup') ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1693,12 +1693,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, warp) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, warp) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, warp) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } 
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') -- GitLab From 61595cb3a877980c4827b0b7a355d3d69f9a02df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 07:45:40 -0600 Subject: [PATCH 17/59] inheriting from record in Op and MemAccess --- loopy/statistics.py | 73 ++++++++++----------------------------------- 1 file changed, 16 insertions(+), 57 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 765c75a8f..4987b27df 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -32,6 +32,7 @@ from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, temp_var_scope) from loopy.diagnostic import warn_with_kernel, LoopyError +from pytools import Record __doc__ = """ @@ -466,7 +467,7 @@ def stringify_stats_mapping(m): # {{{ Op descriptor -class Op(object): +class Op(Record): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -481,26 +482,14 @@ class Op(object): """ - # FIXME: This could be done much more briefly by inheriting from Record. 
- def __init__(self, dtype=None, name=None, count_granularity=None): - self.name = name - self.count_granularity = count_granularity if dtype is None: - self.dtype = dtype + Record.__init__(self, dtype=dtype, name=name, + count_granularity=count_granularity) else: from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) - - def __eq__(self, other): - return isinstance(other, Op) and ( - (self.dtype is None or other.dtype is None or - self.dtype == other.dtype) and - (self.name is None or other.name is None or - self.name == other.name) and - (self.count_granularity is None or - other.count_granularity is None or - self.count_granularity == other.count_granularity)) + Record.__init__(self, dtype=to_loopy_type(dtype), name=name, + count_granularity=count_granularity) def __hash__(self): return hash(str(self)) @@ -513,7 +502,7 @@ class Op(object): # {{{ MemAccess descriptor -class MemAccess(object): +class MemAccess(Record): """A descriptor for a type of memory access. .. attribute:: mtype @@ -547,17 +536,6 @@ class MemAccess(object): def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): - self.mtype = mtype - self.stride = stride - self.direction = direction - self.variable = variable - self.count_granularity = count_granularity - - if dtype is None: - self.dtype = dtype - else: - from loopy.types import to_loopy_type - self.dtype = to_loopy_type(dtype) #TODO currently giving all lmem access stride=None if (mtype == 'local') and (stride is not None): @@ -569,34 +547,15 @@ class MemAccess(object): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - def copy(self, mtype=None, dtype=None, stride=None, direction=None, - variable=None, count_granularity=None): - return MemAccess( - mtype=mtype if mtype is not None else self.mtype, - dtype=dtype if dtype is not None else self.dtype, - stride=stride if stride is not None else self.stride, - direction=direction if 
direction is not None else self.direction, - variable=variable if variable is not None else self.variable, - count_granularity=count_granularity - if count_granularity is not None - else self.count_granularity) - - def __eq__(self, other): - return isinstance(other, MemAccess) and ( - (self.mtype is None or other.mtype is None or - self.mtype == other.mtype) and - (self.dtype is None or other.dtype is None or - self.dtype == other.dtype) and - (self.stride is None or other.stride is None or - self.stride == other.stride) and - (self.direction is None or other.direction is None or - self.direction == other.direction) and - (self.variable is None or other.variable is None or - self.variable == other.variable) and - (self.count_granularity is None or - other.count_granularity is None or - self.count_granularity == other.count_granularity) - ) + if dtype is None: + Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, + direction=direction, variable=variable, + count_granularity=count_granularity) + else: + from loopy.types import to_loopy_type + Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), stride=stride, + direction=direction, variable=variable, + count_granularity=count_granularity) def __hash__(self): return hash(str(self)) -- GitLab From 3d8945c39cd8652962b93223a62c3e74ea34febf Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 22 Jan 2018 08:35:13 -0600 Subject: [PATCH 18/59] line too long, shortened --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 4987b27df..3c88a56fd 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -553,8 +553,8 @@ class MemAccess(Record): count_granularity=count_granularity) else: from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), stride=stride, - direction=direction, variable=variable, + Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), + 
stride=stride, direction=direction, variable=variable, count_granularity=count_granularity) def __hash__(self): -- GitLab From db6d9a4aa0612f6082fe0d5baf106b6db396f159 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 00:22:55 -0600 Subject: [PATCH 19/59] updated docstrings --- loopy/statistics.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3c88a56fd..05009ce49 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -478,7 +478,12 @@ class Op(Record): .. attribute:: name A :class:`str` that specifies the kind of arithmetic operation as - *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + *add*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc. + + .. attribute:: count_granularity + + A :class:`str` that specifies whether this operation should be counted + once per *workitem*, *subgroup*, or *group*. """ @@ -530,6 +535,11 @@ class MemAccess(Record): A :class:`str` that specifies the variable name of the data accessed. + .. attribute:: count_granularity + + A :class:`str` that specifies whether this operation should be counted + once per *workitem*, *subgroup*, or *group*. + """ # FIXME: This could be done much more briefly by inheriting from Record. @@ -1259,6 +1269,11 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg subgroup_size: A :class:`int` that specifies the sub-group size. This + is used, e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. The default + subgroup_size is 32. + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1380,6 +1395,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, direction="store") # FIXME: (!!!!) 
for now, don't count writes to local mem + # (^this is updated in a branch that will be merged soon) # use count excluding local index tags for uniform accesses for key, val in six.iteritems(access_expr.count_map): -- GitLab From 93bda9b2ae96242ba3b41dd20f10e1560dd395e8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 00:28:44 -0600 Subject: [PATCH 20/59] removing finished TODO --- loopy/statistics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 05009ce49..dbbdb97da 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -542,8 +542,6 @@ class MemAccess(Record): """ - # FIXME: This could be done much more briefly by inheriting from Record. - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): -- GitLab From d1433a2ab7cc087e94eaffd3a23038a7c1f4e1df Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 01:02:18 -0600 Subject: [PATCH 21/59] no longer modifying maps in place when converting to numpy types --- loopy/statistics.py | 50 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index dbbdb97da..6c7f20d36 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1236,14 +1236,19 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): % type(insn).__name__) if numpy_types: - op_map.count_map = dict((Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - count) - for op, count in six.iteritems(op_map.count_map)) - - return op_map + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity + ) + , ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map # }}} @@ -1420,19 +1425,22 @@ def get_mem_access_map(knl, numpy_types=True, 
count_redundant_work=False, % type(insn).__name__) if numpy_types: - # FIXME: Don't modify in-place - access_map.count_map = dict( - (MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - stride=mem_access.stride, - direction=mem_access.direction, - variable=mem_access.variable, - count_granularity=mem_access.count_granularity - ), count) - for mem_access, count in six.iteritems(access_map.count_map)) - - return access_map + return ToCountMap( + init_dict=dict( + (MemAccess( + mtype=mem_access.mtype, + dtype=mem_access.dtype.numpy_dtype, + stride=mem_access.stride, + direction=mem_access.direction, + variable=mem_access.variable, + count_granularity=mem_access.count_granularity + ) + , ct) + for mem_access, ct in six.iteritems(access_map.count_map)), + val_type=access_map.val_type + ) + else: + return access_map # }}} -- GitLab From dcd7259fc0a0ba6c243b5e5b7587348201e6768e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 03:50:15 -0600 Subject: [PATCH 22/59] fixing formatting problems --- loopy/statistics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6c7f20d36..a56be22a3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1241,8 +1241,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): (Op( dtype=op.dtype.numpy_dtype, name=op.name, - count_granularity=op.count_granularity - ) + count_granularity=op.count_granularity) , ct) for op, ct in six.iteritems(op_map.count_map)), val_type=op_map.val_type @@ -1433,8 +1432,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable, - count_granularity=mem_access.count_granularity - ) + count_granularity=mem_access.count_granularity) , ct) for mem_access, ct in six.iteritems(access_map.count_map)), val_type=access_map.val_type -- GitLab From 
6dfc346bc735f8165bfdd81b0578042663b0292f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 04:16:58 -0600 Subject: [PATCH 23/59] ensuring count_granularity values are valid in Op.__init__ and MemAccess.__init__ --- loopy/statistics.py | 18 ++++++++++++++++-- test/test_statistics.py | 25 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a56be22a3..4dac09c0d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -487,7 +487,13 @@ class Op(Record): """ + count_granularity_options = ["workitem", "subgroup", "group", None] + def __init__(self, dtype=None, name=None, count_granularity=None): + if not count_granularity in self.count_granularity_options: + raise ValueError("Op.__init__: count_granularity '%s' is" + "not allowed. count_granularity options: %s" + % (count_granularity, self.count_granularity_options)) if dtype is None: Record.__init__(self, dtype=dtype, name=name, count_granularity=count_granularity) @@ -542,6 +548,8 @@ class MemAccess(Record): """ + count_granularity_options = ["workitem", "subgroup", "group", None] + def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -555,6 +563,11 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") + if not count_granularity in self.count_granularity_options: + raise ValueError("Op.__init__: count_granularity '%s' is" + "not allowed. 
count_granularity options: %s" + % (count_granularity, self.count_granularity_options)) + if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, direction=direction, variable=variable, @@ -1371,9 +1384,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, group_workitems *= s return ct/group_workitems else: + # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" - "not allowed. count_granularity must be 'group', " - "'subgroup', or 'workitem'." % (count_granularity)) + "not allowed. count_granularity options: %s" + % (count_granularity, MemAccess.count_granularity_options)) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) diff --git a/test/test_statistics.py b/test/test_statistics.py index f8735553f..82f9f0886 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -672,6 +672,31 @@ def test_mem_access_counter_consec(): assert f32consec == n*m*ell +def test_count_granularity_val_checks(): + + try: + lp.MemAccess(count_granularity='workitem') + lp.MemAccess(count_granularity='subgroup') + lp.MemAccess(count_granularity='group') + lp.MemAccess(count_granularity=None) + assert True + lp.MemAccess(count_granularity='bushel') + assert False + except ValueError: + assert True + + try: + lp.Op(count_granularity='workitem') + lp.Op(count_granularity='subgroup') + lp.Op(count_granularity='group') + lp.Op(count_granularity=None) + assert True + lp.Op(count_granularity='bushel') + assert False + except ValueError: + assert True + + def test_barrier_counter_nobarriers(): knl = lp.make_kernel( -- GitLab From 2705f321105f65ffa94eafa7c9530add3e062ec1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 23 Jan 2018 04:38:17 -0600 Subject: [PATCH 24/59] fixing more flake8 issues --- loopy/statistics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py 
b/loopy/statistics.py index 4dac09c0d..acd2755ba 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -490,7 +490,7 @@ class Op(Record): count_granularity_options = ["workitem", "subgroup", "group", None] def __init__(self, dtype=None, name=None, count_granularity=None): - if not count_granularity in self.count_granularity_options: + if count_granularity not in self.count_granularity_options: raise ValueError("Op.__init__: count_granularity '%s' is" "not allowed. count_granularity options: %s" % (count_granularity, self.count_granularity_options)) @@ -563,7 +563,7 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - if not count_granularity in self.count_granularity_options: + if count_granularity not in self.count_granularity_options: raise ValueError("Op.__init__: count_granularity '%s' is" "not allowed. count_granularity options: %s" % (count_granularity, self.count_granularity_options)) @@ -1254,8 +1254,8 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): (Op( dtype=op.dtype.numpy_dtype, name=op.name, - count_granularity=op.count_granularity) - , ct) + count_granularity=op.count_granularity), + ct) for op, ct in six.iteritems(op_map.count_map)), val_type=op_map.val_type ) @@ -1446,8 +1446,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable, - count_granularity=mem_access.count_granularity) - , ct) + count_granularity=mem_access.count_granularity), + ct) for mem_access, ct in six.iteritems(access_map.count_map)), val_type=access_map.val_type ) -- GitLab From 8c75f8eeeeefb7075bfd89c4b534125be0e15664 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 03:19:28 -0600 Subject: [PATCH 25/59] changed truediv->floordiv, added ceildiv --- loopy/statistics.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git 
a/loopy/statistics.py b/loopy/statistics.py index acd2755ba..341c6aaa9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,14 +89,23 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ - def __truediv__(self, other): + def __floordiv__(self, other): if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.__truediv__ only valid for " + raise ValueError("GuardedPwQPolynomial.__floordiv__ only valid for " "type int. Attempted to divide by %s" % (type(other))) return GuardedPwQPolynomial( self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), self.valid_domain) + def ceildiv(self, other): + if not isinstance(other, int): + raise ValueError("GuardedPwQPolynomial.ceildiv only valid for " + "type int. Attempted to divide by %s" % (type(other))) + return GuardedPwQPolynomial( + (self.pwqpolynomial + other - 1).scale_val(isl.Val(1).div(isl.Val(other))), + self.valid_domain) + + def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) @@ -1369,7 +1378,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == 'workitem': return ct elif count_granularity == 'subgroup': - return ct/subgroup_size + return ct//subgroup_size elif count_granularity == 'group': from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1382,7 +1391,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "group size is not integer: %s" % (local_size)) group_workitems *= s - return ct/group_workitems + return ct//group_workitems else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" -- GitLab From 0c6fa60190ac8b225410e6bf5e513048540fcec1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:23:00 -0600 Subject: [PATCH 26/59] fixed count_granularity rounding behavior for groups not evenly divisible by subgroups, updated tests 
--- loopy/statistics.py | 45 ++++++----- test/test_statistics.py | 168 +++++++++++++++++++++++++++++++--------- 2 files changed, 158 insertions(+), 55 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 341c6aaa9..936a840b1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1166,6 +1166,7 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): + insn_inames = knl.insn_inames(insn) if disregard_local_axes: @@ -1363,35 +1364,41 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, #from pytools import memoize_in #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? def get_insn_count(knl, insn_id, - disregard_local_axes=False, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] - ct = count_insn_runs( - knl, insn, disregard_local_axes=disregard_local_axes, - count_redundant_work=count_redundant_work) if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " "MemAccess, assuming workitem granularity.") - return ct - elif count_granularity == 'workitem': - return ct + count_granularity == 'workitem' + + if count_granularity == 'workitem': + return count_insn_runs( + knl, insn, count_redundant_work=count_redundant_work) + + ct_disregard_local = count_insn_runs( + knl, insn, disregard_local_axes=True, + count_redundant_work=count_redundant_work) + + if count_granularity == 'group': + return ct_disregard_local elif count_granularity == 'subgroup': - return ct//subgroup_size - elif count_granularity == 'group': + # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() - group_workitems = 1 - for size in local_size: - try: + global_size, local_size = knl.get_grid_size_upper_bounds() + group_size = 1 + if local_size: + for size in local_size: s = 
aff_to_expr(size) - except AttributeError: - raise LoopyError("Cannot count insn with group granularity, " - "group size is not integer: %s" - % (local_size)) - group_workitems *= s - return ct//group_workitems + if not isinstance(s, int): + raise LoopyError("Cannot count insn with subgroup granularity, " + "group size is not integer: %s" + % (local_size)) + group_size *= s + + from pytools import div_ceil + return ct_disregard_local*div_ceil(group_size, subgroup_size) else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" diff --git a/test/test_statistics.py b/test/test_statistics.py index 82f9f0886..c2fb4ffe2 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -30,6 +30,7 @@ from pyopencl.tools import ( # noqa import loopy as lp from loopy.types import to_loopy_type import numpy as np +from pytools import div_ceil from pymbolic.primitives import Variable @@ -248,12 +249,21 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -270,8 +280,10 @@ def test_mem_access_counter_basic(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 3*n*m*ell/32 # /subgroup_size because these are uniform - assert f64l == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32l == 
(3*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -281,8 +293,10 @@ def test_mem_access_counter_basic(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*m*ell/32 # /subgroup_size because these are uniform - assert f64s == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32s == (n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64s == (n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_reduction(): @@ -295,11 +309,20 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -308,13 +331,17 @@ def test_mem_access_counter_reduction(): stride=0, direction='load', variable='b', count_granularity='subgroup') ].eval_with_dict(params) - assert f32l == 2*n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity='subgroup') ].eval_with_dict(params) - assert f32s == n*ell/32 # 
/subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32s == (n*ell)*n_groups*subgroups_per_group # these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -338,12 +365,20 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32), @@ -355,9 +390,11 @@ def test_mem_access_counter_logic(): f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64), direction='store') ].eval_with_dict(params) - assert f32_g_l == 2*n*m/32 # /subgroup_size because these are uniform - assert f64_g_l == n*m/32 # /subgroup_size because these are uniform - assert f64_g_s == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + assert f64_g_l == (n*m)*n_groups*subgroups_per_group # these are uniform + assert f64_g_s == (n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_specialops(): @@ -374,11 +411,20 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + 
subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -395,8 +441,10 @@ def test_mem_access_counter_specialops(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == 2*n*m*ell/32 # /subgroup_size because these are uniform - assert f64 == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64 == (2*n*m)*n_groups*subgroups_per_group # these are uniform f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -406,14 +454,17 @@ def test_mem_access_counter_specialops(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert f32 == n*m*ell/32 # /subgroup_size because these are uniform - assert f64 == n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32 == (n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64 == (n*m)*n_groups*subgroups_per_group # these are uniform filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity='subgroup') - #tot = lp.eval_and_sum_polys(filtered_map, params) tot = filtered_map.eval_and_sum(params) - assert tot == (n*m*ell + n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_bitwise(): @@ -433,11 +484,19 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl, 
count_redundant_work=True, subgroup_size=32) + subgroup_size = 32 + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', count_granularity='subgroup') @@ -454,7 +513,9 @@ def test_mem_access_counter_bitwise(): stride=0, direction='load', variable='h', count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (4*n*m+2*n*m*ell)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group # these are uniform i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', @@ -464,7 +525,9 @@ def test_mem_access_counter_bitwise(): stride=0, direction='store', variable='e', count_granularity='subgroup') ].eval_with_dict(params) - assert i32 == (n*m+n*m*ell)/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group # these are uniform def test_mem_access_counter_mixed(): @@ -478,18 +541,28 @@ def test_mem_access_counter_mixed(): """ ], name="mixed", assumptions="n,m,ell >= 1") + knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) - bsize = 16 - knl = lp.split_iname(knl, "j", bsize) + + bsize0 = 65 + subgroup_size = 32 + + knl = lp.split_iname(knl, "j", bsize0) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) # noqa n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + + n_groups = div_ceil(ell, bsize0) + group_size = bsize0 + subgroups_per_group = div_ceil(group_size, subgroup_size) + + 
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', count_granularity='subgroup') @@ -512,8 +585,11 @@ def test_mem_access_counter_mixed(): variable='b', count_granularity='workitem') ].eval_with_dict(params) - assert f64uniform == 2*n*m*ell/32 # /subgroup_size for uniform - assert f32uniform == n*m*ell/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f64uniform == (2*n*m)*n_groups*subgroups_per_group # /subgroup_size for uniform + assert f32uniform == (m*n)*n_groups*subgroups_per_group # /subgroup_size for uniform + assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -525,7 +601,9 @@ def test_mem_access_counter_mixed(): variable='c', count_granularity='workitem') ].eval_with_dict(params) - assert f64uniform == n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f64uniform == m*n*n_groups*subgroups_per_group # /subgroup_size for uniform assert f32nonconsec == n*m*ell @@ -865,22 +943,34 @@ def test_summations_and_filters(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) + + subgroup_size = 32 + n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=32) + n_groups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, subgroup_size) + + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], count_granularity=['subgroup'] ).eval_and_sum(params) - assert loads_a == 2*n*m*ell/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert loads_a == 
(2*n*m*ell)*n_groups*subgroups_per_group # these are uniform global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).eval_and_sum(params) - assert global_stores == (n*m*ell + n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=['subgroup'] @@ -888,8 +978,10 @@ def test_summations_and_filters(): st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)/32 # /subgroup_size for uniform - assert st_bytes == (4*n*m*ell + 8*n*m)/32 # /subgroup_size for uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group # these are uniform + assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # these are uniform # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -897,8 +989,10 @@ def test_summations_and_filters(): ].eval_with_dict(params) f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - assert f32lall == 3*n*m*ell/32 # /subgroup_size because these are uniform - assert f64lall == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform + assert f64lall == (2*n*m)*n_groups*subgroups_per_group # these are uniform op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -931,7 +1025,9 @@ def test_summations_and_filters(): return key.stride < 1 and key.dtype == to_loopy_type(np.float64) and \ key.direction == 'load' s1f64l = 
mem_map.filter_by_func(func_filter).eval_and_sum(params) - assert s1f64l == 2*n*m/32 # /subgroup_size because these are uniform + + # (count-per-sub-group*n_groups*subgroups_per_group) + assert s1f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform def test_strided_footprint(): -- GitLab From aa7fb37a3155311a31efbecf80c9cc889405b7d0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:24:07 -0600 Subject: [PATCH 27/59] removed now-unused div functions in GuardedPwQPolynomial --- loopy/statistics.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 936a840b1..19fa3d71a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -89,23 +89,6 @@ class GuardedPwQPolynomial(object): __rmul__ = __mul__ - def __floordiv__(self, other): - if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.__floordiv__ only valid for " - "type int. Attempted to divide by %s" % (type(other))) - return GuardedPwQPolynomial( - self.pwqpolynomial.scale_val(isl.Val(1).div(isl.Val(other))), - self.valid_domain) - - def ceildiv(self, other): - if not isinstance(other, int): - raise ValueError("GuardedPwQPolynomial.ceildiv only valid for " - "type int. 
Attempted to divide by %s" % (type(other))) - return GuardedPwQPolynomial( - (self.pwqpolynomial + other - 1).scale_val(isl.Val(1).div(isl.Val(other))), - self.valid_domain) - - def eval_with_dict(self, value_dict): space = self.pwqpolynomial.space pt = isl.Point.zero(space.params()) -- GitLab From f591dac06746a5c49e3174c5ab9eeb227c1f99a0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:46:56 -0600 Subject: [PATCH 28/59] fixed flake8 issues --- loopy/statistics.py | 4 +- test/test_statistics.py | 86 ++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 19fa3d71a..92762156b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1375,8 +1375,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): - raise LoopyError("Cannot count insn with subgroup granularity, " - "group size is not integer: %s" + raise LoopyError("Cannot count insn with subgroup " + "granularity, group size is not integer: %s" % (local_size)) group_size *= s diff --git a/test/test_statistics.py b/test/test_statistics.py index c2fb4ffe2..7d1b6df0d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -281,9 +281,9 @@ def test_mem_access_counter_basic(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group + assert f64l == (2*n*m)*n_groups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -294,9 +294,9 @@ def test_mem_access_counter_basic(): 
count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32s == (n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64s == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32s == (n*m*ell)*n_groups*subgroups_per_group + assert f64s == (n*m)*n_groups*subgroups_per_group def test_mem_access_counter_reduction(): @@ -332,16 +332,16 @@ def test_mem_access_counter_reduction(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32s == (n*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32s == (n*ell)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -391,10 +391,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group # these are uniform - assert f64_g_l == (n*m)*n_groups*subgroups_per_group # these are uniform - assert f64_g_s == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group + assert f64_g_l == (n*m)*n_groups*subgroups_per_group + assert f64_g_s == (n*m)*n_groups*subgroups_per_group def 
test_mem_access_counter_specialops(): @@ -442,9 +442,9 @@ def test_mem_access_counter_specialops(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64 == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group + assert f64 == (2*n*m)*n_groups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -455,16 +455,16 @@ def test_mem_access_counter_specialops(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32 == (n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64 == (n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32 == (n*m*ell)*n_groups*subgroups_per_group + assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity='subgroup') tot = filtered_map.eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group def test_mem_access_counter_bitwise(): @@ -514,8 +514,8 @@ def test_mem_access_counter_bitwise(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, 
direction='store', variable='c', @@ -526,8 +526,8 @@ def test_mem_access_counter_bitwise(): count_granularity='subgroup') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group def test_mem_access_counter_mixed(): @@ -586,9 +586,9 @@ def test_mem_access_counter_mixed(): count_granularity='workitem') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f64uniform == (2*n*m)*n_groups*subgroups_per_group # /subgroup_size for uniform - assert f32uniform == (m*n)*n_groups*subgroups_per_group # /subgroup_size for uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f64uniform == (2*n*m)*n_groups*subgroups_per_group + assert f32uniform == (m*n)*n_groups*subgroups_per_group assert f32nonconsec == 3*n*m*ell @@ -602,8 +602,8 @@ def test_mem_access_counter_mixed(): count_granularity='workitem') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f64uniform == m*n*n_groups*subgroups_per_group # /subgroup_size for uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f64uniform == m*n*n_groups*subgroups_per_group assert f32nonconsec == n*m*ell @@ -962,15 +962,15 @@ def test_summations_and_filters(): count_granularity=['subgroup'] ).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=['subgroup'] ).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert global_stores == (n*m*ell + 
n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=['subgroup'] @@ -979,9 +979,9 @@ def test_summations_and_filters(): count_granularity=['subgroup'] ).to_bytes().eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group # these are uniform - assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group + assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -990,9 +990,9 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group # these are uniform - assert f64lall == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group + assert f64lall == (2*n*m)*n_groups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -1026,8 +1026,8 @@ def test_summations_and_filters(): key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # (count-per-sub-group*n_groups*subgroups_per_group) - assert s1f64l == (2*n*m)*n_groups*subgroups_per_group # these are uniform + # uniform: (count-per-sub-group)*n_groups*subgroups_per_group + assert s1f64l == (2*n*m)*n_groups*subgroups_per_group def 
test_strided_footprint(): -- GitLab From d1df544477c2073c44db1dfc31dc9a5a14fc31e1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 16:50:01 -0600 Subject: [PATCH 29/59] fixed flake8 issue --- test/test_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 7d1b6df0d..25ae3b2da 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -991,7 +991,7 @@ def test_summations_and_filters(): ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32lall== (3*n*m*ell)*n_groups*subgroups_per_group + assert f32lall == (3*n*m*ell)*n_groups*subgroups_per_group assert f64lall == (2*n*m)*n_groups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) -- GitLab From fffeb48b647e631257a4f7211f7304b5f67e7461 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 24 Jan 2018 17:18:49 -0600 Subject: [PATCH 30/59] updated doctests for count_granularity rounding changes --- doc/tutorial.rst | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7a2fb04fc..5fd4f72cb 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1638,12 +1638,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/16 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/32 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, 
h, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/32 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1679,10 +1679,10 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) - f32 ld a: 32768 - f32 st c: 16384 - f64 ld g: 2048 - f64 st e: 2048 + f32 ld a: 1048576 + f32 st c: 524288 + f64 ld g: 65536 + f64 st e: 65536 :class:`loopy.ToCountMap` also makes it easy to determine the total amount of data moved in bytes. 
Suppose we want to know the total amount of global @@ -1693,26 +1693,26 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 1/4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 1/8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 1/4 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... 
).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (1/2 * m + 3/8 * m * l) * n : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (1/4 * m + 1/8 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) >>> stored = global_ld_st_bytes[lp.MemAccess(direction='store') ... ].eval_with_dict(param_dict) >>> print("bytes loaded: %s\nbytes stored: %s" % (loaded, stored)) - bytes loaded: 229376 - bytes stored: 81920 + bytes loaded: 7340032 + bytes stored: 2621440 One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. 
-- GitLab From 0df9212cd9b9d4d55240af65263fecfe30c95bf2 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 25 Jan 2018 15:12:22 -0600 Subject: [PATCH 31/59] renamed kernel --- test/test_statistics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 25ae3b2da..a507c1bd0 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -540,7 +540,7 @@ def test_mem_access_counter_mixed(): e[i, k] = g[i,k]*(2+h[i,k]) """ ], - name="mixed", assumptions="n,m,ell >= 1") + name="mixed_knl", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, @@ -589,7 +589,6 @@ def test_mem_access_counter_mixed(): # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert f64uniform == (2*n*m)*n_groups*subgroups_per_group assert f32uniform == (m*n)*n_groups*subgroups_per_group - assert f32nonconsec == 3*n*m*ell f64uniform = mem_map[lp.MemAccess('global', np.float64, -- GitLab From ca62d04904552dbeedf5db8f4a115925002565c3 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 25 Jan 2018 15:13:26 -0600 Subject: [PATCH 32/59] putting unnecessary default arg back into call to count_insn_runs --- loopy/statistics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 92762156b..d08046876 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1343,9 +1343,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - #cache_holder = CacheHolder() - #from pytools import memoize_in - #@memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? + # cache_holder = CacheHolder() + # from pytools import memoize_in + # @memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] @@ -1358,7 +1358,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity == 'workitem': return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work) + knl, insn, count_redundant_work=count_redundant_work, + disregard_local_axes=False) ct_disregard_local = count_insn_runs( knl, insn, disregard_local_axes=True, -- GitLab From 9b760d4ad8a848217a3c0e02f71ba947470dce5f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jan 2018 08:29:20 -0600 Subject: [PATCH 33/59] made test_mem_access_counter_mixed handle non-barvinok counting --- test/test_statistics.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index a507c1bd0..0c51bb7e5 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -531,7 +531,6 @@ def test_mem_access_counter_bitwise(): def test_mem_access_counter_mixed(): - knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i Date: Tue, 30 Jan 2018 16:28:02 -0600 Subject: [PATCH 34/59] added warning, get_insn_count uses upper bound for group size --- loopy/statistics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d08046876..bd61cda3f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1370,7 +1370,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == 'subgroup': # get the group size from loopy.symbolic import aff_to_expr - global_size, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds() group_size = 1 if local_size: for size in local_size: @@ -1381,6 +1381,13 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, % (local_size)) group_size *= s + warn_with_kernel(knl, 
"insn_count_subgroups_upper_bound", + "get_insn_count: when counting instruction %s with " + "count_granularity=subgroup, using upper bound for group size " + "(%d workitems) to compute subgroups per group. If kernel has " + "multiple device programs, actual subgroup count may be lower." + % (insn_id, group_size)) + from pytools import div_ceil return ct_disregard_local*div_ceil(group_size, subgroup_size) else: -- GitLab From b26368374d754861f2ff22a204fa4a8adc70bbfe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 30 Jan 2018 16:42:35 -0600 Subject: [PATCH 35/59] re-worded warning --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index bd61cda3f..6eb17aca1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1384,8 +1384,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=subgroup, using upper bound for group size " - "(%d workitems) to compute subgroups per group. If kernel has " - "multiple device programs, actual subgroup count may be lower." + "(%d workitems) to compute subgroups per group. When multiple " + "device programs present, actual subgroup count may be lower." 
% (insn_id, group_size)) from pytools import div_ceil -- GitLab From ef79671a3f3ad2116df2704dfba78183d9b5770f Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 15:50:24 -0600 Subject: [PATCH 36/59] made docstring comment more precise about usage of counts --- loopy/statistics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6eb17aca1..2019a5791 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -225,7 +225,7 @@ class ToCountMap(object): variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -273,7 +273,7 @@ class ToCountMap(object): filtered_map = mem_map.filter_by_func(filter_func) tot = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -328,7 +328,7 @@ class ToCountMap(object): f64ops = ops_dtype[Op(dtype=np.float64)].eval_with_dict(params) i32ops = ops_dtype[Op(dtype=np.int32)].eval_with_dict(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -385,7 +385,7 @@ class ToCountMap(object): mtype=['global'], stride=[2], direction=['store']).eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ @@ -442,7 +442,7 @@ class ToCountMap(object): variable=['a','g']) tot_loads_a_g = filtered_map.eval_and_sum(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ return self.sum().eval_with_dict(params) @@ -1216,7 +1216,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False): f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') ].eval_with_dict(params) - # (now use these counts to predict performance) + # (now 
use these counts to, e.g., predict performance) """ @@ -1328,7 +1328,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, count_granularity='workitem') ].eval_with_dict(params) - # (now use these counts to predict performance) + # (now use these counts to, e.g., predict performance) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1488,7 +1488,7 @@ def get_synchronization_map(knl): params = {'n': 512, 'm': 256, 'l': 128} barrier_ct = sync_map['barrier_local'].eval_with_dict(params) - # (now use this count to predict performance) + # (now use this count to, e.g., predict performance) """ -- GitLab From d56044f8ce19abfb2f529aa11d97f514e8e311a6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:12:54 -0600 Subject: [PATCH 37/59] can't pass kwarg to get_insn_count when using @memoize_in, so passing key.count_granularity without keyword --- loopy/statistics.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2019a5791..f6ca1eac5 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1343,11 +1343,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, class CacheHolder(object): pass - # cache_holder = CacheHolder() - # from pytools import memoize_in - # @memoize_in(cache_holder, "insn_count") # TODO why doesn't this work anymore? 
- def get_insn_count(knl, insn_id, - count_granularity='workitem'): + cache_holder = CacheHolder() + from pytools import memoize_in + @memoize_in(cache_holder, "insn_count") + def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1426,8 +1425,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, - count_granularity=key.count_granularity)) + * get_insn_count(knl, insn.id, key.count_granularity)) #currently not counting stride of local mem access for key, val in six.iteritems(access_assignee_g.count_map): @@ -1435,8 +1433,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, - count_granularity=key.count_granularity)) + * get_insn_count(knl, insn.id, key.count_granularity)) # for now, don't count writes to local mem elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass -- GitLab From 7e237cf03780e696faf5063c495e2e31f58003f0 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:34:25 -0600 Subject: [PATCH 38/59] added unused subgroup_size argument to get_op_map and get_syncronization_map for consistency and potential future use --- loopy/statistics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f6ca1eac5..7babc7b6b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1179,7 +1179,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False): +def get_op_map(knl, numpy_types=True, count_redundant_work=False, + subgroup_size=None): """Count the number of operations in a loopy kernel. 
@@ -1463,7 +1464,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # {{{ get_synchronization_map -def get_synchronization_map(knl): +def get_synchronization_map(knl, subgroup_size=None): """Count the number of synchronization events each work item encounters in a loopy kernel. -- GitLab From 6fbf29d71924533b040b5b1a3c3f48d15774f208 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 8 Feb 2018 21:36:22 -0600 Subject: [PATCH 39/59] fixed flake8 issue --- loopy/statistics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 7babc7b6b..64b849a43 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1346,6 +1346,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, cache_holder = CacheHolder() from pytools import memoize_in + @memoize_in(cache_holder, "insn_count") def get_insn_count(knl, insn_id, count_granularity='workitem'): insn = knl.id_to_insn[insn_id] -- GitLab From f790e4397c82262793bc23e4ce436af14dc52630 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 9 Feb 2018 00:59:09 -0600 Subject: [PATCH 40/59] comment documenting reason for overriding Record.__repr__ --- loopy/statistics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 64b849a43..a6b461e88 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -498,6 +498,7 @@ class Op(Record): return hash(str(self)) def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} @@ -574,6 +575,7 @@ class MemAccess(Record): return hash(str(self)) def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness return "MemAccess(%s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, -- GitLab From 479d89b822281ffc895b76f967679186a1e29123 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 20:47:39 -0600 Subject: 
[PATCH 41/59] made CountGranularity class to contain cg strings --- loopy/statistics.py | 173 ++++++++++++++++++++++++-------------- test/test_statistics.py | 179 ++++++++++++++++++++-------------------- 2 files changed, 199 insertions(+), 153 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a6b461e88..2305144ac 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -40,6 +40,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -457,6 +458,31 @@ def stringify_stats_mapping(m): return result +class CountGranularity: + """Strings specifying whether an operation should be counted once per + *work-item*, *sub-group*, or *group*. + + .. attribute :: WORKITEM + + A :class:`str` that specifies that an operation should be counted + once per *work-item*. + + .. attribute :: SUBGROUP + + A :class:`str` that specifies that an operation should be counted + once per *sub-group*. + + .. attribute :: GROUP + + A :class:`str` that specifies that an operation should be counted + once per *group*. + + """ + WORKITEM = "workitem" + SUBGROUP = "subgroup" + GROUP = "group" + + # {{{ Op descriptor class Op(Record): @@ -479,7 +505,10 @@ class Op(Record): """ - count_granularity_options = ["workitem", "subgroup", "group", None] + count_granularity_options = [CountGranularity.WORKITEM, + CountGranularity.SUBGROUP, + CountGranularity.GROUP, + None] def __init__(self, dtype=None, name=None, count_granularity=None): if count_granularity not in self.count_granularity_options: @@ -522,7 +551,7 @@ class MemAccess(Record): .. attribute:: stride An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all work items access the same item). + indicates a uniform access (i.e. all work-items access the same item). .. 
attribute:: direction @@ -541,7 +570,10 @@ class MemAccess(Record): """ - count_granularity_options = ["workitem", "subgroup", "group", None] + count_granularity_options = [CountGranularity.WORKITEM, + CountGranularity.SUBGROUP, + CountGranularity.GROUP, + None] def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -666,7 +698,7 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+str(expr.function), - count_granularity='workitem'): 1} + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): @@ -677,7 +709,8 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity='workitem'): len(expr.children)-1} + count_granularity=CountGranularity.WORKITEM): + len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): @@ -685,18 +718,18 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='workitem'): 1}) + count_granularity=CountGranularity.WORKITEM): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity='workitem'): -1}) + count_granularity=CountGranularity.WORKITEM): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -706,14 +739,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def 
map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -722,13 +755,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='workitem'): 1}) \ + count_granularity=CountGranularity.WORKITEM): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity='workitem'): + count_granularity=CountGranularity.WORKITEM): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -752,7 +785,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity='workitem'): + count_granularity=CountGranularity.WORKITEM): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -794,7 +827,7 @@ class LocalMemAccessCounter(MemAccessCounter): if isinstance(array, TemporaryVariable) and ( array.scope == temp_var_scope.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, - count_granularity='workitem')] = 1 + count_granularity=CountGranularity.WORKITEM)] = 1 return sub_map def map_variable(self, expr): @@ -831,7 +864,7 @@ class GlobalMemAccessCounter(MemAccessCounter): return ToCountMap({MemAccess(mtype='global', dtype=self.type_inf(expr), stride=0, variable=name, - count_granularity='workitem'): 1} + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) def map_subscript(self, expr): @@ -868,10 +901,11 @@ class GlobalMemAccessCounter(MemAccessCounter): if not local_id_found: # count as uniform access - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), stride=0, - variable=name, - count_granularity='subgroup'): 1} + return ToCountMap({MemAccess( + 
mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name, + count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.index) if min_tag_axis != 0: @@ -879,10 +913,11 @@ class GlobalMemAccessCounter(MemAccessCounter): "GlobalSubscriptCounter: Memory access minimum " "tag axis %d != 0, stride unknown, using " "sys.maxsize." % (min_tag_axis)) - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), - stride=sys.maxsize, variable=name, - count_granularity='workitem'): 1} + return ToCountMap({MemAccess( + mtype='global', + dtype=self.type_inf(expr), + stride=sys.maxsize, variable=name, + count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.index) # get local_id associated with minimum tag axis @@ -926,7 +961,8 @@ class GlobalMemAccessCounter(MemAccessCounter): total_stride += stride*coeff_min_lid - count_granularity = 'workitem' if total_stride is not 0 else 'subgroup' + count_granularity = CountGranularity.WORKITEM if total_stride is not 0 \ + else CountGranularity.SUBGROUP return ToCountMap({MemAccess( mtype='global', @@ -1214,9 +1250,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, op_map = get_op_map(knl) params = {'n': 512, 'm': 256, 'l': 128} - f32add = op_map[Op(np.float32, 'add', count_granularity='workitem') + f32add = op_map[Op(np.float32, + 'add', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32mul = op_map[Op(np.float32, 'mul', count_granularity='workitem') + f32mul = op_map[Op(np.float32, + 'mul', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) # (now use these counts to, e.g., predict performance) @@ -1302,33 +1342,37 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, params = {'n': 512, 'm': 256, 'l': 128} mem_map = get_mem_access_map(knl) - f32_s1_g_ld_a = mem_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='load', - variable='a', - count_granularity='workitem') + 
f32_s1_g_ld_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + stride=1, + direction='load', + variable='a', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_g_st_a = mem_map[MemAccess(mtype='global', - dtype=np.float32, - stride=1, - direction='store', - variable='a', - count_granularity='workitem') + f32_s1_g_st_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + stride=1, + direction='store', + variable='a', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_l_ld_x = mem_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='load', - variable='x', - count_granularity='workitem') + f32_s1_l_ld_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + stride=1, + direction='load', + variable='x', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) - f32_s1_l_st_x = mem_map[MemAccess(mtype='local', - dtype=np.float32, - stride=1, - direction='store', - variable='x', - count_granularity='workitem') + f32_s1_l_st_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + stride=1, + direction='store', + variable='x', + count_granularity=CountGranularity.WORKITEM) ].eval_with_dict(params) # (now use these counts to, e.g., predict performance) @@ -1350,16 +1394,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, from pytools import memoize_in @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_id, count_granularity='workitem'): + def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: warn_with_kernel(knl, "get_insn_count_assumes_granularity", "get_insn_count: No count granularity passed for " - "MemAccess, assuming workitem granularity.") - count_granularity == 'workitem' + "MemAccess, assuming %s granularity." 
+ % (CountGranularity.WORKITEM)) + count_granularity == CountGranularity.WORKITEM - if count_granularity == 'workitem': + if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( knl, insn, count_redundant_work=count_redundant_work, disregard_local_axes=False) @@ -1368,9 +1413,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) - if count_granularity == 'group': + if count_granularity == CountGranularity.GROUP: return ct_disregard_local - elif count_granularity == 'subgroup': + elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() @@ -1379,17 +1424,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): - raise LoopyError("Cannot count insn with subgroup " - "granularity, group size is not integer: %s" - % (local_size)) + raise LoopyError("Cannot count insn with %s granularity, " + "group size is not integer: %s" + % (CountGranularity.SUBGROUP, local_size)) group_size *= s warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " - "count_granularity=subgroup, using upper bound for group size " + "count_granularity=%s, using upper bound for group size " "(%d workitems) to compute subgroups per group. When multiple " "device programs present, actual subgroup count may be lower." 
- % (insn_id, group_size)) + % (insn_id, CountGranularity.SUBGROUP, group_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(group_size, subgroup_size) @@ -1469,14 +1514,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl, subgroup_size=None): - """Count the number of synchronization events each work item encounters in a + """Count the number of synchronization events each work-item encounters in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per - work item. + work-item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1685,7 +1730,7 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each work item encounters in a + """Count the number of synchronization events each work-item encounters in a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. 
diff --git a/test/test_statistics.py b/test/test_statistics.py index 0c51bb7e5..25c6dffee 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -31,6 +31,7 @@ import loopy as lp from loopy.types import to_loopy_type import numpy as np from pytools import div_ceil +from loopy.statistics import CountGranularity as cg from pymbolic.primitives import Variable @@ -55,12 +56,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', 'workitem') + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -82,8 +83,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', 'workitem') + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', cg.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -112,11 +113,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f64add = 
op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', 'workitem') + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', cg.WORKITEM)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -144,17 +145,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', 'workitem')].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', 'workitem') + f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', cg.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', cg.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', 'workitem') + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', 'workitem') + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', cg.WORKITEM) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', 'workitem') + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', cg.WORKITEM) ].eval_with_dict(params) assert f32div 
== 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -184,15 +185,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', 'workitem')].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', 'workitem') + i32add = op_map[lp.Op(np.int32, 'add', cg.WORKITEM)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', cg.WORKITEM)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', cg.WORKITEM) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', 'workitem') + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', cg.WORKITEM) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', 'workitem') + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', cg.WORKITEM) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', 'workitem') + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', cg.WORKITEM) ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -225,7 +226,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', 'workitem')] + )[lp.Op(np.float64, 'mul', cg.WORKITEM)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -266,19 +267,19 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) 
].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -287,11 +288,11 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -325,11 +326,11 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -337,7 +338,7 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -427,19 +428,19 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) 
].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -448,11 +449,11 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -460,7 +461,7 @@ def test_mem_access_counter_specialops(): assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) tot = filtered_map.eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -499,19 +500,19 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', 
variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -519,11 +520,11 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -564,25 +565,25 @@ def test_mem_access_counter_mixed(): subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity='subgroup') + count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -608,12 +609,12 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity='subgroup') + 
count_granularity=cg.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -652,22 +653,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -675,12 +676,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -691,13 +692,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - 
count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -705,7 +706,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -713,7 +714,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -742,30 +743,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32consec = 
mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -774,9 +775,9 @@ def test_mem_access_counter_consec(): def test_count_granularity_val_checks(): try: - lp.MemAccess(count_granularity='workitem') - lp.MemAccess(count_granularity='subgroup') - lp.MemAccess(count_granularity='group') + lp.MemAccess(count_granularity=cg.WORKITEM) + lp.MemAccess(count_granularity=cg.SUBGROUP) + lp.MemAccess(count_granularity=cg.GROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -785,9 +786,9 @@ def test_count_granularity_val_checks(): assert True try: - lp.Op(count_granularity='workitem') - lp.Op(count_granularity='subgroup') - lp.Op(count_granularity='group') + lp.Op(count_granularity=cg.WORKITEM) + lp.Op(count_granularity=cg.SUBGROUP) + lp.Op(count_granularity=cg.GROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -874,16 +875,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', 'workitem') + lp.Op(np.float32, 'mul', cg.WORKITEM) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', 'workitem') + lp.Op(np.float32, 'add', cg.WORKITEM) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', 'workitem') + lp.Op(np.int32, 'add', cg.WORKITEM) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', 'workitem') + lp.Op(np.dtype(np.int32), 'mul', cg.WORKITEM) ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -892,11 +893,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) f32s1la = 
op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -904,7 +905,7 @@ def test_all_counters_parallel_matmul(): f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert f32coal == n*ell @@ -913,7 +914,7 @@ def test_all_counters_parallel_matmul(): count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity='workitem') + count_granularity=cg.WORKITEM) ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -980,24 +981,24 @@ def test_summations_and_filters(): subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=['subgroup'] + count_granularity=[cg.SUBGROUP] ).to_bytes().eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group -- GitLab From c5a06ed8a6bb2f68096e3a4cbe4651950f02d1be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:23:17 -0600 Subject: 
[PATCH 42/59] attempt to get subgroup size from device, don't guess unless explicitly told --- loopy/statistics.py | 45 +++++++++++++++++++++++++++++++---------- test/test_statistics.py | 18 ++++++++++------- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2305144ac..2b5e3876e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,7 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -566,7 +566,7 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *workitem*, *subgroup*, or *group*. + once per *work-item*, *sub-group*, or *group*. """ @@ -1323,7 +1323,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, :arg subgroup_size: A :class:`int` that specifies the sub-group size. This is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. The default - subgroup_size is 32. + sub-group_size is 32. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1380,12 +1380,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_kernel, infer_unknown_types - if subgroup_size is None: - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", - "get_mem_access_map: No subgroup size passed, " - "assuming subgroup size is %d." 
- % (subgroup_size)) + if not isinstance(subgroup_size, int): + # try to find subgroup_size + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: + from pyopencl.characterize import get_simd_group_size + subgroup_size_guess = get_simd_group_size(knl.target.device, None) + warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", + "subgroup_size passed: %s. Device: %s. Using " + "sub-group size given by get_simd_group_size(): %d" + % (subgroup_size, knl.target.device, + subgroup_size_guess)) + subgroup_size = subgroup_size_guess + elif subgroup_size == 'guess': + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", + "get_mem_access_map: 'guess' sub-group size passed, " + "no target device found, wildly guessing that " + "sub-group size is %d." + % (subgroup_size)) + + if subgroup_size is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed and no target device found. " + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") class CacheHolder(object): pass @@ -1432,8 +1455,8 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for group size " - "(%d workitems) to compute subgroups per group. When multiple " - "device programs present, actual subgroup count may be lower." + "(%d work-items) to compute sub-groups per group. When multiple " + "device programs present, actual sub-group count may be lower." 
% (insn_id, CountGranularity.SUBGROUP, group_size)) from pytools import div_ceil diff --git a/test/test_statistics.py b/test/test_statistics.py index 25c6dffee..9bfea34ab 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -645,7 +645,8 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) # noqa n = 512 m = 256 ell = 128 @@ -735,7 +736,8 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size='guess') n = 512 m = 256 ell = 128 @@ -889,13 +891,14 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*ell*2 - op_map = lp.get_mem_access_map(knl, count_redundant_work=True) + mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, + subgroup_size=32) - f32s1lb = op_map[lp.MemAccess('global', np.float32, + f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', count_granularity=cg.WORKITEM) ].eval_with_dict(params) - f32s1la = op_map[lp.MemAccess('global', np.float32, + f32s1la = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -903,7 +906,7 @@ def test_all_counters_parallel_matmul(): assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize - f32coal = op_map[lp.MemAccess('global', np.float32, + f32coal = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', count_granularity=cg.WORKITEM) ].eval_with_dict(params) @@ -911,7 +914,8 @@ def 
test_all_counters_parallel_matmul(): assert f32coal == n*ell local_mem_map = lp.get_mem_access_map(knl, - count_redundant_work=True).filter_by(mtype=['local']) + count_redundant_work=True, + subgroup_size=32).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', count_granularity=cg.WORKITEM) -- GitLab From 8af713d354ab3b63c749c72c66b204856f02eb6c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:52:08 -0600 Subject: [PATCH 43/59] fixing flake8 issues --- test/test_statistics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 9bfea34ab..0687bff5a 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -895,21 +895,21 @@ def test_all_counters_parallel_matmul(): subgroup_size=32) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='load', variable='b', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='load', variable='a', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize assert f32s1la == n*m*ell/bsize f32coal = mem_access_map[lp.MemAccess('global', np.float32, - stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) - ].eval_with_dict(params) + stride=1, direction='store', variable='c', + count_granularity=cg.WORKITEM) + ].eval_with_dict(params) assert f32coal == n*ell -- GitLab From 6ffb9e3a654c8aab0b456ade2adecd001c0d82ef Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:53:00 -0600 Subject: [PATCH 44/59] passing subgroup size to get_mem_access_map in tutorial --- 
doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 5fd4f72cb..dd0bf6367 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1636,7 +1636,7 @@ we'll continue using the kernel from the previous example: .. doctest:: - >>> mem_map = lp.get_mem_access_map(knl) + >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 0, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } MemAccess(global, np:dtype('float32'), 0, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } @@ -1729,7 +1729,7 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. >>> knl_consec = lp.split_iname(knl, "k", 128, ... outer_tag="l.1", inner_tag="l.0") - >>> mem_map = lp.get_mem_access_map(knl_consec) + >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 1, load, a, workitem) : [m, l, n] -> { ... } MemAccess(global, np:dtype('float32'), 1, load, b, workitem) : [m, l, n] -> { ... } @@ -1770,7 +1770,7 @@ switch the inner and outer tags in our parallelization of the kernel: >>> knl_nonconsec = lp.split_iname(knl, "k", 128, ... outer_tag="l.0", inner_tag="l.1") - >>> mem_map = lp.get_mem_access_map(knl_nonconsec) + >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) MemAccess(global, np:dtype('float32'), 128, load, a, workitem) : [m, l, n] -> { ... } MemAccess(global, np:dtype('float32'), 128, load, b, workitem) : [m, l, n] -> { ... 
} -- GitLab From 66693459ed20a65f6180a531e2690c37ae33e4ca Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 19 Feb 2018 23:54:10 -0600 Subject: [PATCH 45/59] added CountGranularity to loopy.__init__ --- loopy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 5e8a3fb06..b6a72c021 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,8 +119,8 @@ from loopy.transform.add_barrier import add_barrier from loopy.type_inference import infer_unknown_types from loopy.preprocess import preprocess_kernel, realize_reduction from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, stringify_stats_mapping, Op, - MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, +from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, + Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, get_synchronization_poly, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) -- GitLab From dd57c36050600bca28bcf630d6d33bd91c4d6cfe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:04:20 -0600 Subject: [PATCH 46/59] fixed flake8 issue --- test/test_statistics.py | 180 ++++++++++++++++++++-------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 0687bff5a..7a5d13949 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -31,7 +31,7 @@ import loopy as lp from loopy.types import to_loopy_type import numpy as np from pytools import div_ceil -from loopy.statistics import CountGranularity as cg +from loopy.statistics import CountGranularity as CG from pymbolic.primitives import Variable @@ -56,12 +56,12 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = 
op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', cg.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == f32div == n*m*ell assert f64mul == n*m @@ -83,8 +83,8 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', cg.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM) ].eval_with_dict(params) assert f32add == f32mul == n*m*ell @@ -113,11 +113,11 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', cg.WORKITEM)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', cg.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 
'add', CG.WORKITEM) ].eval_with_dict(params) assert f32mul == n*m assert f64div == 2*n*m # TODO why? @@ -145,17 +145,17 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', cg.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', cg.WORKITEM)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', cg.WORKITEM)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', cg.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', cg.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', cg.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', cg.WORKITEM) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', cg.WORKITEM) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM) ].eval_with_dict(params) assert f32div == 2*n*m*ell assert f32mul == f32add == n*m*ell @@ -185,15 +185,15 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', cg.WORKITEM)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', cg.WORKITEM)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', cg.WORKITEM) + i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 
'bw', CG.WORKITEM)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', cg.WORKITEM) + i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', cg.WORKITEM) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', cg.WORKITEM) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM) ].eval_with_dict(params) assert i32add == n*m+n*m*ell assert i32bw == 2*n*m*ell @@ -226,7 +226,7 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, count_redundant_work=True - )[lp.Op(np.float64, 'mul', cg.WORKITEM)] + )[lp.Op(np.float64, 'mul', CG.WORKITEM)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -267,19 +267,19 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -288,11 +288,11 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + 
count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -326,11 +326,11 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -338,7 +338,7 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -428,19 +428,19 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ 
-449,11 +449,11 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -461,7 +461,7 @@ def test_mem_access_counter_specialops(): assert f64 == (n*m)*n_groups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -500,19 +500,19 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='b', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -520,11 +520,11 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, 
direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -565,25 +565,25 @@ def test_mem_access_counter_mixed(): subgroup_size=subgroup_size) f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='g', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, stride=0, direction='load', variable='h', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='load', variable='x', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -609,12 +609,12 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, stride=0, direction='store', variable='e', - count_granularity=cg.SUBGROUP) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m'), direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group @@ -654,22 +654,22 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), 
direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -677,12 +677,12 @@ def test_mem_access_counter_nonconsec(): f64nonconsec = mem_map[lp.MemAccess('global', np.float64, stride=Variable('m'), direction='store', variable='e', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, stride=Variable('m')*Variable('ell'), direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -693,13 +693,13 @@ def test_mem_access_counter_nonconsec(): 'global', np.float64, stride=Variable('m'), direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', np.float64, stride=Variable('m'), direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -707,7 +707,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', 
variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -715,7 +715,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('ell'), direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -745,30 +745,30 @@ def test_mem_access_counter_consec(): f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='g', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess('global', np.float64, stride=1, direction='load', variable='h', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell f64consec = mem_map[lp.MemAccess('global', np.float64, stride=1, direction='store', variable='e', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -777,9 +777,9 @@ def test_mem_access_counter_consec(): def test_count_granularity_val_checks(): try: - lp.MemAccess(count_granularity=cg.WORKITEM) - lp.MemAccess(count_granularity=cg.SUBGROUP) - lp.MemAccess(count_granularity=cg.GROUP) + 
lp.MemAccess(count_granularity=CG.WORKITEM) + lp.MemAccess(count_granularity=CG.SUBGROUP) + lp.MemAccess(count_granularity=CG.GROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -788,9 +788,9 @@ def test_count_granularity_val_checks(): assert True try: - lp.Op(count_granularity=cg.WORKITEM) - lp.Op(count_granularity=cg.SUBGROUP) - lp.Op(count_granularity=cg.GROUP) + lp.Op(count_granularity=CG.WORKITEM) + lp.Op(count_granularity=CG.SUBGROUP) + lp.Op(count_granularity=CG.GROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -877,16 +877,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', cg.WORKITEM) + lp.Op(np.float32, 'mul', CG.WORKITEM) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', cg.WORKITEM) + lp.Op(np.float32, 'add', CG.WORKITEM) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', cg.WORKITEM) + lp.Op(np.int32, 'add', CG.WORKITEM) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', cg.WORKITEM) + lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM) ].eval_with_dict(params) assert f32mul+f32add == n*m*ell*2 @@ -896,11 +896,11 @@ def test_all_counters_parallel_matmul(): f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='a', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -908,7 +908,7 @@ def test_all_counters_parallel_matmul(): f32coal = mem_access_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) 
assert f32coal == n*ell @@ -918,7 +918,7 @@ def test_all_counters_parallel_matmul(): subgroup_size=32).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', - count_granularity=cg.WORKITEM) + count_granularity=CG.WORKITEM) ].eval_with_dict(params) assert local_mem_l == n*m*ell*2 @@ -985,24 +985,24 @@ def test_summations_and_filters(): subgroup_size=subgroup_size) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'], - count_granularity=[cg.SUBGROUP] + count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) # uniform: (count-per-sub-group)*n_groups*subgroups_per_group -- GitLab From d1a587c3d0ee6e922008a8afdb8c2dda10460ba6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:33:36 -0600 Subject: [PATCH 47/59] added CountGranularity to __all__ --- loopy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index b6a72c021..a09fdd184 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -239,8 +239,8 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "stringify_stats_mapping", "Op", "MemAccess", - "get_op_poly", "get_op_map", 
"get_lmem_access_poly", + "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", + "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly", "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", "get_synchronization_poly", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", -- GitLab From 4af3e4bed1fd113afce903152c592b6968a39535 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 00:34:30 -0600 Subject: [PATCH 48/59] passing subgroup_size (now required) in test_gnuma_horiz_kernel --- test/test_numa_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index eff3dbd0e..d30a81e84 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -229,7 +229,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa print(lp.stringify_stats_mapping(op_map)) print("MEM") - gmem_map = lp.get_mem_access_map(hsv).to_bytes() + gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) hsv = lp.set_options(hsv, cl_build_options=[ -- GitLab From b03a9222826f0dd9b58aaa3b91fe7f4bb55d240a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:12:01 -0600 Subject: [PATCH 49/59] defined work-item, sub-group, and group --- loopy/statistics.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2b5e3876e..d9bbd4b24 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -501,7 +501,12 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. 
A work-item is a single + instance of computation executing on a single processor (think 'thread'), + a collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the group sharing local memory. A sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. """ @@ -1320,10 +1325,15 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. This - is used, e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. The default - sub-group_size is 32. + :arg subgroup_size: A :class:`int` that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within a + work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., + when counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to None an attempt to find + the sub-group size using the device will be made. A :class:`string` 'guess' + may also be passed as the subgroup_size, in which case get_mem_access_map + will attempt to find the sub-group sizeusing the device and, if + unsuccessful, will make a wild guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
-- GitLab From b50249339cfde1279c5c98afbbe376a8c9df3408 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:35:37 -0600 Subject: [PATCH 50/59] made docstring indentation more consistent --- loopy/statistics.py | 133 ++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 66 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index d9bbd4b24..a63ee41ad 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -208,13 +208,13 @@ class ToCountMap(object): def filter_by(self, **kwargs): """Remove items without specified key fields. - :arg kwargs: Keyword arguments matching fields in the keys of - the :class:`ToCountMap`, each given a list of - allowable values for that key field. + :arg kwargs: Keyword arguments matching fields in the keys of the + :class:`ToCountMap`, each given a list of allowable values for that + key field. :return: A :class:`ToCountMap` containing the subset of the items in - the original :class:`ToCountMap` that match the field values - passed. + the original :class:`ToCountMap` that match the field values + passed. Example usage:: @@ -256,11 +256,11 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :arg func: A function that takes a map key a parameter and - returns a :class:`bool`. + :arg func: A function that takes a map key a parameter and returns a + :class:`bool`. - :arg: A :class:`ToCountMap` containing the subset of the items in - the original :class:`ToCountMap` for which func(key) is true. + :arg: A :class:`ToCountMap` containing the subset of the items in the + original :class:`ToCountMap` for which func(key) is true. Example usage:: @@ -289,13 +289,13 @@ class ToCountMap(object): def group_by(self, *args): """Group map items together, distinguishing by only the key fields - passed in args. + passed in args. :arg args: Zero or more :class:`str` fields of map keys. 
- :return: A :class:`ToCountMap` containing the same total counts - grouped together by new keys that only contain the fields - specified in the arguments passed. + :return: A :class:`ToCountMap` containing the same total counts grouped + together by new keys that only contain the fields specified in the + arguments passed. Example usage:: @@ -363,8 +363,8 @@ class ToCountMap(object): """Convert counts to bytes using data type in map key. :return: A :class:`ToCountMap` mapping each original key to a - :class:`islpy.PwQPolynomial` with counts in bytes rather than - instances. + :class:`islpy.PwQPolynomial` with counts in bytes rather than + instances. Example usage:: @@ -404,8 +404,8 @@ class ToCountMap(object): def sum(self): """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of - counts. + :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the + sum of counts. """ @@ -431,7 +431,7 @@ class ToCountMap(object): parameter dict. :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. + :class:`ToCountMap` evaluated with the parameters provided. Example usage:: @@ -502,11 +502,12 @@ class Op(Record): A :class:`str` that specifies whether this operation should be counted once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think 'thread'), - a collection of which may be grouped together into a work-group. Each - work-group executes on a single compute unit with all work-items within - the group sharing local memory. A sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + instance of computation executing on a single processor (think + 'thread'), a collection of which may be grouped together into a + work-group. 
Each work-group executes on a single compute unit with all + work-items within the group sharing local memory. A sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. """ @@ -555,8 +556,8 @@ class MemAccess(Record): .. attribute:: stride - An :class:`int` that specifies stride of the memory access. A stride of 0 - indicates a uniform access (i.e. all work-items access the same item). + An :class:`int` that specifies stride of the memory access. A stride of + 0 indicates a uniform access (i.e. all work-items access the same item). .. attribute:: direction @@ -1229,15 +1230,15 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. - (Likely desirable for performance modeling, but undesirable for - code optimization.) + (Likely desirable for performance modeling, but undesirable for code + optimization.) :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1315,9 +1316,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. 
+ :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` @@ -1325,25 +1326,26 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. An OpenCL - sub-group is an implementation-dependent grouping of work-items within a - work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., - when counting a :class:`MemAccess` whose count_granularity specifies that it - should only be counted once per sub-group. If set to None an attempt to find - the sub-group size using the device will be made. A :class:`string` 'guess' - may also be passed as the subgroup_size, in which case get_mem_access_map - will attempt to find the sub-group sizeusing the device and, if - unsuccessful, will make a wild guess. + :arg subgroup_size: A :class:`int` that specifies the sub-group size. An + OpenCL sub-group is an implementation-dependent grouping of work-items + within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is + used, e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + None an attempt to find the sub-group size using the device will be + made. A :class:`string` 'guess' may also be passed as the + subgroup_size, in which case get_mem_access_map will attempt to find + the sub-group sizeusing the device and, if unsuccessful, will make a + wild guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. - - The :class:`MemAccess` specifies the characteristics of the - memory access. 
+ - The :class:`MemAccess` specifies the characteristics of the memory + access. - - The :class:`islpy.PwQPolynomial` holds the number of memory - accesses with the characteristics specified in the key (in terms - of the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1547,17 +1549,16 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, def get_synchronization_map(knl, subgroup_size=None): - """Count the number of synchronization events each work-item encounters in a - loopy kernel. + """Count the number of synchronization events each work-item encounters in + a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of events per - work-item. + :class:`islpy.PwQPolynomial` holding the number of events per work-item. - Possible keys include ``barrier_local``, ``barrier_global`` - (if supported by the target) and ``kernel_launch``. + Possible keys include ``barrier_local``, ``barrier_global`` + (if supported by the target) and ``kernel_launch``. Example usage:: @@ -1626,14 +1627,14 @@ def get_synchronization_map(knl, subgroup_size=None): # {{{ gather_access_footprints def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` - to :class:`islpy.Set` instances capturing which indices - of each the array *var_name* are read/written (where - *direction* is either ``read`` or ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for - accesses on which the footprint cannot be determined (e.g. 
- data-dependent or nonlinear indices) + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. + + :arg ignore_uncountable: If *False*, an error will be raised for accesses + on which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1685,9 +1686,9 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): read/written (where *direction* is either ``read`` or ``write`` on array *var_name* - :arg ignore_uncountable: If *True*, an error will be raised for - accesses on which the footprint cannot be determined (e.g. - data-dependent or nonlinear indices) + :arg ignore_uncountable: If *True*, an error will be raised for accesses on + which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) """ from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -1763,8 +1764,8 @@ def get_gmem_access_poly(knl): def get_synchronization_poly(knl): - """Count the number of synchronization events each work-item encounters in a - loopy kernel. + """Count the number of synchronization events each work-item encounters in + a loopy kernel. get_synchronization_poly is deprecated. Use get_synchronization_map instead. -- GitLab From 8406772c33b86457a5a72ddbde71de890d404fbe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:40:54 -0600 Subject: [PATCH 51/59] more subgroup_size and count_granularity doc --- loopy/statistics.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a63ee41ad..0607a769e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -572,7 +572,13 @@ class MemAccess(Record): .. 
attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. + once per *work-item*, *sub-group*, or *group*. A work-item is a single + instance of computation executing on a single processor (think + 'thread'), a collection of which may be grouped together into a + work-group. Each work-group executes on a single compute unit with all + work-items within the group sharing local memory. A sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. """ @@ -1240,6 +1246,13 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg subgroup_size: (currently unused) A :class:`int` that specifies the + sub-group size. An OpenCL sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA + warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` + whose count_granularity specifies that it should only be counted once + per sub-group. + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1554,6 +1567,13 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + :arg subgroup_size: (currently unused) A :class:`int` that specifies the + sub-group size. An OpenCL sub-group is an implementation-dependent + grouping of work-items within a work-group, analagous to an NVIDIA CUDA + warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` + whose count_granularity specifies that it should only be counted once + per sub-group. + :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per work-item. 
-- GitLab From 4243a56391ecd42280a318c10f5fb19fc88fa9be Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 03:55:38 -0600 Subject: [PATCH 52/59] renamed group->work-group to match opencl terminology --- loopy/statistics.py | 42 +++++++-------- test/test_statistics.py | 112 ++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 77 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0607a769e..ed21dd045 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -460,7 +460,7 @@ def stringify_stats_mapping(m): class CountGranularity: """Strings specifying whether an operation should be counted once per - *work-item*, *sub-group*, or *group*. + *work-item*, *sub-group*, or *work-group*. .. attribute :: WORKITEM @@ -472,15 +472,15 @@ class CountGranularity: A :class:`str` that specifies that an operation should be counted once per *sub-group*. - .. attribute :: GROUP + .. attribute :: WORKGROUP A :class:`str` that specifies that an operation should be counted - once per *group*. + once per *work-group*. """ WORKITEM = "workitem" SUBGROUP = "subgroup" - GROUP = "group" + WORKGROUP = "workgroup" # {{{ Op descriptor @@ -501,11 +501,11 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think + once per *work-item*, *sub-group*, or *work-group*. A work-item is a + single instance of computation executing on a single processor (think 'thread'), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all - work-items within the group sharing local memory. A sub-group is an + work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -513,7 +513,7 @@ class Op(Record): count_granularity_options = [CountGranularity.WORKITEM, CountGranularity.SUBGROUP, - CountGranularity.GROUP, + CountGranularity.WORKGROUP, None] def __init__(self, dtype=None, name=None, count_granularity=None): @@ -572,11 +572,11 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *group*. A work-item is a single - instance of computation executing on a single processor (think + once per *work-item*, *sub-group*, or *work-group*. A work-item is a + single instance of computation executing on a single processor (think 'thread'), a collection of which may be grouped together into a work-group. Each work-group executes on a single compute unit with all - work-items within the group sharing local memory. A sub-group is an + work-items within the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -584,7 +584,7 @@ class MemAccess(Record): count_granularity_options = [CountGranularity.WORKITEM, CountGranularity.SUBGROUP, - CountGranularity.GROUP, + CountGranularity.WORKGROUP, None] def __init__(self, mtype=None, dtype=None, stride=None, direction=None, @@ -1461,31 +1461,31 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, knl, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) - if count_granularity == CountGranularity.GROUP: + if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds() - group_size = 1 + workgroup_size = 1 if local_size: for size in local_size: s = aff_to_expr(size) if not isinstance(s, int): raise LoopyError("Cannot count insn with %s granularity, " - "group size is not integer: %s" + "work-group size is not integer: %s" % (CountGranularity.SUBGROUP, local_size)) - group_size *= s + workgroup_size *= s warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " - "count_granularity=%s, using upper bound for group size " - "(%d work-items) to compute sub-groups per group. When multiple " - "device programs present, actual sub-group count may be lower." - % (insn_id, CountGranularity.SUBGROUP, group_size)) + "count_granularity=%s, using upper bound for work-group size " + "(%d work-items) to compute sub-groups per work-group. When " + "multiple device programs present, actual sub-group count may be" + "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil - return ct_disregard_local*div_ceil(group_size, subgroup_size) + return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) else: # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" diff --git a/test/test_statistics.py b/test/test_statistics.py index 7a5d13949..bdc64cf83 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -261,7 +261,7 @@ def test_mem_access_counter_basic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -282,9 +282,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32l == (3*n*m*ell)*n_groups*subgroups_per_group - assert f64l == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group + assert f64l == (2*n*m)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', @@ -295,9 +295,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32s == (n*m*ell)*n_groups*subgroups_per_group - assert f64s == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group + assert f64s == (n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_reduction(): @@ -320,7 +320,7 @@ def test_mem_access_counter_reduction(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = 
div_ceil(group_size, subgroup_size) @@ -333,16 +333,16 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32l == (2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=0, direction='store', variable='c', count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32s == (n*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32s == (n*ell)*n_workgroups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -376,7 +376,7 @@ def test_mem_access_counter_logic(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -392,10 +392,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32_g_l == (2*n*m)*n_groups*subgroups_per_group - assert f64_g_l == (n*m)*n_groups*subgroups_per_group - assert f64_g_s == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group + assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group + assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_specialops(): @@ -422,7 +422,7 @@ def test_mem_access_counter_specialops(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -443,9 +443,9 @@ def 
test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32 == (2*n*m*ell)*n_groups*subgroups_per_group - assert f64 == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group + assert f64 == (2*n*m)*n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, stride=0, direction='store', variable='c', @@ -456,16 +456,16 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32 == (n*m*ell)*n_groups*subgroups_per_group - assert f64 == (n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group + assert f64 == (n*m)*n_workgroups*subgroups_per_group filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert tot == (n*m*ell + n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group def test_mem_access_counter_bitwise(): @@ -494,7 +494,7 @@ def test_mem_access_counter_bitwise(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -515,8 +515,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert i32 == (4*n*m+2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert i32 == 
(4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c', @@ -527,8 +527,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert i32 == (n*m+n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group def test_mem_access_counter_mixed(): @@ -557,7 +557,7 @@ def test_mem_access_counter_mixed(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = div_ceil(ell, group_size_0) + n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -586,9 +586,9 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f64uniform == (2*n*m)*n_groups*subgroups_per_group - assert f32uniform == (m*n)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group + assert f32uniform == (m*n)*n_workgroups*subgroups_per_group expect_fallback = False import islpy as isl @@ -601,9 +601,9 @@ def test_mem_access_counter_mixed(): if expect_fallback: if ell < group_size_0: - assert f32nonconsec == 3*n*m*ell*n_groups + assert f32nonconsec == 3*n*m*ell*n_workgroups else: - assert f32nonconsec == 3*n*m*n_groups*group_size_0 + assert f32nonconsec == 3*n*m*n_workgroups*group_size_0 else: assert f32nonconsec == 3*n*m*ell @@ -617,14 +617,14 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f64uniform == m*n*n_groups*subgroups_per_group + # uniform: 
(count-per-sub-group)*n_workgroups*subgroups_per_group + assert f64uniform == m*n*n_workgroups*subgroups_per_group if expect_fallback: if ell < group_size_0: - assert f32nonconsec == n*m*ell*n_groups + assert f32nonconsec == n*m*ell*n_workgroups else: - assert f32nonconsec == n*m*n_groups*group_size_0 + assert f32nonconsec == n*m*n_workgroups*group_size_0 else: assert f32nonconsec == n*m*ell @@ -779,7 +779,7 @@ def test_count_granularity_val_checks(): try: lp.MemAccess(count_granularity=CG.WORKITEM) lp.MemAccess(count_granularity=CG.SUBGROUP) - lp.MemAccess(count_granularity=CG.GROUP) + lp.MemAccess(count_granularity=CG.WORKGROUP) lp.MemAccess(count_granularity=None) assert True lp.MemAccess(count_granularity='bushel') @@ -790,7 +790,7 @@ def test_count_granularity_val_checks(): try: lp.Op(count_granularity=CG.WORKITEM) lp.Op(count_granularity=CG.SUBGROUP) - lp.Op(count_granularity=CG.GROUP) + lp.Op(count_granularity=CG.WORKGROUP) lp.Op(count_granularity=None) assert True lp.Op(count_granularity='bushel') @@ -977,7 +977,7 @@ def test_summations_and_filters(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} - n_groups = 1 + n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, subgroup_size) @@ -988,15 +988,15 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert loads_a == (2*n*m*ell)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert global_stores == (n*m*ell + n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert global_stores == (n*m*ell + 
n*m)*n_workgroups*subgroups_per_group ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=[CG.SUBGROUP] @@ -1005,9 +1005,9 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_groups*subgroups_per_group - assert st_bytes == (4*n*m*ell + 8*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group + assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -1016,9 +1016,9 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert f32lall == (3*n*m*ell)*n_groups*subgroups_per_group - assert f64lall == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group + assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): @@ -1052,8 +1052,8 @@ def test_summations_and_filters(): key.direction == 'load' s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_groups*subgroups_per_group - assert s1f64l == (2*n*m)*n_groups*subgroups_per_group + # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group + assert s1f64l == (2*n*m)*n_workgroups*subgroups_per_group def test_strided_footprint(): -- GitLab From f8cf6fcf8025e4412f2327c4d7ece9b055734ffe Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 22:50:40 -0600 Subject: [PATCH 53/59] 
added CountGranularity.ALL to list all granularities --- loopy/statistics.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index ed21dd045..5a5f85f65 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -478,9 +478,11 @@ class CountGranularity: once per *work-group*. """ + WORKITEM = "workitem" SUBGROUP = "subgroup" WORKGROUP = "workgroup" + ALL = [WORKITEM, SUBGROUP, WORKGROUP] # {{{ Op descriptor @@ -511,16 +513,11 @@ class Op(Record): """ - count_granularity_options = [CountGranularity.WORKITEM, - CountGranularity.SUBGROUP, - CountGranularity.WORKGROUP, - None] - def __init__(self, dtype=None, name=None, count_granularity=None): - if count_granularity not in self.count_granularity_options: - raise ValueError("Op.__init__: count_granularity '%s' is" + if count_granularity not in CountGranularity.ALL+[None]: + raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" - % (count_granularity, self.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: Record.__init__(self, dtype=dtype, name=name, count_granularity=count_granularity) @@ -582,11 +579,6 @@ class MemAccess(Record): """ - count_granularity_options = [CountGranularity.WORKITEM, - CountGranularity.SUBGROUP, - CountGranularity.WORKGROUP, - None] - def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None, count_granularity=None): @@ -600,10 +592,10 @@ class MemAccess(Record): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") - if count_granularity not in self.count_granularity_options: - raise ValueError("Op.__init__: count_granularity '%s' is" + if count_granularity not in CountGranularity.ALL+[None]: + raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. 
count_granularity options: %s" - % (count_granularity, self.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, stride=stride, @@ -1490,7 +1482,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # this should not happen since this is enforced in MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" "not allowed. count_granularity options: %s" - % (count_granularity, MemAccess.count_granularity_options)) + % (count_granularity, CountGranularity.ALL+[None])) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) -- GitLab From 98d7b5464d6b8c0d693c6b583e1c21ea860f54f6 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 20 Feb 2018 22:59:47 -0600 Subject: [PATCH 54/59] using enum instead of string for count granularity in tutorial --- doc/tutorial.rst | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 52d6e169c..4efc13de4 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1578,12 +1578,13 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: .. 
doctest:: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} - >>> f32add = op_map[lp.Op(np.float32, 'add', 'workitem')].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', 'workitem')].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', 'workitem')].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', 'workitem')].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', 'workitem')].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', 'workitem')].eval_with_dict(param_dict) + >>> from loopy.statistics import CountGranularity as CG + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1673,13 +1674,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', 'subgroup') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 0, 'load', 'g', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', 'subgroup') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 0, 'store', 'e', CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', 'subgroup') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 0, 'load', 'a', CG.SUBGROUP) ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', 'subgroup') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 0, 'store', 'c', CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1754,13 +1755,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', 'workitem') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 1, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', 'workitem') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 1, 'store', 'e', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', 'workitem') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 1, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', 'workitem') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 1, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1794,13 +1795,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', 'workitem') + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, 128, 'load', 'g', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', 'workitem') + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, 128, 'store', 'e', CG.WORKITEM) ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', 'workitem') + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, 128, 'load', 'a', CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', 'workitem') + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, 128, 'store', 'c', CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab From abf3f04b74dcfae12f2c967bbc38fdf200a27189 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 19:22:01 -0600 Subject: [PATCH 55/59] reorganized subgroup_size processing/guessing for clarity --- loopy/statistics.py | 66 ++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5a5f85f65..2f87734b0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1312,6 +1312,20 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, # }}} +def _find_subgroup_size_for_knl(knl): + from loopy.target.pyopencl import PyOpenCLTarget + if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: + from pyopencl.characterize import get_simd_group_size + subgroup_size_guess = get_simd_group_size(knl.target.device, None) + warn_with_kernel(knl, "getting_subgroup_size_from_device", + "Device: %s. 
Using sub-group size given by " + "pyopencl.characterize.get_simd_group_size(): %d" + % (knl.target.device, subgroup_size_guess)) + return subgroup_size_guess + else: + return None + + # {{{ get_mem_access_map def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, @@ -1399,33 +1413,35 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if not isinstance(subgroup_size, int): # try to find subgroup_size - from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: - from pyopencl.characterize import get_simd_group_size - subgroup_size_guess = get_simd_group_size(knl.target.device, None) - warn_with_kernel(knl, "get_mem_access_map_assumes_subgroup_size", - "subgroup_size passed: %s. Device: %s. Using " - "sub-group size given by get_simd_group_size(): %d" - % (subgroup_size, knl.target.device, - subgroup_size_guess)) - subgroup_size = subgroup_size_guess - elif subgroup_size == 'guess': - # unable to get subgroup_size from device, so guess - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size passed, " - "no target device found, wildly guessing that " - "sub-group size is %d." - % (subgroup_size)) + subgroup_size_guess = _find_subgroup_size_for_knl(knl) if subgroup_size is None: - # 'guess' was not passed and either no target device found - # or get_simd_group_size returned None - raise ValueError("No sub-group size passed and no target device found. " - "Either (1) pass integer value for subgroup_size, " - "(2) ensure that kernel.target is PyOpenClTarget " - "and kernel.target.device is set, or (3) pass " - "subgroup_size='guess' and hope for the best.") + if subgroup_size_guess is None: + # 'guess' was not passed and either no target device found + # or get_simd_group_size returned None + raise ValueError("No sub-group size passed and no target device found. 
" + "Either (1) pass integer value for subgroup_size, " + "(2) ensure that kernel.target is PyOpenClTarget " + "and kernel.target.device is set, or (3) pass " + "subgroup_size='guess' and hope for the best.") + else: + subgroup_size = subgroup_size_guess + + elif subgroup_size == 'guess': + if subgroup_size_guess is None: + # unable to get subgroup_size from device, so guess + subgroup_size = 32 + warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", + "get_mem_access_map: 'guess' sub-group size passed, " + "no target device found, wildly guessing that " + "sub-group size is %d." + % (subgroup_size)) + else: + subgroup_size = subgroup_size_guess + else: + raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " + "must be integer, 'guess', or, if you're feeling " + "lucky, None." % (subgroup_size)) class CacheHolder(object): pass -- GitLab From 509692c616a68797ab0eef9be3df604ccade5e22 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 19:36:53 -0600 Subject: [PATCH 56/59] fixing flake8 issues --- loopy/statistics.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2f87734b0..f9f068fd3 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1419,7 +1419,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if subgroup_size_guess is None: # 'guess' was not passed and either no target device found # or get_simd_group_size returned None - raise ValueError("No sub-group size passed and no target device found. " + raise ValueError("No sub-group size passed, no target device found. 
" "Either (1) pass integer value for subgroup_size, " "(2) ensure that kernel.target is PyOpenClTarget " "and kernel.target.device is set, or (3) pass " @@ -1432,10 +1432,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, # unable to get subgroup_size from device, so guess subgroup_size = 32 warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size passed, " - "no target device found, wildly guessing that " - "sub-group size is %d." - % (subgroup_size)) + "get_mem_access_map: 'guess' sub-group size " + "passed, no target device found, wildly guessing " + "that sub-group size is %d." % (subgroup_size)) else: subgroup_size = subgroup_size_guess else: -- GitLab From 309775a7bbdc6f250d54f27ec157e5e3721af9a8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 21 Feb 2018 20:07:33 -0600 Subject: [PATCH 57/59] updated subgroup_size explanation slightly --- loopy/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f9f068fd3..3d44826bb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1351,10 +1351,10 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to None an attempt to find the sub-group size using the device will be - made. A :class:`string` 'guess' may also be passed as the - subgroup_size, in which case get_mem_access_map will attempt to find - the sub-group sizeusing the device and, if unsuccessful, will make a - wild guess. + made, if this fails an error will be raised. A :class:`string` 'guess' + may also be passed as the subgroup_size, in which case + get_mem_access_map will attempt to find the sub-group size using the + device and, if unsuccessful, will make a wild guess. 
:return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. -- GitLab From d00bfdeeae0b0ffc17bc5dd61efd68370da16984 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 23 Feb 2018 20:33:47 -0600 Subject: [PATCH 58/59] improved docs --- loopy/statistics.py | 97 ++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3d44826bb..31cc94e73 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -362,7 +362,7 @@ class ToCountMap(object): def to_bytes(self): """Convert counts to bytes using data type in map key. - :return: A :class:`ToCountMap` mapping each original key to a + :return: A :class:`ToCountMap` mapping each original key to an :class:`islpy.PwQPolynomial` with counts in bytes rather than instances. @@ -404,7 +404,7 @@ class ToCountMap(object): def sum(self): """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the + :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of counts. """ @@ -462,17 +462,17 @@ class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. - .. attribute :: WORKITEM + .. attribute:: WORKITEM A :class:`str` that specifies that an operation should be counted once per *work-item*. - .. attribute :: SUBGROUP + .. attribute:: SUBGROUP A :class:`str` that specifies that an operation should be counted once per *sub-group*. - .. attribute :: WORKGROUP + .. attribute:: WORKGROUP A :class:`str` that specifies that an operation should be counted once per *work-group*. @@ -503,11 +503,13 @@ class Op(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *work-group*. 
A work-item is a - single instance of computation executing on a single processor (think - 'thread'), a collection of which may be grouped together into a - work-group. Each work-group executes on a single compute unit with all - work-items within the work-group sharing local memory. A sub-group is an + once per *work-item*, *sub-group*, or *work-group*. The granularities + allowed can be found in :class:`CountGranularity`, and may be accessed, + e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance + of computation executing on a single processor (think 'thread'), a + collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. @@ -569,11 +571,13 @@ class MemAccess(Record): .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted - once per *work-item*, *sub-group*, or *work-group*. A work-item is a - single instance of computation executing on a single processor (think - 'thread'), a collection of which may be grouped together into a - work-group. Each work-group executes on a single compute unit with all - work-items within the work-group sharing local memory. A sub-group is an + once per *work-item*, *sub-group*, or *work-group*. The granularities + allowed can be found in :class:`CountGranularity`, and may be accessed, + e.g., as ``CountGranularity.WORKITEM``. A work-item is a single instance + of computation executing on a single processor (think 'thread'), a + collection of which may be grouped together into a work-group. Each + work-group executes on a single compute unit with all work-items within + the work-group sharing local memory. A sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. 
@@ -1238,12 +1242,17 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: (currently unused) A :class:`int` that specifies the - sub-group size. An OpenCL sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA - warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` - whose count_granularity specifies that it should only be counted once - per sub-group. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within + a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + *None* an attempt to find the sub-group size using the device will be + made, if this fails an error will be raised. If a :class:`string` + ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1345,16 +1354,17 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: A :class:`int` that specifies the sub-group size. An - OpenCL sub-group is an implementation-dependent grouping of work-items - within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is - used, e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. 
If set to - None an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. A :class:`string` 'guess' - may also be passed as the subgroup_size, in which case - get_mem_access_map will attempt to find the sub-group size using the - device and, if unsuccessful, will make a wild guess. + :arg subgroup_size: An :class:`int`, :class:`string` ``'guess'``, or + *None* that specifies the sub-group size. An OpenCL sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None* an attempt + to find the sub-group size using the device will be made, if this fails + an error will be raised. If a :class:`string` ``'guess'`` is passed as + the subgroup_size, get_mem_access_map will attempt to find the + sub-group size using the device and, if unsuccessful, will make a wild + guess. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1574,15 +1584,21 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :arg subgroup_size: (currently unused) A :class:`int` that specifies the - sub-group size. An OpenCL sub-group is an implementation-dependent - grouping of work-items within a work-group, analagous to an NVIDIA CUDA - warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` - whose count_granularity specifies that it should only be counted once - per sub-group. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within + a work-group, analagous to an NVIDIA CUDA warp. 
subgroup_size is used, + e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + *None* an attempt to find the sub-group size using the device will be + made, if this fails an error will be raised. If a :class:`string` + ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. - :return: A dictionary mapping each type of synchronization event to a - :class:`islpy.PwQPolynomial` holding the number of events per work-item. + :return: A dictionary mapping each type of synchronization event to an + :class:`islpy.PwQPolynomial` holding the number of events per + work-item. Possible keys include ``barrier_local``, ``barrier_global`` (if supported by the target) and ``kernel_launch``. @@ -1794,7 +1810,8 @@ def get_synchronization_poly(knl): """Count the number of synchronization events each work-item encounters in a loopy kernel. - get_synchronization_poly is deprecated. Use get_synchronization_map instead. + get_synchronization_poly is deprecated. Use get_synchronization_map + instead. """ warn_with_kernel(knl, "deprecated_get_synchronization_poly", -- GitLab From 68a108ba3057c26db083cc10748fd613765e1271 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 24 Feb 2018 01:01:19 -0600 Subject: [PATCH 59/59] string->str in docs --- loopy/statistics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 31cc94e73..17c5bd355 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1242,14 +1242,14 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) 
- :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. If a :class:`string` + made, if this fails an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1354,14 +1354,14 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) - :arg subgroup_size: An :class:`int`, :class:`string` ``'guess'``, or + :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`string` ``'guess'`` is passed as + an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. 
@@ -1584,14 +1584,14 @@ def get_synchronization_map(knl, subgroup_size=None): :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - :arg subgroup_size: (currently unused) An :class:`int`, :class:`string` + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when counting a :class:`MemAccess` whose count_granularity specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. If a :class:`string` + made, if this fails an error will be raised. If a :class:`str` ``'guess'`` is passed as the subgroup_size, get_mem_access_map will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. -- GitLab