From 23b994c68467c870d55a5c5f7dc65780789033b0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 29 Jun 2017 17:20:55 +0200 Subject: [PATCH 01/15] Stats: take into account unused hw axes in run count, refactor code --- loopy/statistics.py | 547 +++++++++++++++++++++++--------------------- 1 file changed, 284 insertions(+), 263 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index cb15eb554..8d9c371a7 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -132,6 +132,12 @@ class ToCountMap(object): def copy(self): return ToCountMap(dict(self.count_map), self.val_type) + def with_set_attributes(self, **kwargs): + return ToCountMap(dict( + (key.copy(**kwargs), val) + for key, val in six.iteritems(self.count_map)), + self.val_type) + def filter_by(self, **kwargs): """Remove items without specified key fields. @@ -385,8 +391,10 @@ def stringify_stats_mapping(m): return result +# {{{ Op descriptor + class Op(object): - """An arithmetic operation. + """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -400,6 +408,8 @@ class Op(object): """ + # FIXME: This could be done much more briefly by inheriting from Record. + def __init__(self, dtype=None, name=None): self.name = name if dtype is None: @@ -419,19 +429,15 @@ class Op(object): return hash(str(self)) def __repr__(self): - if self.dtype is None: - dtype = 'None' - else: - dtype = str(self.dtype) - if self.name is None: - name = 'None' - else: - name = self.name - return "Op("+dtype+", "+name+")" + return "Op(%s, %s)" % (self.dtype, self.name) + +# }}} +# {{{ MemAccess descriptor + class MemAccess(object): - """A memory access. + """A descriptor for a type of memory access. .. attribute:: mtype @@ -460,6 +466,8 @@ class MemAccess(object): """ + # FIXME: This could be done much more briefly by inheriting from Record. 
+ def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype @@ -482,6 +490,16 @@ class MemAccess(object): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") + def copy(self, mtype=None, dtype=None, stride=None, direction=None, + variable=None): + return MemAccess( + mtype=mtype if mtype is not None else self.mtype, + dtype=dtype if dtype is not None else self.dtype, + stride=stride if stride is not None else self.stride, + direction=direction if direction is not None else self.direction, + variable=variable if variable is not None else self.variable, + ) + def __eq__(self, other): return isinstance(other, MemAccess) and ( (self.mtype is None or other.mtype is None or @@ -522,11 +540,70 @@ class MemAccess(object): return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ + direction + ", " + variable + ")" +# }}} -# {{{ ExpressionOpCounter -class ExpressionOpCounter(CombineMapper): +# {{{ counter base +class CounterBase(CombineMapper): + def __init__(self, knl): + self.knl = knl + from loopy.type_inference import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + def map_if(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." + % type(self).__name__) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_if_positive(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." 
+ % type(self).__name__) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_common_subexpression(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + + map_substitution = map_common_subexpression + map_derivative = map_common_subexpression + map_slice = map_common_subexpression + + # preprocessing should have removed these + def map_reduction(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + +# }}} + + +# {{{ ExpressionOpCounter + +class ExpressionOpCounter(CounterBase): def __init__(self, knl): self.knl = knl from loopy.type_inference import TypeInferenceMapper @@ -641,106 +718,59 @@ class ExpressionOpCounter(CombineMapper): # }}} -# {{{ LocalSubscriptCounter - -class LocalSubscriptCounter(CombineMapper): +class MemAccessCounter(CounterBase): + pass - def __init__(self, knl): - self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) - - def combine(self, values): - return sum(values) - def map_constant(self, expr): - return ToCountMap() +# {{{ LocalMemAccessCounter - map_tagged_variable = map_constant - map_variable = map_constant - - def map_call(self, expr): - return self.rec(expr.parameters) - - def map_subscript(self, expr): +class LocalMemAccessCounter(MemAccessCounter): + def count_var_access(self, dtype, name, subscript): sub_map = ToCountMap() - name = expr.aggregate.name # name of array if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if array.is_local: - sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 - return sub_map + self.rec(expr.index) + sub_map[MemAccess(mtype='local', dtype=dtype)] = 1 + return sub_map - def map_sum(self, expr): - if expr.children: - return sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() - - map_product = map_sum - - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_common_subexpression(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") - - def map_substitution(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") + def map_variable(self, expr): + return self.count_var_access( + self.type_inf(expr), expr.name, None) - def map_derivative(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "derivative, " - "map_derivative not implemented.") + map_tagged_variable = map_variable - def map_slice(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered slice, " - "map_slice not implemented.") + def map_subscript(self, expr): + return ( + self.count_var_access( + self.type_inf(expr), expr.aggregate.name, expr.index) + + self.rec(expr.index)) # 
}}} -# {{{ GlobalSubscriptCounter - -class GlobalSubscriptCounter(CombineMapper): +# {{{ GlobalMemAccessCounter - def __init__(self, knl): - self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) - - def combine(self, values): - return sum(values) +class GlobalMemAccessCounter(MemAccessCounter): + def map_variable(self, expr): + name = expr.name - def map_constant(self, expr): - return ToCountMap() + if name in self.knl.arg_dict: + array = self.knl.arg_dict[name] + else: + # this is a temporary variable + return ToCountMap() - map_tagged_variable = map_constant - map_variable = map_constant + if not isinstance(array, lp.GlobalArg): + # this array is not in global memory + return ToCountMap() - def map_call(self, expr): - return self.rec(expr.parameters) + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name): 1} + ) + self.rec(expr.index) def map_subscript(self, expr): - name = expr.aggregate.name # name of array + name = expr.aggregate.name if name in self.knl.arg_dict: array = self.knl.arg_dict[name] @@ -827,47 +857,6 @@ class GlobalSubscriptCounter(CombineMapper): stride=total_stride, variable=name): 1} ) + self.rec(expr.index) - def map_sum(self, expr): - if expr.children: - return sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() - - map_product = map_sum - - def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_gsubs", - "GlobalSubscriptCounter counting GMEM accesses as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", - "GlobalSubscriptCounter counting GMEM accesses as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_common_subexpression(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") - - def map_substitution(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") - - def map_derivative(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "derivative, " - "map_derivative not implemented.") - - def map_slice(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered slice, " - "map_slice not implemented.") - # }}} @@ -1031,25 +1020,64 @@ def count(kernel, set): return count -# }}} +def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): + # FIXME: Multi-kernel support + gsize, lsize = knl.get_grid_size_upper_bounds() -# {{{ get_op_poly + g_used = set() + l_used = set() -def get_op_poly(knl, numpy_types=True): + from loopy.kernel.data import LocalIndexTag, GroupIndexTag + for iname in knl.insn_inames(insn): + tag = knl.iname_to_tag.get(iname) - """Count the number of operations in a loopy kernel. + if isinstance(tag, LocalIndexTag): + l_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + g_used.add(tag.axis) - get_op_poly is deprecated. Use get_op_map instead. + def mult_grid_factor(used_axes, size): + result = 1 + for used, size in zip(used_axes, size): + if not used: + result = result * size - """ - warn_with_kernel(knl, "depricated_get_op_poly", - "get_op_poly is deprecated. 
Use get_op_map instead.") - return get_op_map(knl, numpy_types) + if not isinstance(result, int): + if space is not None: + result = result.align_params(space) + + result = isl.PwQPolynomial.from_pw_aff(result) + + return result + + if disregard_local_axes: + return mult_grid_factor(g_used, gsize) + else: + return mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) + + +def count_insn_runs(knl, insn, disregard_local_axes=False): + insn_inames = knl.insn_inames(insn) + + if disregard_local_axes: + from loopy.kernel.data import LocalIndexTag + insn_inames = [iname for iname in insn_inames if not + isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + + inames_domain = knl.get_inames_domain(insn_inames) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) + c = count(knl, domain) + return (c * get_unused_hw_axes_factor(knl, insn, + disregard_local_axes=disregard_local_axes, + space=c.space)) # }}} +# {{{ get_op_map + def get_op_map(knl, numpy_types=True): """Count the number of operations in a loopy kernel. @@ -1090,14 +1118,8 @@ def get_op_map(knl, numpy_types=True): op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: - # how many times is this instruction executed? - # check domain size: - insn_inames = knl.insn_inames(insn) - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count(knl, domain) + op_map = op_map + ops*count_insn_runs(knl, insn) if numpy_types: op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), @@ -1106,54 +1128,11 @@ def get_op_map(knl, numpy_types=True): return op_map - -#TODO test deprecated functions? -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "depricated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -# {{{ get_gmem_access_poly - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - # }}} +# {{{ get_mem_access_map + def get_mem_access_map(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. 
@@ -1217,96 +1196,66 @@ def get_mem_access_map(knl, numpy_types=True): cache_holder = CacheHolder() @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_inames, uniform=False): - if uniform: - from loopy.kernel.data import LocalIndexTag - insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) - return count(knl, domain) + def get_insn_count(knl, insn_id, uniform=False): + insn = knl.id_to_insn[insn_id] + return count_insn_runs(knl, insn, disregard_local_axes=uniform) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_map = ToCountMap() - subs_counter_g = GlobalSubscriptCounter(knl) - subs_counter_l = LocalSubscriptCounter(knl) + access_map = ToCountMap() + access_counter_g = GlobalMemAccessCounter(knl) + access_counter_l = LocalMemAccessCounter(knl) for insn in knl.instructions: - # count subscripts - subs_expr = subs_counter_g(insn.expression) \ - + subs_counter_l(insn.expression) - - # distinguish loads and stores - for key in subs_expr.count_map: - subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, direction='load', - variable=key.variable) - ] = subs_expr.pop(key) - - subs_assignee_g = subs_counter_g(insn.assignee) - for key in subs_assignee_g.count_map: - subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, - direction='store', - variable=key.variable) - ] = subs_assignee_g.pop(key) - # for now, don't count writes to local mem - - insn_inames = knl.insn_inames(insn) + access_expr = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + + access_assignee_g = access_counter_g(insn.assignee).with_set_attributes( + direction="store") + + # FIXME: (!!!!) 
for now, don't count writes to local mem # use count excluding local index tags for uniform accesses - for key in subs_expr.count_map: - map = ToCountMap({key: subs_expr[key]}) - if (key.mtype == 'global' and + for key, val in six.iteritems(access_expr.count_map): + is_uniform = (key.mtype == 'global' and isinstance(key.stride, int) and - key.stride == 0): - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) - #currently not counting stride of local mem access - - for key in subs_assignee_g.count_map: - map = ToCountMap({key: subs_assignee_g[key]}) - if isinstance(key.stride, int) and key.stride == 0: - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) + key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) + #currently not counting stride of local mem access + + for key, val in six.iteritems(access_assignee_g.count_map): + is_uniform = (key.mtype == 'global' and + isinstance(key.stride, int) and + key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) # for now, don't count writes to local mem if numpy_types: - subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype, + # FIXME: Don't modify in-place + access_map.count_map = dict((MemAccess(mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable), count) - for mem_access, count in six.iteritems(subs_map.count_map)) - - return subs_map - - -# {{{ get_synchronization_poly - -def get_synchronization_poly(knl): - """Count the number of synchronization events each thread encounters in a - loopy kernel. - - get_synchronization_poly is deprecated. Use get_synchronization_map instead. + for mem_access, count in six.iteritems(access_map.count_map)) - """ - warn_with_kernel(knl, "depricated_get_synchronization_poly", - "get_synchronization_poly is deprecated. Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) + return access_map # }}} +# {{{ get_synchronization_map + def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a @@ -1382,6 +1331,8 @@ def get_synchronization_map(knl): #return result.count_map #TODO is this change okay? return result +# }}} + # {{{ gather_access_footprints @@ -1477,4 +1428,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): # }}} + +# {{{ compat goop + +def get_lmem_access_poly(knl): + """Count the number of local memory accesses in a loopy kernel. + + get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['local'] option. + + """ + warn_with_kernel(knl, "deprecated_get_lmem_access_poly", + "get_lmem_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['local'] option.") + return get_mem_access_map(knl).filter_by(mtype=['local']) + + +def get_DRAM_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", + "get_DRAM_access_poly is deprecated. 
Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_gmem_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_gmem_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_synchronization_poly(knl): + """Count the number of synchronization events each thread encounters in a + loopy kernel. + + get_synchronization_poly is deprecated. Use get_synchronization_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_synchronization_poly", + "get_synchronization_poly is deprecated. Use " + "get_synchronization_map instead.") + return get_synchronization_map(knl) + + +def get_op_poly(knl, numpy_types=True): + """Count the number of operations in a loopy kernel. + + get_op_poly is deprecated. Use get_op_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_op_poly", + "get_op_poly is deprecated. Use get_op_map instead.") + return get_op_map(knl, numpy_types) + +# }}} + # vim: foldmethod=marker -- GitLab From c67e2de5dc1a605cb06e79887a38bad048b90921 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 29 Jun 2017 19:13:38 +0200 Subject: [PATCH 02/15] Mess with stats/count_insn_runs to get spaces to line up --- loopy/statistics.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 8d9c371a7..44d55d10f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -929,7 +929,7 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count -def count(kernel, set): +def count(kernel, set, space=None): try: return set.card() except AttributeError: @@ -958,7 +958,11 @@ def count(kernel, set): if stride is None: stride = 1 - length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride) + length_pwaff = dmax - dmin + stride + if space is not None: + length_pwaff = length_pwaff.align_params(space) + + length = isl.PwQPolynomial.from_pw_aff(length_pwaff) length = length.scale_down_val(stride) if bset_count is None: @@ -1068,10 +1072,15 @@ def count_insn_runs(knl, insn, disregard_local_axes=False): inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( insn_inames, [dim_type.set])) - c = count(knl, domain) - return (c * get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=c.space)) + + space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, + set=[], params=knl.outer_params()) + + c = count(knl, domain, space=space) + unused_fac = get_unused_hw_axes_factor(knl, insn, + disregard_local_axes=disregard_local_axes, + space=space) + return c * unused_fac # }}} -- GitLab From b1686701ec5532489f1bc81baebb5e3e62734c6b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 30 Jun 2017 01:55:49 -0500 Subject: [PATCH 03/15] updated mm test assertions, but tests still fail --- test/test_statistics.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 5e363f135..82bde8f5c 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -515,7 +515,7 @@ def 
test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('l'), direction='load', variable='b') ].eval_with_dict(params) - assert f64nonconsec == 2*n*m + assert f64nonconsec == 2*n*m*l assert f32nonconsec == 3*n*m*l f64nonconsec = mem_map[lp.MemAccess('global', np.float64, @@ -563,7 +563,7 @@ def test_mem_access_counter_consec(): f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') ].eval_with_dict(params) - assert f64consec == 2*n*m + assert f64consec == 2*n*m*l assert f32consec == 3*n*m*l f64consec = mem_map[lp.MemAccess('global', np.float64, @@ -628,6 +628,7 @@ def test_barrier_counter_barriers(): def test_all_counters_parallel_matmul(): + bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i Date: Fri, 30 Jun 2017 12:23:27 +0200 Subject: [PATCH 04/15] Fix botched conditional in get_unused_hw_axes_factor --- loopy/statistics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 44d55d10f..dc938ef70 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1043,8 +1043,8 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): def mult_grid_factor(used_axes, size): result = 1 - for used, size in zip(used_axes, size): - if not used: + for iaxis, size in enumerate(size): + if iaxis not in used_axes: result = result * size if not isinstance(result, int): @@ -1337,7 +1337,6 @@ def get_synchronization_map(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - #return result.count_map #TODO is this change okay? return result # }}} -- GitLab From 1d33ea5a4fa3d0d6a8e00dc1a57df42b8b2d07ec Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 30 Jun 2017 15:52:46 -0500 Subject: [PATCH 05/15] fixed test assertions after counting w/unused hw axes update --- test/test_statistics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 82bde8f5c..87fa92a58 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -437,8 +437,8 @@ def test_mem_access_counter_mixed(): knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) - threads = 16 - knl = lp.split_iname(knl, "j", threads) + bsize = 16 + knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) mem_map = lp.get_mem_access_map(knl) # noqa @@ -463,8 +463,8 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) - assert f64uniform == 2*n*m - assert f32uniform == n*m*l/threads + assert f64uniform == 2*n*m*l/bsize + assert f32uniform == n*m*l/bsize assert f32nonconsec == 3*n*m*l f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -474,7 +474,7 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) - assert f64uniform == n*m + assert f64uniform == n*m*l/bsize assert f32nonconsec == n*m*l @@ -515,7 +515,7 @@ def test_mem_access_counter_nonconsec(): stride=Variable('m')*Variable('l'), direction='load', variable='b') ].eval_with_dict(params) - assert f64nonconsec == 2*n*m*l + assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*l f64nonconsec = mem_map[lp.MemAccess('global', np.float64, @@ -572,7 +572,7 @@ def test_mem_access_counter_consec(): f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') 
].eval_with_dict(params) - assert f64consec == n*m + assert f64consec == n*m*l assert f32consec == n*m*l @@ -677,8 +677,8 @@ def test_all_counters_parallel_matmul(): stride=1, direction='load', variable='a') ].eval_with_dict(params) - assert f32s1lb == (m/bsize)*n*m - assert f32s1la == (m/bsize)*m*l + assert f32s1lb == n*m*l/bsize + assert f32s1la == n*m*l/bsize f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') -- GitLab From 8991cf13ad2a5bb205ea8d2fe4b041858030e58a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 1 Jul 2017 13:52:22 +0200 Subject: [PATCH 06/15] Fix grid factor computation in get_unused_hw_axes_factor --- loopy/statistics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index dc938ef70..2df2cb53c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1045,13 +1045,13 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): result = 1 for iaxis, size in enumerate(size): if iaxis not in used_axes: - result = result * size + if not isinstance(size, int): + if space is not None: + size = size.align_params(space) - if not isinstance(result, int): - if space is not None: - result = result.align_params(space) + size = isl.PwQPolynomial.from_pw_aff(size) - result = isl.PwQPolynomial.from_pw_aff(result) + result = result * size return result -- GitLab From 9bf895a0565f57cae86eacae1e0046d1e4540fcf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 1 Jul 2017 14:01:25 +0200 Subject: [PATCH 07/15] Stats: Fix docstring style --- loopy/statistics.py | 59 +++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 2df2cb53c..1e61e07ed 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -141,9 +141,9 @@ class ToCountMap(object): def filter_by(self, **kwargs): """Remove items without specified key fields. - :parameter \*\*kwargs: Keyword arguments matching fields in the keys of - the :class:`ToCountMap`, each given a list of - allowable values for that key field. + :arg kwargs: Keyword arguments matching fields in the keys of + the :class:`ToCountMap`, each given a list of + allowable values for that key field. :return: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` that match the field values @@ -189,10 +189,10 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :parameter func: A function that takes a map key a parameter and - returns a :class:`bool`. + :arg func: A function that takes a map key a parameter and + returns a :class:`bool`. - :return: A :class:`ToCountMap` containing the subset of the items in + :arg: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` for which func(key) is true. Example usage:: @@ -224,7 +224,7 @@ class ToCountMap(object): """Group map items together, distinguishing by only the key fields passed in args. - :parameter \*args: Zero or more :class:`str` fields of map keys. + :arg args: Zero or more :class:`str` fields of map keys. :return: A :class:`ToCountMap` containing the same total counts grouped together by new keys that only contain the fields @@ -1091,21 +1091,21 @@ def get_op_map(knl, numpy_types=True): """Count the number of operations in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. 
+ :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. :return: A :class:`ToCountMap` of **{** :class:`Op` **:** - :class:`islpy.PwQPolynomial` **}**. + :class:`islpy.PwQPolynomial` **}**. - - The :class:`Op` specifies the characteristics of the arithmetic - operation. + - The :class:`Op` specifies the characteristics of the arithmetic + operation. - - The :class:`islpy.PwQPolynomial` holds the number of operations of - the kind specified in the key (in terms of the - :class:`loopy.LoopKernel` parameter *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of operations of + the kind specified in the key (in terms of the + :class:`loopy.LoopKernel` parameter *inames*). Example usage:: @@ -1145,22 +1145,23 @@ def get_op_map(knl, numpy_types=True): def get_mem_access_map(knl, numpy_types=True): """Count the number of memory accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. + + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. + :class:`islpy.PwQPolynomial` **}**. - - The :class:`MemAccess` specifies the characteristics of the - memory access. + - The :class:`MemAccess` specifies the characteristics of the + memory access. - - The :class:`islpy.PwQPolynomial` holds the number of memory - accesses with the characteristics specified in the key (in terms - of the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1270,7 +1271,7 @@ def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. 
:return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per -- GitLab From c67e0bda4c0333969bccbdbc9828a6e38e8f8b11 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 1 Jul 2017 14:06:30 +0200 Subject: [PATCH 08/15] Stats: Add count_redundant_work flag --- loopy/statistics.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 1e61e07ed..af9d6d47d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1061,7 +1061,7 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) -def count_insn_runs(knl, insn, disregard_local_axes=False): +def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): insn_inames = knl.insn_inames(insn) if disregard_local_axes: @@ -1077,17 +1077,21 @@ def count_insn_runs(knl, insn, disregard_local_axes=False): set=[], params=knl.outer_params()) c = count(knl, domain, space=space) - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) - return c * unused_fac + + if count_redundant_work: + unused_fac = get_unused_hw_axes_factor(knl, insn, + disregard_local_axes=disregard_local_axes, + space=space) + return c * unused_fac + else: + return c # }}} # {{{ get_op_map -def get_op_map(knl, numpy_types=True): +def get_op_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of operations in a loopy kernel. @@ -1097,6 +1101,12 @@ def get_op_map(knl, numpy_types=True): in the returned mapping should be numpy types instead of :class:`loopy.LoopyType`. + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) + :return: A :class:`ToCountMap` of **{** :class:`Op` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1128,7 +1138,9 @@ def get_op_map(knl, numpy_types=True): op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs(knl, insn) + op_map = op_map + ops*count_insn_runs( + knl, insn, + count_redundant_work=count_redundant_work) if numpy_types: op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), @@ -1142,7 +1154,7 @@ def get_op_map(knl, numpy_types=True): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True): +def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1152,6 +1164,11 @@ def get_mem_access_map(knl, numpy_types=True): in the returned mapping should be numpy types instead of :class:`loopy.LoopyType`. + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. 
@@ -1208,7 +1225,9 @@ def get_mem_access_map(knl, numpy_types=True): @memoize_in(cache_holder, "insn_count") def get_insn_count(knl, insn_id, uniform=False): insn = knl.id_to_insn[insn_id] - return count_insn_runs(knl, insn, disregard_local_axes=uniform) + return count_insn_runs( + knl, insn, disregard_local_axes=uniform, + count_redundant_work=count_redundant_work) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) -- GitLab From a9576589becc99ce985356ea7ebce5589532856a Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 2 Jul 2017 17:30:18 -0500 Subject: [PATCH 09/15] updated stats tests to include new redundant work flag --- test/test_statistics.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/test/test_statistics.py b/test/test_statistics.py index 87fa92a58..a72b62af9 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -49,7 +49,7 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -74,7 +74,7 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -100,7 +100,7 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -130,7 +130,7 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -166,7 +166,7 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -205,7 +205,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -229,7 +229,7 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -269,7 +269,7 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -307,7 +307,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -343,7 +343,7 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, 
h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -441,7 +441,7 @@ def test_mem_access_counter_mixed(): knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -545,7 +545,7 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -652,7 +652,7 @@ def test_all_counters_parallel_matmul(): assert sync_map["kernel_launch"].eval_with_dict(params) == 1 assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ lp.Op(np.float32, 'mul') ].eval_with_dict(params) @@ -668,7 +668,7 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - op_map = lp.get_mem_access_map(knl) + op_map = lp.get_mem_access_map(knl, count_redundant_work=True) f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') @@ -686,7 +686,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local']) + local_mem_map = lp.get_mem_access_map(knl, + count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) @@ -744,7 +745,7 @@ def test_summations_and_filters(): l = 128 params = {'n': n, 'm': m, 'l': l} - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) loads_a = mem_map.filter_by(direction=['load'], variable=['a'] ).eval_and_sum(params) @@ -770,7 +771,7 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) -- GitLab From a6cb6790bda494b64adced1e92a8f58e0338d896 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 4 Jul 2017 11:33:55 +0200 Subject: [PATCH 10/15] Typo fix [ci skip] --- loopy/kernel/instruction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0d22dbb88..08268ca9f 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase): z[i] = z[i+1-1] + a {atomic} - :mod:`loopy` may to evaluate the right-hand side *multiple times* + :mod:`loopy` may choose to evaluate the right-hand side *multiple times* as part 
of a single assignment. It is up to the user to ensure that this retains correct semantics. -- GitLab From 9bde2387282d1bab0d126030aac314ea0852e328 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 7 Jul 2017 05:44:02 -0400 Subject: [PATCH 11/15] Guard assumptions on PwQPolynomials returned from loopy.statistics (rather than rely on the PwQPolynomial's domain, which only captures where the corresponding poly is non-zero) --- loopy/statistics.py | 81 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index af9d6d47d..a0b3a0165 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,6 +38,7 @@ __doc__ = """ .. currentmodule:: loopy +.. autoclass:: GuardedPwQPolynomial .. autoclass:: ToCountMap .. autoclass:: Op .. autoclass:: MemAccess @@ -52,6 +53,60 @@ __doc__ = """ """ +# {{{ GuardedPwQPolynomial + +class GuardedPwQPolynomial(object): + def __init__(self, pwqpolynomial, valid_domain): + self.pwqpolynomial = pwqpolynomial + self.valid_domain = valid_domain + + def __add__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial + other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial + other, + self.valid_domain) + + __radd__ = __add__ + + def __mul__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial * other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial * other, + self.valid_domain) + + __rmul__ = __mul__ + + def eval_with_dict(self, value_dict): + space = self.pwqpolynomial.space + pt = isl.Point.zero(space.params()) + + for i in range(space.dim(dim_type.param)): + par_name = space.get_dim_name(dim_type.param, i) + pt = pt.set_coordinate_val( + dim_type.param, i, value_dict[par_name]) + + if not (isl.Set.from_point(pt) <= self.valid_domain): + raise ValueError("evaluation point outside of domain of " + "definition of piecewise quasipolynomial") + + return self.pwqpolynomial.eval(pt).to_python() + + @staticmethod + def zero(): + p = isl.PwQPolynomial('{ 0 }') + return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) + +# }}} + + # {{{ ToCountMap class ToCountMap(object): @@ -66,7 +121,7 @@ class ToCountMap(object): """ - def __init__(self, init_dict=None, val_type=isl.PwQPolynomial): + def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): if init_dict is None: init_dict = {} self.count_map = init_dict @@ -87,7 +142,7 @@ class ToCountMap(object): return self def __mul__(self, other): - if isinstance(other, isl.PwQPolynomial): + if isinstance(other, GuardedPwQPolynomial): return ToCountMap(dict( (index, self.count_map[index]*other) for index in self.keys())) @@ -103,8 +158,8 @@ class ToCountMap(object): return self.count_map[index] except KeyError: #TODO what is the best way to handle this? 
- if self.val_type is isl.PwQPolynomial: - return isl.PwQPolynomial('{ 0 }') + if self.val_type is GuardedPwQPolynomial: + return GuardedPwQPolynomial.zero() else: return 0 @@ -342,8 +397,8 @@ class ToCountMap(object): """ - if self.val_type is isl.PwQPolynomial: - total = isl.PwQPolynomial('{ 0 }') + if self.val_type is GuardedPwQPolynomial: + total = GuardedPwQPolynomial.zero() else: total = 0 @@ -929,9 +984,13 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count +def add_assumptions_guard(kernel, pwqpolynomial): + return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + + def count(kernel, set, space=None): try: - return set.card() + return add_assumptions_guard(kernel, set.card()) except AttributeError: pass @@ -1022,7 +1081,7 @@ def count(kernel, set, space=None): "number of integer points in your loop " "domain.") - return count + return add_assumptions_guard(kernel, count) def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): @@ -1056,9 +1115,11 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return result if disregard_local_axes: - return mult_grid_factor(g_used, gsize) + result = mult_grid_factor(g_used, gsize) else: - return mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) + result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) + + return add_assumptions_guard(knl, result) def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): -- GitLab From 911fbba8cdee8b956fc7f26dfaa18236e1f804af Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 7 Jul 2017 12:54:53 -0400 Subject: [PATCH 12/15] Re-fix stride handling in count() --- loopy/isl_helpers.py | 64 +++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 36fbb49f4..5f0884fd4 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -598,37 +598,63 @@ def get_simple_strides(bset, key_by="name"): assert len(comp_div_set_pieces) == 1 bset, = comp_div_set_pieces - lspace = bset.get_local_space() - for idiv in range(lspace.dim(dim_type.div)): - div = lspace.get_div(idiv) + def _get_indices_and_coeffs(obj, dts): + result = [] + for dt in dts: + for dim_idx in range(obj.dim(dt)): + coeff_val = obj.get_coefficient_val(dt, dim_idx) + if not coeff_val.is_zero(): + result.append((dt, dim_idx, coeff_val)) - # check for sub-divs - supported = True - for dim_idx in range(div.dim(dim_type.div)): - coeff_val = div.get_coefficient_val(dim_type.div, dim_idx) - if not coeff_val.is_zero(): - # sub-divs not supported - supported = False - break + return result + + for cns in bset.get_constraints(): + if not cns.is_equality(): + continue + aff = cns.get_aff() - if not supported: + # recognizes constraints of the form + # -i0 + 2*floor((i0)/2) == 0 + + if aff.dim(dim_type.div) != 1: + continue + + idiv = 0 + div = aff.get_div(idiv) + + # check for sub-divs + if _get_indices_and_coeffs(div, [dim_type.div]): + # found one -> not supported continue denom = div.get_denominator_val().to_python() - inames_and_coeffs = [] - for dt in [dim_type.param, dim_type.in_]: - for dim_idx in range(div.dim(dt)): - coeff_val = div.get_coefficient_val(dt, dim_idx) * denom - if not coeff_val.is_zero(): - inames_and_coeffs.append((dt, dim_idx, coeff_val)) + # if the coefficient in front of the div is not the same as the denominator + if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one(): + # not supported + continue + 
+ inames_and_coeffs = _get_indices_and_coeffs( + div, [dim_type.param, dim_type.in_]) if len(inames_and_coeffs) != 1: continue (dt, dim_idx, coeff), = inames_and_coeffs - if coeff != 1: + if not (coeff * denom).is_one(): + # not supported + continue + + inames_and_coeffs = _get_indices_and_coeffs( + aff, [dim_type.param, dim_type.in_]) + + if len(inames_and_coeffs) != 1: + continue + + (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs + if (not outer_coeff.neg().is_one() + or (outer_dt, outer_dim_idx) != (dt, dim_idx)): # not supported continue -- GitLab From f206d0dfeee86cd70ca9416fcefad03d1d8dadf2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 10 Jul 2017 16:59:38 -0500 Subject: [PATCH 13/15] Kernel.stringify: add show_labels, use_separators --- loopy/kernel/__init__.py | 54 ++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index dccaca2ec..622f5e49b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1111,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return embedding - def stringify(self, what=None, with_dependencies=False): + def stringify(self, what=None, with_dependencies=False, use_separators=True, + show_labels=True): all_what = set([ "name", "arguments", @@ -1150,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): kernel = self - sep = 75*"-" + if use_separators: + sep = [75*"-"] + else: + sep = [] def natorder(key): # Return natural ordering for strings, as opposed to dictionary order. @@ -1167,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling): return sorted(seq, key=lambda y: natorder(key(y))) if "name" in what: - lines.append(sep) + lines.extend(sep) lines.append("KERNEL: " + kernel.name) if "arguments" in what: - lines.append(sep) - lines.append("ARGUMENTS:") + lines.extend(sep) + if show_labels: + lines.append("ARGUMENTS:") for arg_name in natsorted(kernel.arg_dict): lines.append(str(kernel.arg_dict[arg_name])) if "domains" in what: - lines.append(sep) - lines.append("DOMAINS:") + lines.extend(sep) + if show_labels: + lines.append("DOMAINS:") for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()): lines.append(len(parents)*" " + str(dom)) if "tags" in what: - lines.append(sep) - lines.append("INAME IMPLEMENTATION TAGS:") + lines.extend(sep) + if show_labels: + lines.append("INAME IMPLEMENTATION TAGS:") for iname in natsorted(kernel.all_inames()): line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname)) lines.append(line) if "variables" in what and kernel.temporary_variables: - lines.append(sep) - lines.append("TEMPORARIES:") + lines.extend(sep) + if show_labels: + lines.append("TEMPORARIES:") for tv in natsorted(six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): lines.append(str(tv)) if "rules" in what and kernel.substitutions: - lines.append(sep) - lines.append("SUBSTIUTION RULES:") + lines.extend(sep) + if show_labels: + lines.append("SUBSTIUTION RULES:") for rule_name in natsorted(six.iterkeys(kernel.substitutions)): lines.append(str(kernel.substitutions[rule_name])) if "instructions" in what: - lines.append(sep) - lines.append("INSTRUCTIONS:") + lines.extend(sep) + if show_labels: + lines.append("INSTRUCTIONS:") loop_list_width = 35 # {{{ topological sort @@ -1319,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on))) if "Dependencies" in what and dep_lines: - 
lines.append(sep) - lines.append("DEPENDENCIES: " - "(use loopy.show_dependency_graph to visualize)") + lines.extend(sep) + if show_labels: + lines.append("DEPENDENCIES: " + "(use loopy.show_dependency_graph to visualize)") lines.extend(dep_lines) if "schedule" in what and kernel.schedule is not None: - lines.append(sep) - lines.append("SCHEDULE:") + lines.extend(sep) + if show_labels: + lines.append("SCHEDULE:") from loopy.schedule import dump_schedule lines.append(dump_schedule(kernel, kernel.schedule)) - lines.append(sep) + lines.extend(sep) return "\n".join(lines) -- GitLab From d2f772733370097fd77f0008f6795ee70f18948d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 10 Jul 2017 17:13:18 -0500 Subject: [PATCH 14/15] Add string representation of GuardedPwQPolynomial --- loopy/statistics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index a0b3a0165..9b15ec471 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -104,6 +104,12 @@ class GuardedPwQPolynomial(object): p = isl.PwQPolynomial('{ 0 }') return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) + def __str__(self): + return str(self.pwqpolynomial) + + def __repr__(self): + return repr(self.pwqpolynomial) + # }}} -- GitLab From 14cb7fabbd7af5f9339f2146b34536a96960a1d3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 10 Jul 2017 18:22:09 -0500 Subject: [PATCH 15/15] Fix doctest alphabet swizzles --- doc/tutorial.rst | 74 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 5eaa12b81..92ec799f7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution. By default, :mod:`loopy` inserts local barriers between two instructions when it detects that a dependency involving local memory may occur across work items. To -see this in action, take a look at the section on :ref:`local_temporaries`. +see this in action, take a look at the section on :ref:`local_temporaries`. In contrast, :mod:`loopy` will *not* insert global barriers automatically. Global barriers require manual intervention along with some special @@ -1308,7 +1308,7 @@ tagged, as in the following example:: assumptions="n>0") .. [#global-barrier-note] In particular, this is *not* the same as a call to - ``barrier(CLK_GLOBAL_MEM_FENCE)``. + ``barrier(CLK_GLOBAL_MEM_FENCE)``. .. }}} @@ -1533,12 +1533,12 @@ information provided. 
Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1596,9 +1596,9 @@ together into keys containing only the specified fields: >>> op_map_dtype = op_map.group_by('dtype') >>> print(lp.stringify_stats_mapping(op_map_dtype)) - Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) ... 
].eval_with_dict(param_dict) @@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... 
).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 } - MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 } + MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access consecutive array @@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... } With this parallelization, consecutive threads will access *nonconsecutive* -- GitLab
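The doctest updates in the last patch walk through loopy's statistics interface (lp.get_op_map, ToCountMap.group_by, eval_with_dict). The sketch below shows that workflow end to end on a small made-up kernel; the kernel, array names, and parameter values are placeholders (the tutorial's actual example kernel is defined earlier in doc/tutorial.rst and uses parameters n, m, l and arrays a, b, c, e, g, h), and only the statistics calls mirror what the doctests above exercise::

    import numpy as np
    import loopy as lp

    # Hypothetical stand-in kernel; not the tutorial's example kernel.
    knl = lp.make_kernel(
            "{[i, j]: 0<=i<n and 0<=j<m}",
            "out[i, j] = a[i, j]*b[i, j] + a[i, j]",
            assumptions="n>0 and m>0")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    param_dict = {"n": 1024, "m": 512}

    # Symbolic operation counts, keyed by Op(dtype, name).
    op_map = lp.get_op_map(knl)
    print(lp.stringify_stats_mapping(op_map))

    # Collapse all key fields except the dtype, then evaluate the piecewise
    # quasi-polynomial count at concrete parameter values.
    op_map_dtype = op_map.group_by("dtype")
    f32_ops = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(param_dict)
    print("float32 ops:", f32_ops)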
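The memory-access doctests follow the same pattern through lp.get_mem_access_map, ToCountMap.to_bytes, filter_by, and group_by. A minimal sketch, reusing the same hypothetical kernel and parameter values as above::

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i, j]: 0<=i<n and 0<=j<m}",
            "out[i, j] = a[i, j]*b[i, j] + a[i, j]",
            assumptions="n>0 and m>0")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    param_dict = {"n": 1024, "m": 512}

    # Access counts keyed by MemAccess(mtype, dtype, stride, direction, variable).
    mem_map = lp.get_mem_access_map(knl)
    print(lp.stringify_stats_mapping(mem_map))

    # Weight each access by its dtype's size in bytes, keep only global
    # accesses, and separate loads from stores.
    bytes_map = mem_map.to_bytes()
    global_ld_st_bytes = bytes_map.filter_by(mtype=["global"]
            ).group_by("direction")
    loaded = global_ld_st_bytes[lp.MemAccess(direction="load")
            ].eval_with_dict(param_dict)
    stored = global_ld_st_bytes[lp.MemAccess(direction="store")
            ].eval_with_dict(param_dict)
    print("bytes loaded:", loaded, "bytes stored:", stored)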
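The last two hunks contrast a parallelization in which consecutive work items touch consecutive array entries with one in which they do not; the difference surfaces in the stride field of the MemAccess keys. A sketch of that comparison, under the same assumptions as the snippets above (the split length 128 and the choice of iname are arbitrary)::

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i, j]: 0<=i<n and 0<=j<m}",
            "out[i, j] = a[i, j]*b[i, j] + a[i, j]",
            assumptions="n>0 and m>0")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    # Fastest-varying index mapped to local axis 0: consecutive work items
    # read consecutive array elements.
    knl_consec = lp.split_iname(knl, "j", 128,
            outer_tag="l.1", inner_tag="l.0")
    print(lp.stringify_stats_mapping(lp.get_mem_access_map(knl_consec)))

    # Swapped tags: consecutive work items along local axis 0 now step
    # through memory with a large stride.
    knl_nonconsec = lp.split_iname(knl, "j", 128,
            outer_tag="l.0", inner_tag="l.1")
    print(lp.stringify_stats_mapping(lp.get_mem_access_map(knl_nonconsec)))

In the updated doctest output above, this is the stride field changing from 1 to 128 between the two parallelizations.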