From 6f7ba19a8b9ab0a494df9e56661e0aa13e950e23 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Fri, 11 Nov 2011 04:34:19 -0500 Subject: [PATCH] Many more fixes to subst-precompute. --- MEMO | 5 +- loopy/__init__.py | 9 +- loopy/cse.py | 151 ++++++++++++++++++++--------- loopy/isl_helpers.py | 38 +++++--- loopy/kernel.py | 23 ++--- loopy/subst.py | 2 +- loopy/symbolic.py | 6 +- test/test_advect_dealias.py | 142 --------------------------- test/test_fem_assembly.py | 38 ++++---- test/test_interp_diff.py | 87 ----------------- test/test_linalg.py | 20 ++-- test/test_sem.py | 188 ++++++++++++++++++++++++++++++++++++ 12 files changed, 370 insertions(+), 339 deletions(-) delete mode 100644 test/test_advect_dealias.py delete mode 100644 test/test_interp_diff.py diff --git a/MEMO b/MEMO index cd9014742..d6300e1dc 100644 --- a/MEMO +++ b/MEMO @@ -39,7 +39,8 @@ Things to consider To-do ^^^^^ -- CSE should be more like variable assignment +- What if no universally valid precompute base index expression is found? + (test_intel_matrix_mul with n = 6*16, e.g.?) - Fix all tests @@ -83,6 +84,8 @@ Future ideas Dealt with ^^^^^^^^^^ +- CSE should be more like variable assignment + - Deal with equality constraints. (These arise, e.g., when partitioning a loop of length 16 into 16s.) diff --git a/loopy/__init__.py b/loopy/__init__.py index c899824dc..67b1b2d21 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -421,14 +421,15 @@ def add_prefetch(kernel, var_name, sweep_dims, dim_args=None, new_inames=None, default_tag="l.auto", rule_name=None): if rule_name is None: - rule_name = kernel.make_unique_subst_rule_name("%s_fetch" % var_name) + rule_name = kernel.make_unique_var_name("%s_fetch" % var_name) + + newly_created_vars = set([rule_name]) arg = kernel.arg_dict[var_name] - newly_created_vars = set() parameters = [] - for i in range(len(arg.shape)): - based_on = "%s_fetch_%d" % (var_name, i) + for i in range(arg.dimensions): + based_on = "%s_dim_%d" % (var_name, i) if dim_args is not None and i < len(dim_args): based_on = dim_args[i] diff --git a/loopy/cse.py b/loopy/cse.py index 9f184edda..1db1ac2d5 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -40,38 +40,69 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, sweep_inames, invocation_descriptors): global_footprint_map = None + # {{{ find sweep inames referenced by arguments + processed_sweep_inames = set() for invdesc in invocation_descriptors: - for iname in sweep_inames: if iname in old_arg_names: arg_idx = old_arg_names.index(iname) - processed_sweep_inames.add( + processed_sweep_inames.update( get_dependencies(invdesc.args[arg_idx])) else: processed_sweep_inames.add(iname) - # {{{ construct, check mapping + sweep_inames = list(processed_sweep_inames) + del processed_sweep_inames + + # }}} + + # {{{ duplicate sweep inames + + primed_sweep_inames = [psin+"'" for psin in sweep_inames] + from loopy.isl_helpers import duplicate_axes + dup_sweep_index = kernel.space.dim(dim_type.out) + domain_dup_sweep = duplicate_axes( + kernel.domain, sweep_inames, + primed_sweep_inames) + + prime_sweep_inames = SubstitutionMapper(make_subst_func( + dict((sin, var(psin)) for sin, psin in zip(sweep_inames, primed_sweep_inames)))) + + # }}} + + # {{{ construct arg mapping - map_space = kernel.space + # map goes from substitution arguments to domain_dup_sweep + + for invdesc in invocation_descriptors: + map_space = domain_dup_sweep.get_space() ln = len(arg_names) - rn = kernel.space.dim(dim_type.out) + rn = map_space.dim(dim_type.out) map_space = map_space.add_dims(dim_type.in_, ln) for i, iname in enumerate(arg_names): + # arg names are initially primed, to be replaced with unprimed + # base-0 versions below + map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'") + # map_space: [arg_names] -> [domain](dup_sweep_index)[dup_sweep] + set_space = map_space.move_dims( dim_type.out, rn, dim_type.in_, 0, ln).range() + # set_space: <domain>(dup_sweep_index)<dup_sweep><arg_names> + footprint_map = None from loopy.symbolic import aff_from_expr for uarg_name, arg_val in zip(arg_names, invdesc.args): cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, var(uarg_name+"'") - arg_val)) + aff_from_expr(set_space, + var(uarg_name+"'") - prime_sweep_inames(arg_val))) cns_map = isl.BasicMap.from_constraint(cns) if footprint_map is None: @@ -83,23 +114,26 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, dim_type.in_, 0, dim_type.out, rn, ln) + # footprint_map is back in map_space + if global_footprint_map is None: global_footprint_map = footprint_map else: global_footprint_map = global_footprint_map.union(footprint_map) - # }}} + # }}} - processed_sweep_inames = list(processed_sweep_inames) + if isinstance(global_footprint_map, isl.BasicMap): + global_footprint_map = isl.Map.from_basic_map(global_footprint_map) + global_footprint_map = global_footprint_map.intersect_range(domain_dup_sweep) - global_footprint_map = (isl.Map.from_basic_map(global_footprint_map) - .intersect_range(kernel.domain)) + # {{{ compute bounds indices # move non-sweep-dimensions into parameter space - sweep_footprint_map = global_footprint_map.coalesce() + sweep_footprint_map = global_footprint_map for iname in kernel.all_inames(): - if iname not in processed_sweep_inames: + if iname not in sweep_inames: sp = sweep_footprint_map.get_space() dt, idx = sp.get_var_dict()[iname] sweep_footprint_map = sweep_footprint_map.move_dims( @@ -107,7 +141,7 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, dt, idx, 1) # compute bounding boxes to each set of parameters - sfm_dom = sweep_footprint_map.domain().coalesce() + sfm_dom = sweep_footprint_map.domain() if not sfm_dom.is_bounded(): raise RuntimeError("In precomputation of substitution '%s': " @@ -117,8 +151,9 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, from loopy.kernel import find_var_base_indices_and_shape_from_inames arg_base_indices, shape = find_var_base_indices_and_shape_from_inames( sfm_dom, [uarg+"'" for uarg in arg_names], - kernel.cache_manager) - print arg_names, shape + kernel.cache_manager, context=kernel.assumptions) + + # }}} # compute augmented domain @@ -129,7 +164,7 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, non1_shape = [] for arg_name, bi, l in zip(arg_names, arg_base_indices, shape): - if l > 1: + if l != 1: non1_arg_names.append(arg_name) non1_arg_base_indices.append(bi) non1_shape.append(l) @@ -140,26 +175,28 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, # add the new, base-0 as new in dimensions sp = global_footprint_map.get_space() - tgt_idx = sp.dim(dim_type.out) + arg_idx = sp.dim(dim_type.out) n_args = len(arg_names) nn1_args = len(non1_arg_names) aug_domain = global_footprint_map.move_dims( - dim_type.out, tgt_idx, + dim_type.out, arg_idx, dim_type.in_, 0, n_args).range().coalesce() - aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, nn1_args) + aug_domain = aug_domain.insert_dims(dim_type.set, arg_idx, nn1_args) for i, name in enumerate(non1_arg_names): - aug_domain = aug_domain.set_dim_name(dim_type.set, tgt_idx+i, name) + aug_domain = aug_domain.set_dim_name(dim_type.set, arg_idx+i, name) # index layout now: - # <....out.....> (tgt_idx) <base-0 non-1-length args> <args> + # + # <domain> (dup_sweep_index) <dup_sweep> (arg_index) ... + # ... <base-0 non-1-length args> <all args> from loopy.symbolic import aff_from_expr for arg_name, bi, s in zip(arg_names, arg_base_indices, shape): - if s > 1: + if s != 1: cns = isl.Constraint.equality_from_aff( aff_from_expr(aug_domain.get_space(), var(arg_name) - (var(arg_name+"'") - bi))) @@ -170,15 +207,24 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, # eliminate inames with non-zero base indices - aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+nn1_args, n_args) - aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+nn1_args, n_args) + aug_domain = aug_domain.eliminate(dim_type.set, arg_idx+nn1_args, n_args) + aug_domain = aug_domain.remove_dims(dim_type.set, arg_idx+nn1_args, n_args) base_indices_2, shape_2 = find_var_base_indices_and_shape_from_inames( - aug_domain, non1_arg_names, kernel.cache_manager) + aug_domain, non1_arg_names, kernel.cache_manager, + context=kernel.assumptions) assert base_indices_2 == [0] * nn1_args assert shape_2 == non1_shape + # {{{ eliminate duplicated sweep_inames + + nsweep = len(sweep_inames) + aug_domain = aug_domain.eliminate(dim_type.set, dup_sweep_index, nsweep) + aug_domain = aug_domain.remove_dims(dim_type.set, dup_sweep_index, nsweep) + + # }}} + return (non1_arg_names, aug_domain, arg_base_indices, non1_arg_base_indices, non1_shape) @@ -186,9 +232,12 @@ def get_footprint(kernel, subst_name, old_arg_names, arg_names, -def simplify_via_aff(space, expr): +def simplify_via_aff(expr): from loopy.symbolic import aff_from_expr, aff_to_expr - return aff_to_expr(aff_from_expr(space, expr)) + deps = get_dependencies(expr) + return aff_to_expr(aff_from_expr( + isl.Space.create_from_names(isl.Context(), list(deps)), + expr)) @@ -206,6 +255,10 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], invocation_arg_deps = set() def gather_substs(expr, name, args, rec): + if len(args) != len(subst.arguments): + raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)" + % (subst_name, len(args), len(subst.arguments), )) + arg_deps = get_dependencies(args) if not arg_deps <= kernel.all_inames(): raise RuntimeError("CSE arguments in '%s' do not consist " @@ -219,11 +272,18 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], from loopy.symbolic import SubstitutionCallbackMapper scm = SubstitutionCallbackMapper([subst_name], gather_substs) + + from loopy.symbolic import ParametrizedSubstitutor + rules_except_mine = kernel.substitutions.copy() + del rules_except_mine[subst_name] + subst_expander = ParametrizedSubstitutor(rules_except_mine) + for insn in kernel.instructions: - scm(insn.expression) - for s in kernel.substitutions.itervalues(): - if s is not subst: - scm(s.expression) + # We can't deal with invocations that involve other substitution's + # arguments. Therefore, fully expand each instruction and look at + # the invocations in subst_name occurring there. + + scm(subst_expander(insn.expression)) allowable_sweep_inames = invocation_arg_deps | set(arg_names) if not set(sweep_inames) <= allowable_sweep_inames: @@ -356,19 +416,14 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], # }}} - # {{{ substitute rule into instructions + # {{{ substitute rule into expressions in kernel def do_substs(expr, name, args, rec): - found = False - for invdesc in invocation_descriptors: - if expr is invdesc.expr: - found = True - break - - if not found: - return + if len(args) != len(subst.arguments): + raise ValueError("invocation of '%s' with too few arguments" + % name) - args = [simplify_via_aff(new_domain.get_space(), arg-bi) + args = [simplify_via_aff(arg-bi) for arg, bi in zip(args, non1_arg_base_indices)] new_outer_expr = var(target_var_name) @@ -384,16 +439,16 @@ def precompute(kernel, subst_name, dtype, sweep_inames=[], for insn in kernel.instructions: new_insns.append(insn.copy(expression=sub_map(insn.expression))) + new_substs = dict( + (s.name, s.copy(expression=sub_map(s.expression))) + for s in kernel.substitutions.itervalues() + if s.name != subst_name) + # }}} new_iname_to_tag = kernel.iname_to_tag.copy() - if sweep_inames: - new_iname_to_tag.update(arg_name_to_tag) - - new_substs = dict( - (s.name, s.copy(expression=sub_map(subst.expression))) - for s in kernel.substitutions.itervalues()) - del new_substs[subst_name] + for arg_name in non1_arg_names: + new_iname_to_tag[arg_name] = arg_name_to_tag[arg_name] return kernel.copy( domain=new_domain, diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 2e68cc1d5..a509b8a30 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -153,19 +153,30 @@ def iname_rel_aff(space, iname, rel, aff): -def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what): +def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context): pieces = pw_aff.get_pieces() if len(pieces) == 1: return pieces[0][1] - agg_domain = pw_aff.get_aggregate_domain() + reference = pw_aff.get_aggregate_domain() + + if context is not None: + context = isl.align_spaces(context, pw_aff.get_domain_space()) + reference = reference.intersect(context) for set, candidate_aff in pieces: - if constants_only and not candidate_aff.is_cst(): - continue + for use_gist in [False, True]: + if use_gist: + if context is not None: + candidate_aff = pw_aff.gist(set & context) + else: + candidate_aff = pw_aff.gist(set) + + if constants_only and not candidate_aff.is_cst(): + continue - if set_method(pw_aff, candidate_aff) == agg_domain: - return candidate_aff + if reference <= set_method(pw_aff, candidate_aff): + return candidate_aff raise ValueError("a static %s was not found for PwAff '%s'" % (what, pw_aff)) @@ -173,22 +184,25 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what): -def static_min_of_pw_aff(pw_aff, constants_only): +def static_min_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.ge_set, - "minimum") + "minimum", context) -def static_max_of_pw_aff(pw_aff, constants_only): +def static_max_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.le_set, - "maximum") + "maximum", context) -def static_value_of_pw_aff(pw_aff, constants_only): +def static_value_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set, - "value") + "value", context) def duplicate_axes(isl_obj, duplicate_inames, new_inames): + if not duplicate_inames: + return isl_obj + # {{{ add dims start_idx = isl_obj.dim(dim_type.set) diff --git a/loopy/kernel.py b/loopy/kernel.py index bcd09f820..1c5651a0c 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -153,6 +153,10 @@ class ArrayArg: self.constant_mem = constant_mem + @property + def dimensions(self): + return len(self.shape) + def __repr__(self): return "<ArrayArg '%s' of type %s>" % (self.name, self.dtype) @@ -516,11 +520,11 @@ class LoopKernel(Record): "(?:\|(?P<duplicate_inames_and_tags>[\s\w,:.]*))?" "\])?" "\s*(?:\<(?P<temp_var_type>.+)\>)?" - "\s*(?P<lhs>.+)\s*=\s*(?P<rhs>.+?)" + "\s*(?P<lhs>.+)\s*(?<!\:)=\s*(?P<rhs>.+?)" "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$" ) SUBST_RE = re.compile( - r"^\s*(?P<lhs>.+)\s*:=\s*(?P<rhs>.+?)\s*$" + r"^\s*(?P<lhs>.+)\s*:=\s*(?P<rhs>.+)\s*$" ) def parse_iname_and_tag_list(s): @@ -692,12 +696,6 @@ class LoopKernel(Record): if id_str not in used_ids: return id_str - def make_unique_subst_rule_name(self, based_on="subst"): - from loopy.tools import generate_unique_possibilities - for id_str in generate_unique_possibilities(based_on): - if id_str not in self.substitutions: - return id_str - @memoize_method def all_inames(self): from islpy import dim_type @@ -1103,7 +1101,8 @@ class LoopKernel(Record): -def find_var_base_indices_and_shape_from_inames(domain, inames, cache_manager): +def find_var_base_indices_and_shape_from_inames( + domain, inames, cache_manager, context=None): base_indices = [] shape = [] @@ -1116,9 +1115,11 @@ def find_var_base_indices_and_shape_from_inames(domain, inames, cache_manager): from loopy.symbolic import pw_aff_to_expr shape.append(pw_aff_to_expr(static_max_of_pw_aff( - upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True))) + upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True, + context=context))) base_indices.append(pw_aff_to_expr( - static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False))) + static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False, + context=context))) return base_indices, shape diff --git a/loopy/subst.py b/loopy/subst.py index 1d44ed29d..c89bc5869 100644 --- a/loopy/subst.py +++ b/loopy/subst.py @@ -165,7 +165,7 @@ def extract_subst(kernel, subst_name, template, parameters): for subst in kernel.substitutions.itervalues(): new_substs[subst.name] = subst.copy( - expression=cbmapper(insn.expression)) + expression=cbmapper(subst.expression)) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7804c1cdc..e1be27965 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -601,11 +601,11 @@ class ParametrizedSubstitutor(IdentityMapper): or expr.function.name not in self.rules): return IdentityMapper.map_variable(self, expr) - cse_name = expr.function.name - rule = self.rules[cse_name] + rule_name = expr.function.name + rule = self.rules[rule_name] if len(rule.arguments) != len(expr.parameters): raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)" - % (cse_name, len(expr.parameters), len(rule.arguments), )) + % (rule_name, len(expr.parameters), len(rule.arguments), )) from pymbolic.mapper.substitutor import make_subst_func subst_map = SubstitutionMapper(make_subst_func( diff --git a/test/test_advect_dealias.py b/test/test_advect_dealias.py deleted file mode 100644 index cd0df0da7..000000000 --- a/test/test_advect_dealias.py +++ /dev/null @@ -1,142 +0,0 @@ - -def test_advect(ctx_factory): - - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - N = 8 - M = 8 - - from pymbolic import var - K_sym = var("K") - - field_shape = (N, N, N, K_sym) - interim_field_shape = (M, M, M, K_sym) - - # 1. direction-by-direction similarity transform on u - # 2. invert diagonal - # 3. transform back (direction-by-direction) - - # K - run-time symbolic - knl = lp.make_kernel(ctx.devices[0], - "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}" %M %N - [ - - # interpolate u to integration nodes - "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])", - "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])", - "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])", - - # differentiate u on integration nodes - "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])", - "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])", - "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])", - - # interpolate v to integration nodes - "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])", - "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])", - "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])", - - # differentiate v on integration nodes - "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])", - "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])", - "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])", - - # interpolate w to integration nodes - "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])", - "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])", - "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])", - - # differentiate v on integration nodes - "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])", - "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])", - "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])", - - # find velocity in (r,s,t) coordinates - # QUESTION: should I use CSE here ? - "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]", - "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]", - "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]", - - # form nonlinear term on integration nodes - # QUESTION: should I use CSE here ? - "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]", - "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]", - "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]", - - # L2 project Nu back to Lagrange basis - "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])", - "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])", - "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])", - - # L2 project Nv back to Lagrange basis - "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])", - "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])", - "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])", - - # L2 project Nw back to Lagrange basis - "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])", - "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])", - "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])", - - ], - [ - lp.ArrayArg("u", dtype, shape=field_shape, order=order), - lp.ArrayArg("v", dtype, shape=field_shape, order=order), - lp.ArrayArg("w", dtype, shape=field_shape, order=order), - lp.ArrayArg("INu", dtype, shape=field_shape, order=order), - lp.ArrayArg("INv", dtype, shape=field_shape, order=order), - lp.ArrayArg("INw", dtype, shape=field_shape, order=order), - lp.ArrayArg("D", dtype, shape=(M,M), order=order), - lp.ArrayArg("I", dtype, shape=(M, N), order=order), - lp.ArrayArg("V", dtype, shape=(N, M), order=order), - lp.ScalarArg("K", np.int32, approximately=1000), - ], - name="sem_advect", assumptions="K>=1") - - print knl - 1/0 - - knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - - knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) - - print knl - #1/0 - - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) - - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - c = cl_array.empty_like(a) - refsol = np.dot(a.get(), b.get()) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) - - - - -if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the - # tests. - import pyopencl as cl - - import sys - if len(sys.argv) > 1: - exec(sys.argv[1]) - else: - from py.test.cmdline import main - main([__file__]) diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 7e57d1508..4502fd13b 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -31,58 +31,60 @@ def test_laplacian_stiffness(ctx_factory): Nc_sym = var("Nc") knl = lp.make_kernel(ctx.devices[0], - "[Nc] -> {[K,i,j,q]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d}" - % dict(Nb=Nb, Nq=Nq), + "[Nc] -> {[K,i,j,q, ax_a, ax_b, ax_c]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d " + "and 0<= ax_a, ax_b, ax_c < %(dim)d}" + % dict(Nb=Nb, Nq=Nq, dim=dim), [ - "CSE: dPsi(a,dxi) = jacInv[K,q,0,dxi] * DPsi[a,q,0] " - "+ jacInv[K,q,1,dxi] * DPsi[a,q,1]", + "dPsi(a, dxi) := sum_float32(ax_c," + " jacInv[ax_c,dxi,K,q] * DPsi[ax_c,a,q])", "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * (" "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))" ], [ - lp.ArrayArg("jacInv", dtype, shape=(Nc_sym, Nq, dim, dim), order=order), - lp.ConstantArrayArg("DPsi", dtype, shape=(Nb, Nq, dim), order=order), + lp.ArrayArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order), + lp.ConstantArrayArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order), lp.ArrayArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order), lp.ConstantArrayArg("w", dtype, shape=(Nq, dim), order=order), lp.ArrayArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order), lp.ScalarArg("Nc", np.int32, approximately=1000), ], - name="semlap", assumptions="Nc>=1") + name="lapquad", assumptions="Nc>=1") + #knl = lp.tag_dimensions(knl, dict(ax_c="unr")) seq_knl = knl def variant_1(knl): # no ILP across elements - knl= lp.remove_cse_by_tag(knl, "dPsi") knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], - uni_template="jacInv[Kii +16*Ko,q,x,y]") + knl = lp.add_prefetch(knl, 'jacInv', + ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl def variant_2(knl): # with ILP across elements - knl= lp.remove_cse_by_tag(knl, "dPsi") knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], - uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]") + knl = lp.add_prefetch(knl, "jacInv", + ["jacInv_dim_0", "jacInv_dim_1", "K_inner_inner", "K_inner_outer", "q"]) return knl - def variant_1(knl): + def variant_3(knl): # no ILP across elements, precompute dPsiTransf - knl= lp.realize_cse(knl, "dPsi", ["a", "dxi", ""]) knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], - uni_template="jacInv[Kii +16*Ko,q,x,y]") + knl = lp.precompute(knl, "dPsi", + ["a", "dxi"]) + knl = lp.add_prefetch(knl, "jacInv", + ["jacInv_dim_0", "jacInv_dim_1", "K_inner", "q"]) return knl for variant in [variant_1, variant_2]: + #for variant in [variant_3]: kernel_gen = lp.generate_loop_schedules(variant(knl), - loop_priority=["K", "i", "j"]) + loop_priority=["jacInv_dim_0", "jacInv_dim_1"]) kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, diff --git a/test/test_interp_diff.py b/test/test_interp_diff.py deleted file mode 100644 index eeed86646..000000000 --- a/test/test_interp_diff.py +++ /dev/null @@ -1,87 +0,0 @@ - -def test_interp_diff(ctx_factory): - - dtype = np.float32 - ctx = ctx_factory() - order = "C" - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - N = 8 - M = 8 - - from pymbolic import var - K_sym = var("K") - - field_shape = (N, N, N, K_sym) - interim_field_shape = (M, M, M, K_sym) - - # 1. direction-by-direction similarity transform on u - # 2. invert diagonal - # 3. transform back (direction-by-direction) - - # K - run-time symbolic - knl = lp.make_kernel(ctx.devices[0], - "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N - [ - "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])", - "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])", - "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])", - "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]", - "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])", - "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])", - "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])", - ], - [ - lp.ArrayArg("u", dtype, shape=field_shape, order=order), - lp.ArrayArg("P", dtype, shape=interim_field_shape, order=order), - lp.ArrayArg("I", dtype, shape=(M, N), order=order), - lp.ArrayArg("V", dtype, shape=(N, M), order=order), - lp.ArrayArg("Pu", dtype, shape=field_shape, order=order), - lp.ScalarArg("K", np.int32, approximately=1000), - ], - name="sem_lap_precon", assumptions="K>=1") - - print knl - 1/0 - - knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) - - knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) - - print knl - #1/0 - - kernel_gen = lp.generate_loop_schedules(knl) - kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) - - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - c = cl_array.empty_like(a) - refsol = np.dot(a.get(), b.get()) - - def launcher(kernel, gsize, lsize, check): - evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, - g_times_l=True) - - if check: - check_error(refsol, c.get()) - - return evt - - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) - - - - -if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the - # tests. - import pyopencl as cl - - import sys - if len(sys.argv) > 1: - exec(sys.argv[1]) - else: - from py.test.cmdline import main - main([__file__]) diff --git a/test/test_linalg.py b/test/test_linalg.py index 55f21cf24..48040d2aa 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -222,8 +222,6 @@ def test_plain_matrix_mul(ctx_factory): knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ]) - print lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, {}) @@ -259,7 +257,7 @@ def test_variable_size_matrix_mul(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ - "label: c[i, j] = sum_float32(k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))" + "label: c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), @@ -275,8 +273,8 @@ def test_variable_size_matrix_mul(ctx_factory): outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) - knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) - knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"]) + knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) + knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) @@ -362,9 +360,9 @@ def test_rank_one(ctx_factory): knl = lp.split_dimension(knl, "j_inner", 16, inner_tag="l.1") - knl = lp.split_dimension(knl, "a_fetch_0", 16, + knl = lp.split_dimension(knl, "a_dim_0", 16, outer_tag="l.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "b_fetch_0", 16, + knl = lp.split_dimension(knl, "b_dim_0", 16, outer_tag="l.1", inner_tag="l.0") return knl @@ -443,7 +441,7 @@ def test_intel_matrix_mul(ctx_factory): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = 6*16 + n = 128+32 knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, @@ -640,11 +638,9 @@ def test_image_matrix_mul_ilp(ctx_factory): knl = lp.split_dimension(knl, "j", ilp*j_inner_split, outer_tag="g.1") knl = lp.split_dimension(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 2) - # conflict-free + # conflict-free? knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"], - new_inames=["b_j_io", "b_j_ii", "b_k_i"]) - knl = lp.join_dimensions(knl, ["b_j_io", "b_j_ii"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) diff --git a/test/test_sem.py b/test/test_sem.py index a5970e24b..d42e030ec 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -11,6 +11,8 @@ from pyopencl.tools import pytest_generate_tests_for_pyopencl \ def test_laplacian(ctx_factory): + 1/0 # not adapted to new language + dtype = np.float32 ctx = ctx_factory() order = "C" @@ -109,6 +111,8 @@ def test_laplacian(ctx_factory): def test_laplacian_lmem(ctx_factory): + 1/0 # not adapted to new language + dtype = np.float32 ctx = ctx_factory() order = "C" @@ -181,6 +185,8 @@ def test_laplacian_lmem(ctx_factory): def test_advect(ctx_factory): + 1/0 # not ready + dtype = np.float32 ctx = ctx_factory() @@ -272,6 +278,188 @@ def test_advect(ctx_factory): +def test_advect_dealias(ctx_factory): + 1/0 # not ready + + dtype = np.float32 + ctx = ctx_factory() + order = "C" + + N = 8 + M = 8 + + from pymbolic import var + K_sym = var("K") + + field_shape = (N, N, N, K_sym) + interim_field_shape = (M, M, M, K_sym) + + # 1. direction-by-direction similarity transform on u + # 2. invert diagonal + # 3. transform back (direction-by-direction) + + # K - run-time symbolic + knl = lp.make_kernel(ctx.devices[0], + "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}" %M %N + [ + + # interpolate u to integration nodes + "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])", + "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])", + "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])", + + # differentiate u on integration nodes + "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])", + "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])", + "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])", + + # interpolate v to integration nodes + "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])", + "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])", + "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])", + + # differentiate v on integration nodes + "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])", + "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])", + "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])", + + # interpolate w to integration nodes + "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])", + "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])", + "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])", + + # differentiate v on integration nodes + "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])", + "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])", + "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])", + + # find velocity in (r,s,t) coordinates + # QUESTION: should I use CSE here ? + "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]", + "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]", + "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]", + + # form nonlinear term on integration nodes + # QUESTION: should I use CSE here ? + "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]", + "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]", + "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]", + + # L2 project Nu back to Lagrange basis + "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])", + "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])", + "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])", + + # L2 project Nv back to Lagrange basis + "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])", + "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])", + "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])", + + # L2 project Nw back to Lagrange basis + "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])", + "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])", + "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])", + + ], + [ + lp.ArrayArg("u", dtype, shape=field_shape, order=order), + lp.ArrayArg("v", dtype, shape=field_shape, order=order), + lp.ArrayArg("w", dtype, shape=field_shape, order=order), + lp.ArrayArg("INu", dtype, shape=field_shape, order=order), + lp.ArrayArg("INv", dtype, shape=field_shape, order=order), + lp.ArrayArg("INw", dtype, shape=field_shape, order=order), + lp.ArrayArg("D", dtype, shape=(M,M), order=order), + lp.ArrayArg("I", dtype, shape=(M, N), order=order), + lp.ArrayArg("V", dtype, shape=(N, M), order=order), + lp.ScalarArg("K", np.int32, approximately=1000), + ], + name="sem_advect", assumptions="K>=1") + + print knl + 1/0 + + knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) + + knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) + + print knl + #1/0 + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) + + + K = 1000 + lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + op_count=0, + op_label="GFlops", + parameters={"K": K}, print_seq_code=True,) + + + + +def test_interp_diff(ctx_factory): + 1/0 # not ready + dtype = np.float32 + ctx = ctx_factory() + order = "C" + + N = 8 + M = 8 + + from pymbolic import var + K_sym = var("K") + + field_shape = (N, N, N, K_sym) + interim_field_shape = (M, M, M, K_sym) + + # 1. direction-by-direction similarity transform on u + # 2. invert diagonal + # 3. transform back (direction-by-direction) + + # K - run-time symbolic + knl = lp.make_kernel(ctx.devices[0], + "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N + [ + "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])", + "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])", + "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])", + "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]", + "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])", + "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])", + "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])", + ], + [ + lp.ArrayArg("u", dtype, shape=field_shape, order=order), + lp.ArrayArg("P", dtype, shape=interim_field_shape, order=order), + lp.ArrayArg("I", dtype, shape=(M, N), order=order), + lp.ArrayArg("V", dtype, shape=(N, M), order=order), + lp.ArrayArg("Pu", dtype, shape=field_shape, order=order), + lp.ScalarArg("K", np.int32, approximately=1000), + ], + name="sem_lap_precon", assumptions="K>=1") + + print knl + 1/0 + + knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) + + knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) + + print knl + #1/0 + + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) + + lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + op_count=0, + op_label="GFlops", + parameters={"K": K}, print_seq_code=True,) + + + + if __name__ == "__main__": import sys if len(sys.argv) > 1: -- GitLab