From 9956ff0566adec2e080073502d19ca7aa5547877 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 3 Nov 2011 03:32:28 -0400 Subject: [PATCH] Find insn iname deps by fixed point iteration. Dot dependency graphing. Schedule improvements. insn.all_inames() -> kernel.insn_inames(insn) Scheduling: - Only schedule referenced inames. - Only work towards insns that have their dependencies satisfied. --- loopy/__init__.py | 5 +- loopy/check.py | 20 ++--- loopy/codegen/instruction.py | 4 +- loopy/cse.py | 11 +-- loopy/kernel.py | 156 ++++++++++++++++++++++++++++++----- loopy/preprocess.py | 54 +++--------- loopy/schedule.py | 20 +++-- loopy/symbolic.py | 22 ++++- test/test_sem.py | 26 +++--- 9 files changed, 211 insertions(+), 107 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index d952538a3..a66b9958f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,7 +22,7 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ImageArg -from loopy.kernel import AutoFitLocalIndexTag +from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph from loopy.cse import realize_cse from loopy.preprocess import preprocess_kernel from loopy.schedule import generate_loop_schedules @@ -31,6 +31,7 @@ from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq from loopy.check import check_kernels __all__ = ["ScalarArg", "ArrayArg", "ImageArg", + "get_dot_dependency_graph", "preprocess_kernel", "generate_loop_schedules", "generate_code", "CompiledKernel", "drive_timing_run", "check_kernels", @@ -155,7 +156,7 @@ def make_kernel(*args, **kwargs): from pymbolic.primitives import Variable for index_expr in insn.get_assignee_indices(): if (not isinstance(index_expr, Variable) - or not index_expr.name in insn.all_inames()): + or not index_expr.name in knl.insn_inames(insn)): raise RuntimeError( "only plain inames are allowed in " "the lvalue index when declaring the " diff --git a/loopy/check.py b/loopy/check.py index 8eb853035..d8c63a302 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -21,7 +21,7 @@ def check_for_unused_hw_axes_in_insns(kernel): group_axes_used = set() local_axes_used = set() - for iname in insn.all_inames(): + for iname in kernel.insn_inames(insn): tag = kernel.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): @@ -53,7 +53,7 @@ def check_for_double_use_of_hw_axes(kernel): for insn in kernel.instructions: insn_tag_keys = set() - for iname in insn.all_inames(): + for iname in kernel.insn_inames(insn): tag = kernel.iname_to_tag.get(iname) if isinstance(tag, UniqueTag): key = tag.key @@ -74,7 +74,7 @@ def check_for_inactive_iname_access(kernel): expression_indices = depmap(insn.expression) expression_inames = expression_indices & kernel.all_inames() - if not expression_inames <= insn.all_inames(): + if not expression_inames <= kernel.insn_inames(insn): raise RuntimeError( "instructiosn '%s' references " "inames that the instruction does not depend on" @@ -100,7 +100,7 @@ def check_for_write_races(kernel): assignee_indices = set(strip_var(index) for index in assignee_indices) assignee_inames = assignee_indices & kernel.all_inames() - if not assignee_inames <= insn.all_inames(): + if not assignee_inames <= kernel.insn_inames(insn): raise RuntimeError( "assignee of instructiosn '%s' references " "iname that the instruction does not depend on" @@ -114,7 +114,7 @@ def check_for_write_races(kernel): parallel_insn_inames = set( iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) inames_without_write_dep = parallel_insn_inames - ( @@ -125,7 +125,7 @@ def check_for_write_races(kernel): if temp_var.is_local == True: local_parallel_insn_inames = set( iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), ParallelTag) and not isinstance(kernel.iname_to_tag.get(iname), GroupIndexTag)) @@ -135,7 +135,7 @@ def check_for_write_races(kernel): elif temp_var.is_local == False: ilp_inames = set( iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), IlpTag)) inames_without_write_dep = ilp_inames - ( @@ -204,10 +204,10 @@ def check_implemented_domains(kernel, implemented_domains): insn_impl_domain = insn_impl_domain | idomain insn_impl_domain = ( (insn_impl_domain & assumptions) - .project_out_except(insn.all_inames(), [dim_type.set])) + .project_out_except(kernel.insn_inames(insn), [dim_type.set])) desired_domain = ((kernel.domain & assumptions) - .project_out_except(insn.all_inames(), [dim_type.set])) + .project_out_except(kernel.insn_inames(insn), [dim_type.set])) if insn_impl_domain != desired_domain: i_minus_d = insn_impl_domain - desired_domain @@ -228,7 +228,7 @@ def check_implemented_domains(kernel, implemented_domains): iname_to_dim = pt.get_space().get_var_dict() point_axes = [] - for iname in insn.all_inames() | parameter_inames: + for iname in kernel.insn_inames(insn) | parameter_inames: tp, dim = iname_to_dim[iname] point_axes.append("%s=%d" % (iname, pt.get_coordinate(tp, dim))) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index e67898ebd..688ae2c7f 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -52,7 +52,7 @@ def generate_ilp_instances(kernel, insn, codegen_state): # {{{ pass 2: treat all ILP dimensions - for iname in insn.all_inames(): + for iname in kernel.insn_inames(insn): tag = kernel.iname_to_tag.get(iname) if not isinstance(tag, IlpTag): @@ -99,7 +99,7 @@ def generate_instruction_code(kernel, insn, codegen_state): insn_code = Assign(ccm(insn.assignee), ccm(insn.expression)) from loopy.codegen.bounds import wrap_in_bounds_checks insn_code, impl_domain = wrap_in_bounds_checks( - ccm, kernel.domain, insn.all_inames(), ilpi.implemented_domain, + ccm, kernel.domain, kernel.insn_inames(insn), ilpi.implemented_domain, insn_code) result.append(GeneratedInstruction( diff --git a/loopy/cse.py b/loopy/cse.py index e76145292..ab576a701 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -283,13 +283,11 @@ def make_compute_insn(kernel, cse_tag, lead_expr, target_var_name, # {{{ decide whether to force a dep - forced_iname_deps = set() - from loopy.symbolic import IndexVariableFinder dependencies = IndexVariableFinder( include_reduction_inames=False)(lead_expr) - parent_inames = insn.all_inames() | insn.reduction_inames() + parent_inames = kernel.insn_inames(insn) | insn.reduction_inames() #print dependencies, parent_inames #assert dependencies <= parent_inames @@ -326,8 +324,7 @@ def make_compute_insn(kernel, cse_tag, lead_expr, target_var_name, return Instruction( id=kernel.make_unique_instruction_id(based_on=insn_prefix+"_compute"), assignee=assignee, - expression=new_inner_expr, - forced_iname_deps=forced_iname_deps) + expression=new_inner_expr) @@ -499,9 +496,7 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], for insn in kernel.instructions: new_expr = cse_cb_mapper(insn.expression) - new_insns.append(insn.copy( - expression=new_expr, - forced_iname_deps=insn.all_inames())) + new_insns.append(insn.copy(expression=new_expr)) # }}} diff --git a/loopy/kernel.py b/loopy/kernel.py index ee1bcba47..53ef81e68 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -259,8 +259,8 @@ class Instruction(Record): def reduction_inames(self): def map_reduction(expr, rec): rec(expr.expr) - for iname in expr.inames: - result.add(iname.lstrip("@")) + for iname in expr.untagged_inames: + result.add(iname) from loopy.symbolic import ReductionCallbackMapper cb_mapper = ReductionCallbackMapper(map_reduction) @@ -270,19 +270,9 @@ class Instruction(Record): return result - @memoize_method - def all_inames(self): - """Does not (!) include reduction inames.""" - - from loopy.symbolic import IndexVariableFinder - ivarf = IndexVariableFinder(include_reduction_inames=False) - index_vars = (ivarf(self.expression) | ivarf(self.assignee)) - - return index_vars | set(self.forced_iname_deps) - def __str__(self): - result = "%s: %s <- %s\n [%s]" % (self.id, - self.assignee, self.expression, ", ".join(sorted(self.all_inames()))) + result = "%s: %s <- %s" % (self.id, + self.assignee, self.expression) if self.boostable == True: result += " (boostable)" @@ -658,6 +648,76 @@ class LoopKernel(Record): if id_str not in used_ids: return id_str + @memoize_method + def all_inames(self): + from islpy import dim_type + return set(self.space.get_var_dict(dim_type.set).iterkeys()) + + @memoize_method + def all_insn_inames(self): + from loopy.symbolic import get_dependencies + + insn_id_to_inames = {} + insn_assignee_inames = {} + + for insn in self.instructions: + read_deps = get_dependencies(insn.expression) + write_deps = get_dependencies(insn.assignee) + deps = read_deps | write_deps + + iname_deps = ( + deps & self.all_inames() + | insn.forced_iname_deps) + + insn_id_to_inames[insn.id] = iname_deps + insn_assignee_inames[insn.id] = write_deps & self.all_inames() + + writers = self.find_writers() + temp_var_names = set(self.temporary_variables.iterkeys()) + + # fixed point iteration until all iname dep sets have converged + while True: + did_something = False + for insn in self.instructions: + for tv_name in (get_dependencies(insn.expression) + & temp_var_names): + implicit_inames = None + + for writer_id in writers[tv_name]: + writer_implicit_inames = ( + insn_id_to_inames[writer_id] + - insn_assignee_inames[writer_id]) + if implicit_inames is None: + implicit_inames = writer_implicit_inames + else: + implicit_inames = (implicit_inames + & writer_implicit_inames) + + inames_old = insn_id_to_inames[insn.id] + inames_new = inames_old | implicit_inames + insn_id_to_inames[insn.id] = inames_new + + if inames_new != inames_old: + did_something = True + + if not did_something: + break + + return insn_id_to_inames + + @memoize_method + def all_referenced_inames(self): + result = set() + for inames in self.all_insn_inames().itervalues(): + result.update(inames) + return result + + def insn_inames(self, insn): + if isinstance(insn, str): + return self.all_insn_inames()[insn] + else: + return self.all_insn_inames()[insn.id] + @property @memoize_method def sequential_inames(self): @@ -680,6 +740,44 @@ class LoopKernel(Record): return result + def find_readers(self): + """ + :return: a dict that maps variable names to ids of insns that + read that variable. + """ + result = {} + + admissible_vars = ( + set(arg.name for arg in self.args) + | set(self.temporary_variables.iterkeys())) + + for insn in self.instructions: + for var_name in insn.get_read_var_names() & admissible_vars: + result.setdefault(var_name, set()).add(insn.id) + + def find_writers(self): + """ + :return: a dict that maps variable names to ids of insns that + write to that variable. + """ + result = {} + + admissible_vars = ( + set(arg.name for arg in self.args) + | set(self.temporary_variables.iterkeys())) + + for insn in self.instructions: + var_name = insn.get_assignee_var_name() + + if var_name not in admissible_vars: + raise RuntimeError("writing to '%s' is not allowed" % var_name) + var_names = [var_name] + + for var_name in var_names: + result.setdefault(var_name, set()).add(insn.id) + + return result + @property @memoize_method def iname_to_dim(self): @@ -729,11 +827,6 @@ class LoopKernel(Record): return [arg.name for arg in self.args if isinstance(arg, ScalarArg) if arg.name in loop_arg_names] - @memoize_method - def all_inames(self): - from islpy import dim_type - return set(self.space.get_var_dict(dim_type.set).iterkeys()) - @memoize_method def get_iname_bounds(self, iname): dom_intersect_assumptions = ( @@ -771,7 +864,7 @@ class LoopKernel(Record): def get_grid_sizes(self, ignore_auto=False): all_inames_by_insns = set() for insn in self.instructions: - all_inames_by_insns |= insn.all_inames() + all_inames_by_insns |= self.insn_inames(insn) if not all_inames_by_insns <= self.all_inames(): raise RuntimeError("inames collected from instructions (%s) " @@ -888,6 +981,7 @@ class LoopKernel(Record): lines.append("") for insn in self.instructions: lines.append(str(insn)) + lines.append(" [%s]" % ",".join(sorted(self.insn_inames(insn)))) return "\n".join(lines) @@ -917,4 +1011,26 @@ def find_var_base_indices_and_shape_from_inames(domain, inames): +def get_dot_dependency_graph(kernel, iname_cluster=False, iname_edge=True): + lines = [] + for insn in kernel.instructions: + lines.append("%s [shape=\"box\"];" % insn.id) + for dep in insn.insn_deps: + lines.append("%s -> %s;" % (dep, insn.id)) + + if iname_edge: + for iname in kernel.insn_inames(insn): + lines.append("%s -> %s [style=\"dotted\"];" % (iname, insn.id)) + + if iname_cluster: + for iname in kernel.all_inames(): + lines.append("subgraph cluster_%s { label=\"%s\" %s }" % (iname, iname, + " ".join(insn.id for insn in kernel.instructions + if iname in kernel.insn_inames(insn)))) + + return "digraph loopy_deps {\n%s\n}" % "\n".join(lines) + + + + # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index eede44a43..ca6b0d0c6 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -12,7 +12,7 @@ def mark_local_temporaries(kernel): new_temp_vars = {} from loopy.kernel import LocalIndexTagBase - writers = find_accessors(kernel, readers=False) + writers = kernel.find_writers() from loopy.symbolic import get_dependencies @@ -65,11 +65,10 @@ def duplicate_reduction_inames(kernel): from pymbolic.mapper.substitutor import make_subst_func from pymbolic import var - old_inames = [iname.lstrip("@") for iname in reduction_expr.inames] subst_dict = dict( (old_iname, var(new_iname)) for old_iname, new_iname in zip( - old_inames, new_red_inames)) + reduction_expr.untagged_inames, new_red_inames)) subst_map = SubstitutionMapper(make_subst_func(subst_dict)) child = subst_map(child) @@ -118,7 +117,7 @@ def realize_reduction(kernel): # {{{ see if this reduction is nested inside some ILP loops ilp_inames = [iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), IlpTag)] from loopy.isl_helpers import static_max_of_pw_aff @@ -158,7 +157,7 @@ def realize_reduction(kernel): based_on="%s_%s_init" % (insn.id, "_".join(expr.inames)), extra_used_ids=set(ni.id for ni in new_insns)), assignee=target_var, - forced_iname_deps=insn.all_inames() - set(expr.inames), + forced_iname_deps=kernel.insn_inames(insn) - set(expr.inames), expression=expr.operation.neutral_element) new_insns.append(init_insn) @@ -170,7 +169,7 @@ def realize_reduction(kernel): assignee=target_var, expression=expr.operation(target_var, sub_expr), insn_deps=set([init_insn.id]) | insn.insn_deps, - forced_iname_deps=insn.all_inames() | set(expr.inames)) + forced_iname_deps=kernel.insn_inames(insn) | set(expr.inames)) new_insns.append(reduction_insn) @@ -190,7 +189,7 @@ def realize_reduction(kernel): expression=new_expression, insn_deps=insn.insn_deps | new_insn_insn_deps, - forced_iname_deps=insn.all_inames()) + forced_iname_deps=kernel.insn_inames(insn)) new_insns.append(new_insn) @@ -202,39 +201,8 @@ def realize_reduction(kernel): # {{{ automatic dependencies, find boostability of instructions -def find_accessors(kernel, readers): - """ - :arg readers: whether to find insns that read or that write - the variables in question. - :return: a dict that maps variable names to ids of insns that - write to that variable. - """ - result = {} - - admissible_vars = ( - set(arg.name for arg in kernel.args) - | set(kernel.temporary_variables.iterkeys())) - - for insn in kernel.instructions: - if readers: - var_names = insn.get_read_var_names() & admissible_vars - else: - var_name = insn.get_assignee_var_name() - - if var_name not in admissible_vars: - raise RuntimeError("writing to '%s' is not allowed" % var_name) - var_names = [var_name] - - for var_name in var_names: - result.setdefault(var_name, set()).add(insn.id) - - return result - - - - def add_boostability_and_automatic_dependencies(kernel): - writer_map = find_accessors(kernel, readers=False) + writer_map = kernel.find_writers() arg_names = set(arg.name for arg in kernel.args) @@ -363,7 +331,7 @@ def get_axis_0_ranking(kernel, insn): from loopy.kernel import AutoLocalIndexTagBase axis0_candidates = set( iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase)) @@ -427,7 +395,7 @@ def get_axis_0_ranking(kernel, insn): + vote_strength) if saw_relevant_access: - return sorted((iname for iname in insn.all_inames()), + return sorted((iname for iname in kernel.insn_inames(insn)), key=lambda iname: vote_count_for_l0.get(iname, 0), reverse=True) else: @@ -524,7 +492,7 @@ def assign_automatic_axes(kernel, phase="axis0", local_size=None): for insn in kernel.instructions: auto_axis_inames = [ iname - for iname in insn.all_inames() + for iname in kernel.insn_inames(insn) if isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase)] @@ -533,7 +501,7 @@ def assign_automatic_axes(kernel, phase="axis0", local_size=None): assigned_local_axes = set() - for iname in insn.all_inames(): + for iname in kernel.insn_inames(insn): tag = kernel.iname_to_tag.get(iname) if isinstance(tag, LocalIndexTag): assigned_local_axes.add(tag.axis) diff --git a/loopy/schedule.py b/loopy/schedule.py index 7b8524cf9..b27069670 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -141,7 +141,7 @@ def find_used_inames_within(kernel, sched_index): result = set() for sched_item in run_insns: - result.update(kernel.id_to_insn[sched_item.insn_id].all_inames()) + result.update(kernel.insn_inames(sched_item.insn_id)) return result @@ -218,11 +218,14 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): # {{{ see if any insn can be scheduled now unscheduled_insn_ids = list(all_insn_ids - scheduled_insn_ids) + insns_with_satisfied_deps = set() for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] schedule_now = set(insn.insn_deps) <= scheduled_insn_ids + if schedule_now: + insns_with_satisfied_deps.add(insn_id) if not schedule_now: if debug_mode: @@ -242,7 +245,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): for active_loop_count in xrange(len(active_inames), -1, -1): outer_active_inames = set(active_inames[:active_loop_count]) if ( - insn.all_inames() - parallel_inames + kernel.insn_inames(insn) - parallel_inames <= outer_active_inames - parallel_inames): @@ -257,7 +260,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): else: print ("instruction '%s' is missing inames '%s'" % (insn.id, ",".join( - (insn.all_inames() - parallel_inames) + (kernel.insn_inames(insn) - parallel_inames) - (outer_active_inames - parallel_inames)))) @@ -266,7 +269,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): # the exactly correct set of loops. schedule_now = schedule_now and ( - insn.all_inames() - parallel_inames + kernel.insn_inames(insn) - parallel_inames == active_inames_set - parallel_inames) @@ -289,7 +292,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): # {{{ see if any loop can be entered now - available_loops = (kernel.all_inames() + available_loops = (kernel.all_referenced_inames() # loops can only be entered once - entered_inames # there's no notion of 'entering' a parallel loop @@ -306,8 +309,11 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): hypothetical_active_loops = active_inames_set | set([iname]) for insn_id in unscheduled_insn_ids: + if insn_id not in insns_with_satisfied_deps: + continue + insn = kernel.id_to_insn[insn_id] - if hypothetical_active_loops <= insn.all_inames(): + if hypothetical_active_loops <= kernel.insn_inames(insn): useful = True break @@ -365,7 +371,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[]): can_leave = True for insn_id in unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] - if last_entered_loop in insn.all_inames(): + if last_entered_loop in kernel.insn_inames(insn): can_leave = False break diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6ecf02360..21ff0bd16 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2,6 +2,8 @@ from __future__ import division +from pytools import memoize, memoize_method + from pymbolic.primitives import AlgebraicLeaf from pymbolic.mapper import ( CombineMapper as CombineMapperBase, @@ -55,6 +57,16 @@ class Reduction(AlgebraicLeaf): def stringifier(self): return StringifyMapper + @property + @memoize_method + def untagged_inames(self): + return tuple(iname.lstrip("@") for iname in self.inames) + + @property + @memoize_method + def untagged_inames_set(self): + return set(self.untagged_inames) + mapper_method = intern("map_reduction") # }}} @@ -82,7 +94,9 @@ class StringifyMapper(StringifyMapperBase): class DependencyMapper(DependencyMapperBase): def map_reduction(self, expr): - return self.rec(expr.expr) + from pymbolic.primitives import Variable + return (self.rec(expr.expr) + - set(Variable(iname) for iname in expr.untagged_inames)) class BidirectionalUnifier(BidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -552,14 +566,13 @@ class IndexVariableFinder(CombineMapper): def map_reduction(self, expr): result = self.rec(expr.expr) - real_inames = set(iname.lstrip("@") for iname in expr.inames) - if not (real_inames & result): + if not (expr.untagged_inames_set & result): raise RuntimeError("reduction '%s' does not depend on " "reduction inames (%s)" % (expr, ",".join(expr.inames))) if self.include_reduction_inames: return result else: - return result - real_inames + return result - expr.untagged_inames_set # }}} @@ -645,6 +658,7 @@ class PrimeAdder(IdentityMapper): # }}} +@memoize def get_dependencies(expr): from loopy.symbolic import DependencyMapper dep_mapper = DependencyMapper(composite_leaves=False) diff --git a/test/test_sem.py b/test/test_sem.py index 6b3ac68b8..7212860d0 100644 --- a/test/test_sem.py +++ b/test/test_sem.py @@ -304,21 +304,25 @@ def test_sem_3d(ctx_factory): name="semlap", assumptions="K>=1") - knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") - knl = lp.add_prefetch(knl, "D", ["m", "j"]) - knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[e,i,j,k]") - knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"]) - knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"]) - knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"]) - - seq_knl = knl - print seq_knl - #print lp.preprocess_kernel(seq_knl) - 1/0 + def add_pf(knl): + knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]") + knl = lp.add_prefetch(knl, "D", ["m", "j"]) + knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[e,i,j,k]") + knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"]) + knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"]) + knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"]) + + seq_knl = add_pf(knl) knl = lp.split_dimension(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) #knl = lp.split_dimension(knl, "e_inner", 4, inner_tag="ilp") + knl = add_pf(knl) + #print seq_knl + #print lp.preprocess_kernel(seq_knl) + #1/0 + + knl = lp.tag_dimensions(knl, dict(i="l.0", j="l.1")) kernel_gen = lp.generate_loop_schedules(knl, -- GitLab