diff --git a/MEMO b/MEMO index 05eaf2de75a810bd9ff726646ac8ae53d56bd382..c13497a091cdbcd4e70448f9277e56bceadcb977 100644 --- a/MEMO +++ b/MEMO @@ -36,19 +36,17 @@ Things to consider - Loopy as a data model for implementing custom rewritings +- We won't generate WAW (write-after-write) barrier-needing dependencies + from one instruction to itself. + To-do ^^^^^ - Just touching a variable written to by a non-idempotent instruction makes that instruction also not idempotent -- assert dependencies <= parent_inames in loopy/__init__.py - ??? - - user interface for dim length prescription -- Way too many barriers in SEM test. - - Deal with equality constraints. (These arise, e.g., when partitioning a loop of length 16 into 16s.) @@ -91,6 +89,9 @@ Future ideas Dealt with ^^^^^^^^^^ +- assert dependencies <= parent_inames in loopy/__init__.py + -> Yes, this must be the case. + - Give a good error message if a parameter assignment in get_problems() is missing. diff --git a/loopy/__init__.py b/loopy/__init__.py index 46f6a76555afcf51c65cb4b2fcc7d7283fbf397a..f0fe3f0fda31cbc07d0d3b295dc21d8c0d12f011 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -234,8 +234,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non dependencies = IndexVariableFinder( include_reduction_inames=False)(expr.child) - # FIXME: can happen with - # assert dependencies <= parent_inames + assert dependencies <= parent_inames for iname in parent_inames: if iname in duplicate_inames: diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 788625e0ee7cf6e0229cdf30a7b2776bacdfbad6..f4e1d1c0a619ae6fdcd863bc30c719f43b1de57e 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -55,8 +55,14 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): elif isinstance(sched_item, Barrier): from loopy.codegen import GeneratedInstruction from cgen import Statement as S + + if sched_item.comment: + comment = " /* %s */" % sched_item.comment + else: + 
comment = "" + return GeneratedInstruction( - ast=S("barrier(CLK_LOCAL_MEM_FENCE)"), + ast=S("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment), implemented_domain=None) elif isinstance(sched_item, RunInstruction): diff --git a/loopy/kernel.py b/loopy/kernel.py index 1cc356f77d4d19a4c9c334a1de2bce49a83b4f63..db1a326a7c79c2be2527eb7879da1bab597e80ce 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -310,6 +310,12 @@ class Instruction(Record): return result + @memoize_method + def get_read_var_names(self): + from loopy.symbolic import DependencyMapper + return set(var.name for var in + DependencyMapper(composite_leaves=False)(self.expression)) + # }}} # {{{ reduction operations @@ -774,6 +780,13 @@ class LoopKernel(Record): return tup_to_exprs(grid_size), tup_to_exprs(group_size) + @memoize_method + def local_var_names(self): + return set( + tv.name + for tv in self.temporary_variables.itervalues() + if tv.is_local) + def local_mem_use(self): return sum(lv.nbytes for lv in self.temporary_variables.itervalues() if lv.is_local) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 990a4fc7f2cb526b73a7afa514f9e571c85c0c40..85ecca37d997a8f98abab945291acaf2f63be6eb 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -193,8 +193,7 @@ def find_accessors(kernel, readers): for insn in kernel.instructions: if readers: - from loopy.symbolic import DependencyMapper - var_names = DependencyMapper()(insn.expression) & admissible_vars + var_names = insn.get_read_var_names() & admissible_vars else: var_name = insn.get_assignee_var_name() diff --git a/loopy/schedule.py b/loopy/schedule.py index 07ce6c8cbff67ce04ca8fc06fa6545caec4932f1..d9e52dfa8aedacf003f3b3190dc57ff5dacb2297 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -17,7 +17,7 @@ class RunInstruction(Record): __slots__ = ["insn_id"] class Barrier(Record): - __slots__ = [] + __slots__ = ["comment"] # }}} @@ -43,12 +43,52 @@ def gather_schedule_subloop(schedule, start_idx): -def 
has_dependent_in_schedule(kernel, insn_id, schedule): - from pytools import any - return any(sched_item - for sched_item in schedule - if isinstance(sched_item, RunInstruction) - and kernel.id_to_insn[sched_item.insn_id].insn_deps) + +def get_barrier_needing_dependency(kernel, target, source): + from loopy.kernel import Instruction + if not isinstance(source, Instruction): + source = kernel.id_to_insn[source] + if not isinstance(target, Instruction): + target = kernel.id_to_insn[target] + + local_vars = kernel.local_var_names() + + tgt_write = set([target.get_assignee_var_name()]) & local_vars + tgt_read = target.get_read_var_names() & local_vars + + src_write = set([source.get_assignee_var_name()]) & local_vars + src_read = source.get_read_var_names() & local_vars + + waw = tgt_write & src_write + raw = tgt_read & src_write + war = tgt_write & src_read + + for var_name in raw | war: + assert source.id in target.insn_deps + return (target, source, var_name) + + if source is target: + return None + + for var_name in waw: + assert (source.id in target.insn_deps + or source is target) + return (target, source, var_name) + + return None + + + + + + +def get_barrier_dependent_in_schedule(kernel, source, schedule): + for sched_item in schedule: + if isinstance(sched_item, RunInstruction): + temp_res = get_barrier_needing_dependency( + kernel, sched_item.insn_id, source) + if temp_res: + return temp_res @@ -289,8 +329,7 @@ def insert_barriers(kernel, schedule, level=0): # dependencies for which the 'normal' mechanism below will generate # barriers. 
- def issue_barrier(is_pre_barrier): - owed_barriers.clear() + def issue_barrier(is_pre_barrier, dep): if result and isinstance(result[-1], Barrier): return @@ -298,8 +337,18 @@ def insert_barriers(kernel, schedule, level=0): if loop_had_barrier[0] or level == 0: return + owed_barriers.clear() + + cmt = None + if dep is not None: + target, source, var = dep + if is_pre_barrier: + cmt = "pre-barrier: %s" % var + else: + cmt = "dependency: %s" % var + loop_had_barrier[0] = True - result.append(Barrier()) + result.append(Barrier(comment=cmt)) i = 0 while i < len(schedule): @@ -313,8 +362,9 @@ def insert_barriers(kernel, schedule, level=0): # (i.e. if anything *in* the loop depends on something beforehand) for insn_id in owed_barriers: - if has_dependent_in_schedule(kernel, insn_id, subloop): - issue_barrier(is_pre_barrier=False) + dep = get_barrier_dependent_in_schedule(kernel, insn_id, subloop) + if dep: + issue_barrier(is_pre_barrier=False, dep=dep) break # }}} @@ -326,9 +376,10 @@ def insert_barriers(kernel, schedule, level=0): if not loop_had_barrier[0]: for insn_id in sub_owed_barriers: - if has_dependent_in_schedule( - kernel, insn_id, schedule): - issue_barrier(is_pre_barrier=True) + dep = get_barrier_dependent_in_schedule( + kernel, insn_id, schedule) + if dep: + issue_barrier(is_pre_barrier=True, dep=dep) # }}} @@ -347,20 +398,23 @@ def insert_barriers(kernel, schedule, level=0): # {{{ issue dependency-based barriers for this instruction - if set(insn.insn_deps) & owed_barriers: - issue_barrier(is_pre_barrier=False) + for dep_src_insn_id in set(insn.insn_deps) & owed_barriers: + dep = get_barrier_needing_dependency(kernel, insn, dep_src_insn_id) + if dep: + issue_barrier(is_pre_barrier=False, dep=dep) # }}} assignee_temp_var = kernel.temporary_variables.get( insn.get_assignee_var_name()) if assignee_temp_var is not None and assignee_temp_var.is_local: + dep = get_barrier_dependent_in_schedule(kernel, insn.id, schedule) + if level == 0: - assert 
has_dependent_in_schedule( - kernel, insn.id, schedule) + assert dep - if has_dependent_in_schedule(kernel, insn.id, schedule): - issue_barrier(is_pre_barrier=True) + if dep: + issue_barrier(is_pre_barrier=True, dep=dep) result.append(sched_item) owed_barriers.add(insn.id)