diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 921d2538a5068a8c309fe9f412aba588aa6243f2..c54d1fc329a3a8797b17458dc40e489044e9374a 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -364,7 +364,7 @@ class InstructionBase(Record): raise RuntimeError("unexpected value for Instruction.boostable") if self.depends_on: - result.append("deps="+":".join(self.depends_on)) + result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: result.append("nosync="+":".join(self.no_sync_with)) if self.groups: diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 27fad67f8838622d24e276defc7b8c66eaf11306..db993b771d9088f0644c2406704f5b2e4c97ea89 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -136,7 +136,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name, compute_insn_id): + temporary_name, compute_insn_id, compute_read_variables): super(RuleInvocationReplacer, self).__init__(rule_mapping_context) self.subst_name = subst_name @@ -153,6 +153,9 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.temporary_name = temporary_name self.compute_insn_id = compute_insn_id + self.compute_read_variables = compute_read_variables + self.compute_insn_deps = set() + def map_substitution(self, name, tag, arguments, expn_state): if not ( name == self.subst_name @@ -230,6 +233,16 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): insn.depends_on | frozenset([self.compute_insn_id]))) + for dep in insn.depends_on: + if dep == self.compute_insn_id: + continue + + dep_insn = kernel.id_to_insn[dep] + if (frozenset(dep_insn.assignee_var_names()) + & self.compute_read_variables): + self.compute_insn_deps.update( + insn.depends_on - set([self.compute_insn_id])) + new_insns.append(insn) return kernel.copy(instructions=new_insns) @@ -782,12 +795,16 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ substitute rule into expressions in kernel (if within footprint) + from loopy.symbolic import SubstitutionRuleExpander + expander = SubstitutionRuleExpander(kernel.substitutions) + invr = RuleInvocationReplacer(rule_mapping_context, subst_name, subst_tag, within, access_descriptors, abm, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name, compute_insn_id) + temporary_name, compute_insn_id, + compute_read_variables=get_dependencies(expander(compute_expression))) kernel = invr.map_kernel(kernel) kernel = kernel.copy( @@ -796,6 +813,17 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} + # {{{ add dependencies to compute insn + + kernel = kernel.copy( + instructions=[ + insn.copy(depends_on=frozenset(invr.compute_insn_deps)) + if insn.id == compute_insn_id + else insn + for insn in kernel.instructions]) + + # }}} + # {{{ determine inames for compute insn if precompute_outer_inames is None: