diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index bf435d3fe08d022790bf31a4d583d4923f0bfeff..1ad108b417fcae130c63ddbe9248cb46c9e6812a 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb): from pymbolic.imperative.transform import \ fuse_instruction_streams_with_unique_ids - new_instructions, _ = fuse_instruction_streams_with_unique_ids( - knla.instructions, knlb.instructions) + new_instructions, old_b_id_to_new_b_id = \ + fuse_instruction_streams_with_unique_ids( + knla.instructions, knlb.instructions) # {{{ fuse assumptions @@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb): "target", knla.target, knlb.target), - options=knla.options) + options=knla.options), old_b_id_to_new_b_id # }}} -def fuse_kernels(kernels, suffixes=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None): :arg suffixes: If given, must be a list of strings of a length matching that of *kernels*. This will be used to disambiguate the names of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. The components of the kernels are fused as follows: @@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None): * The resulting kernel will contain all instructions from each entry of *kernels*. Clashing instruction IDs will be renamed to ensure uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 """ kernels = list(kernels) + if data_flow is None: + data_flow = [] + if suffixes: suffixes = list(suffixes) if len(suffixes) != len(kernels): @@ -356,9 +369,44 @@ def fuse_kernels(kernels, suffixes=None): # }}} - result = kernels.pop(0) - while kernels: - result = _fuse_two_kernels(result, kernels.pop(0)) + kernel_insn_ids = [] + result = None + + for knlb in kernels: + if result is None: + result = knlb + kernel_insn_ids.append([ + insn.id for insn in knlb.instructions]) + else: + result, old_b_id_to_new_b_id = _fuse_two_kernels( + knla=result, + knlb=knlb) + + kernel_insn_ids.append([ + old_b_id_to_new_b_id[insn.id] + for insn in knlb.instructions]) + + # {{{ realize data_flow dependencies + + id_to_insn = result.id_to_insn.copy() + + for var_name, from_kernel, to_kernel in data_flow: + from_writer_ids = frozenset( + insn_id + for insn_id in kernel_insn_ids[from_kernel] + if var_name in id_to_insn[insn_id].assignee_var_names()) + + for insn_id in kernel_insn_ids[to_kernel]: + insn = id_to_insn[insn_id] + if var_name in insn.read_dependency_names(): + insn = insn.copy(depends_on=insn.depends_on | from_writer_ids) + + id_to_insn[insn_id] = insn + + result = result.copy( + instructions=list(six.itervalues(id_to_insn))) + + # }}} return result diff --git a/test/test_loopy.py b/test/test_loopy.py index 1fed3289aac1c184b2267e3425aed2d8023f9a03..48ed2e2e70a109d6120e177f023bfa5f6267fe5e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2269,7 +2269,10 @@ def test_finite_difference_expr_subst(ctx_factory): lp.GlobalArg("u", shape="n+2"), ]) - fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl]) + fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], + data_flow=[ + ("f", 1, 0) + ]) fused_knl = lp.set_options(fused_knl, write_cl=True) evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))