diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 7b73093873e7a5364945f8888723be639f37dcf4..fbe4e4c0390a70080e80889e85984a56add4edf3 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -75,7 +75,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index + 1) + new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index)) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index a9eb44f84d44015fa11ce83afe410e833bb8e214..648c3fe6f5b748dcc47de5ac972bb82ce605a9a9 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -233,17 +233,23 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, from loopy.kernel.data import ( UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) + from loopy.schedule import get_insn_ids_for_block_at + insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) + if hw_inames_left is None: + all_inames_by_insns = set() + for insn_id in insn_ids_for_block: + all_inames_by_insns |= kernel.insn_inames(insn_id) + hw_inames_left = [iname - for iname in kernel.all_inames() + for iname in all_inames_by_insns if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] if not hw_inames_left: return next_func(codegen_state) - from loopy.schedule import get_insn_ids_for_block_at global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - get_insn_ids_for_block_at(kernel.schedule, schedule_index)) + insn_ids_for_block) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 3306d30e4486418b7b4f78e4f1d95a4fd39b45bc..a46500ea515974b00b5bfce4faadfadd010106a4 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -246,14 +246,17 @@ def generate_host_or_device_program(codegen_state, schedule_index): from functools import partial from loopy.codegen.control import build_loop_nest - next_func = partial(build_loop_nest, schedule_index=schedule_index) - if codegen_state.is_generating_device_code: + from loopy.schedule import CallKernel + assert isinstance(codegen_state.kernel.schedule[schedule_index], CallKernel) + from loopy.codegen.loop import set_up_hw_parallel_loops codegen_result = set_up_hw_parallel_loops( - codegen_state, schedule_index, next_func=next_func) + codegen_state, schedule_index, + next_func=partial(build_loop_nest, + schedule_index=schedule_index + 1)) else: - codegen_result = next_func(codegen_state) + codegen_result = build_loop_nest(codegen_state, schedule_index) codegen_result = merge_codegen_results( codegen_state,