# Patch summary (review notes placed ahead of the first "diff --git" header;
# this preamble is not part of any hunk).
#
# NOTE(review): the patch text below has lost its original line breaks and
# indentation -- all hunks run together on two physical lines, and the second
# physical line starts in the middle of the last hunk header. It is kept
# verbatim; the line structure has to be restored before it can be applied.
#
# Files touched:
#   * loopy/codegen/dispatch.py (hunk header names build_loop_nest):
#     before the call to generate_bounds_checks, builds a set `used_inames`
#     by OR-ing in find_used_inames_within(kernel, subsched_index) for each
#     entry of sched_indices_and_cond_inames[0:idx], and passes
#     `current_iname_set & used_inames` where `current_iname_set` was passed
#     before.
#   * loopy/schedule.py: adds find_used_inames_within(kernel, sched_index).
#     It reads kernel.schedule[sched_index]; for an EnterLoop item it keeps
#     the RunInstruction items out of the first value returned by
#     gather_schedule_subloop(kernel.schedule, sched_index); for a
#     RunInstruction item it uses that item alone; for any other item it
#     returns an empty set. In the first two cases it returns a set built from
#     kernel.id_to_insn[insn_id].all_inames() over the selected items.
#   * test/test_matmul.py (hunk header names test_plain_matrix_mul_new_ui):
#     changes the kernel `assumptions` string from "n >= 1" to "n >= 16".
#
# NOTE(review): in find_used_inames_within the final `for` loop reuses the
# name `sched_item` that was bound at the top of the function. The outer value
# is not read again afterwards, so behavior is unaffected, but a distinct loop
# variable name would read more clearly.
# NOTE(review): the added function carries no docstring.
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index 9d1f0261ab21ed0828d9573ba7b0e033a55bfc0e..f9c29a08a1453d27d2a5c9dedfb0b2d13a96c631 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -146,8 +146,14 @@ def build_loop_nest(kernel, sched_index, codegen_state): # Success: found a big enough group of inames for a conditional. # See if there are bounds checks available for that set. + from loopy.schedule import find_used_inames_within + used_inames = set() + for subsched_index, _ in sched_indices_and_cond_inames[0:idx]: + used_inames |= find_used_inames_within(kernel, subsched_index) + from loopy.codegen.bounds import generate_bounds_checks - bounds_checks = generate_bounds_checks(kernel.domain, current_iname_set, + bounds_checks = generate_bounds_checks(kernel.domain, + current_iname_set & used_inames, codegen_state.implemented_domain) else: bounds_checks = [] diff --git a/loopy/schedule.py b/loopy/schedule.py index 55137d1d8b90b97a67eae6d9cf0a2969fc46a4db..5cbf88471067c6cffe4c58b9123e9e6549dda1ea 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -759,6 +759,29 @@ def has_barrier_within(kernel, sched_index): else: return False + + + +def find_used_inames_within(kernel, sched_index): + sched_item = kernel.schedule[sched_index] + + if isinstance(sched_item, EnterLoop): + loop_contents, _ = gather_schedule_subloop( + kernel.schedule, sched_index) + run_insns = [subsched_item + for subsched_item in loop_contents + if isinstance(subsched_item, RunInstruction)] + elif isinstance(sched_item, RunInstruction): + run_insns = [sched_item] + else: + return set() + + result = set() + for sched_item in run_insns: + result.update(kernel.id_to_insn[sched_item.insn_id].all_inames()) + + return result + # }}} diff --git a/test/test_matmul.py b/test/test_matmul.py index 42ebf7e9e8f174ad24fb2d1292cefb2223d6ccb1..71b1d409e1f2d95c225544be1cf92c30f2a5947a 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -211,7 +211,7 @@ def 
test_plain_matrix_mul_new_ui(ctx_factory): lp.ArrayArg("c", dtype, shape=(n, n), order=order), lp.ScalarArg("n", np.int32, approximately=n), ], - name="matmul", assumptions="n >= 1") + name="matmul", assumptions="n >= 16") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1", no_slabs=True)