diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..93a34d2334105d60db596e76a11545d9dac8e64d 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -58,7 +58,6 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) - from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -78,15 +77,35 @@ def get_usable_inames_for_conditional(kernel, sched_index): # Outside all subkernels - use only inames available to host. return frozenset(result) - insn_ids_for_subkernel = get_insn_ids_for_block_at( - kernel.schedule, subkernel_index) + insn_ids_for_block = list( + get_insn_ids_for_block_at(kernel.schedule, subkernel_index)) + + # Iterate through the block, and pick out the common set of admissible inames. + # + # The admissible inames must be common to all instructions in the block to + # ensure that constraints from irrelevant inames don't get pulled into the + # conditional. + + if len(insn_ids_for_block) == 0: + return frozenset(result) + + common_admissible_inames = _pick_out_admissible_inames( + kernel, insn_ids_for_block[0], crosses_barrier) - inames_for_subkernel = ( - iname - for insn in insn_ids_for_subkernel - for iname in kernel.insn_inames(insn)) + from itertools import islice + for insn_id in islice(insn_ids_for_block, 1, None): + common_admissible_inames &= kernel.insn_inames(insn_id) + + return frozenset(result | common_admissible_inames) + + +def _pick_out_admissible_inames(kernel, insn_id, crosses_barrier): + # Given an instruction, pick out the set of (parallel) inames on + # which a conditional may depend. + from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag + admissible_inames = [] - for iname in inames_for_subkernel: + for iname in kernel.insn_inames(insn_id): tag = kernel.iname_to_tag.get(iname) # Parallel inames are defined within a subkernel, BUT: @@ -101,9 +120,9 @@ def get_usable_inames_for_conditional(kernel, sched_index): and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): - result.add(iname) + admissible_inames.append(iname) - return frozenset(result) + return set(admissible_inames) # }}} diff --git a/loopy/version.py b/loopy/version.py index 8516ce006bde8b8616172a72a766ec86dfcd44f1..02244f55d0dbf207a4641c3ebf6cc33b536f0421 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v63-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index 21db62610f3a3160bcc3069c3e480e85cc4712f8..921fe8a4a1c1863eb2ee6870c2f415fbb7059f15 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2321,6 +2321,32 @@ def test_inames_conditional_generation(ctx_factory): knl(queue) +def test_inames_conditional_generation_avoids_irrelevant_constraints(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i, loc1, loc2]: 0 <= loc1 <= 1 and 0 <= loc2 <= 2" + " and 0 <= i <= loc1 and 0 <= i <= loc2}", + """ + <>tmp[loc2] = 0 + + for i + tmp[i] = 1 {inames=i:loc2} + end + + out[loc2] = tmp[loc2] + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(loc1="l.0", loc2="l.0")) + knl = lp.set_temporary_scope(knl, "tmp", "local") + + with cl.CommandQueue(ctx) as queue: + evt, (out,) = knl(queue) + + assert all(out.get() == 1) + + def test_kernel_var_name_generator(): knl = lp.make_kernel( "{[i]: 0 <= i <= 10}",