diff --git a/loopy/check.py b/loopy/check.py
index a8ec1ad35e42410454b36fa38ef5f0a2fbefc0d6..4b2af1b13a00a1bf216528cbe98ea01dafbeb2b8 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -708,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 (insn_impl_domain & assumptions)
                 .project_out_except(insn_inames, [dim_type.set]))
 
+        from loopy.kernel.instruction import BarrierInstruction
+        from loopy.kernel.data import LocalIndexTag
+        if isinstance(insn, BarrierInstruction):
+            # barrier: project out local-id-mapped inames (fixes gitlab issue #94)
+            non_lid_inames = frozenset(
+                [iname for iname in insn_inames if not isinstance(
+                    kernel.iname_to_tag.get(iname), LocalIndexTag)])
+            insn_impl_domain = insn_impl_domain.project_out_except(
+                non_lid_inames, [dim_type.set])
+
         insn_domain = kernel.get_inames_domain(insn_inames)
         insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
         assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
@@ -715,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 .project_out_except(insn_inames, [dim_type.set])
                 .project_out_except(insn_parameters, [dim_type.param]))
 
+        if isinstance(insn, BarrierInstruction):
+            # barrier: same projection on the desired domain (gitlab issue #94)
+            desired_domain = desired_domain.project_out_except(
+                non_lid_inames, [dim_type.set])
+
         insn_impl_domain = (insn_impl_domain
                 .project_out_except(insn_parameters, [dim_type.param]))
         insn_impl_domain, desired_domain = align_two(
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 563964cf04dfbce5d8983b66010863ef36a74ce7..97e3a080668b491c778902aa49a1686b021e0dda 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2265,6 +2265,43 @@ def test_barrier_insertion_near_bottom_of_loop():
     assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
+def test_barrier_in_overridden_get_grid_size_expanded_kernel():
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    # build a simple kernel containing a local barrier
+    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+                   """
+              for i
+                    a[i] = i {id=a}
+                    ... lbarrier {id=barrier}
+                    b[i + 1] = a[i] {nosync=a}
+              end
+                   """,
+                   [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C',
+                                         scope=scopes.LOCAL),
+                    lp.GlobalArg("b", np.float32, shape=(11,), order='C')],
+                   seq_dependencies=True)
+
+    # split 'i' by a vecsize (16) larger than the iname domain (10)
+    vecsize = 16
+    knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
+
+    # artificially expand the grid via overridden_get_grid_sizes_for_insn_ids
+    class GridOverride(object):
+        def __init__(self, clean, vecsize=vecsize):
+            self.clean = clean
+            self.vecsize = vecsize
+
+        def __call__(self, insn_ids, ignore_auto=True):
+            gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+            return gsize, (self.vecsize,)
+
+    knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
+        knl.copy(), vecsize))
+    # make sure we can generate the code
+    lp.generate_code_v2(knl)
+
+
 def test_multi_argument_reduction_type_inference():
     from loopy.type_inference import TypeInferenceMapper
     from loopy.library.reduction import SegmentedSumReductionOperation