diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 8ad4a08caf226e14a3e524d6b3e29b2f0819ca5a..57255c179ea484c68b9c2ad9ef9be81607bff630 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -146,13 +146,12 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): def intersect_kernel_with_slab(kernel, slab, iname): - hdi = kernel.get_home_domain_index(iname) - home_domain = kernel.domains[hdi] - new_domains = kernel.domains[:] - new_domains[hdi] = home_domain & isl.align_spaces(slab, home_domain) + from loopy.kernel.tools import DomainChanger - return kernel.copy(domains=new_domains, - get_grid_sizes=kernel.get_grid_sizes) + domch = DomainChanger(kernel, (iname,)) + orig_domain = domch.get_original_domain() + orig_domain, slab = isl.align_two(orig_domain, slab) + return domch.get_kernel_with(orig_domain & slab) # {{{ hw-parallel loop @@ -302,35 +301,30 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): dt, idx, 1) _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - lbound = kernel.cache_manager.dim_min( - dom_and_slab, loop_iname_idx).coalesce() - ubound = kernel.cache_manager.dim_max( - dom_and_slab, loop_iname_idx).coalesce() from loopy.isl_helpers import ( static_min_of_pw_aff, static_max_of_pw_aff) - lbound = static_min_of_pw_aff(lbound, + static_lbound = static_min_of_pw_aff( + kernel.cache_manager.dim_min( + dom_and_slab, loop_iname_idx).coalesce(), constants_only=False) - ubound = static_max_of_pw_aff(ubound, + static_ubound = static_max_of_pw_aff( + kernel.cache_manager.dim_max( + dom_and_slab, loop_iname_idx).coalesce(), constants_only=False) # }}} # {{{ find implemented slab, build inner code - from loopy.isl_helpers import iname_rel_aff - impl_slab = ( - isl.BasicSet.universe(dom_and_slab.space) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(dom_and_slab.space, - loop_iname, ">=", lbound))) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(dom_and_slab.space, - loop_iname, "<=", ubound)))) + from loopy.isl_helpers import make_slab_from_bound_pwaffs + + # impl_slab may be overapproximated + impl_slab = make_slab_from_bound_pwaffs( + dom_and_slab.space, + loop_iname, static_lbound, static_ubound) for iname in moved_inames: dt, idx = impl_slab.get_var_dict()[iname] @@ -340,8 +334,10 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): new_codegen_state = codegen_state.intersect(impl_slab) - inner = build_loop_nest(kernel, sched_index+1, - new_codegen_state) + inner = build_loop_nest( + intersect_kernel_with_slab( + kernel, slab, iname), + sched_index+1, new_codegen_state) # }}} @@ -352,20 +348,24 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from cgen import Initializer, POD, Const, Line, For from loopy.symbolic import aff_to_expr - if (ubound - lbound).plain_is_zero(): + if (static_ubound - static_lbound).plain_is_zero(): # single-trip, generate just a variable assignment, not a loop result.append(gen_code_block([ Initializer(Const(POD(kernel.index_dtype, loop_iname)), - ccm(aff_to_expr(lbound), "i")), + ccm(aff_to_expr(static_lbound), "i")), Line(), inner, ])) else: from loopy.codegen import wrap_in + from pyopencl.tools import dtype_to_ctype + result.append(wrap_in(For, - "int %s = %s" % (loop_iname, ccm(aff_to_expr(lbound), "i")), - "%s <= %s" % (loop_iname, ccm(aff_to_expr(ubound), "i")), + "%s %s = %s" + % (dtype_to_ctype(kernel.index_dtype), + loop_iname, ccm(aff_to_expr(static_lbound), "i")), + "%s <= %s" % (loop_iname, ccm(aff_to_expr(static_ubound), "i")), "++%s" % loop_iname, inner)) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 587b1be383c0dc580f4e2d9952aa5c695fec8b0a..201aee16ce3571070aaeeabdce4e6c8b726df7cd 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -152,6 +152,21 @@ def make_slab(space, iname, start, stop): return result +def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): + dt, pos = space.get_var_dict()[iname] + iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos) + + iname_pwaff, lbound = isl.align_two(iname_pwaff, lbound) + iname_pwaff, ubound = isl.align_two(iname_pwaff, ubound) + assert iname_pwaff.space == lbound.space + assert iname_pwaff.space == ubound.space + + return convexify( + iname_pwaff.ge_set(lbound) + & + iname_pwaff.le_set(ubound)) + + def iname_rel_aff(space, iname, rel, aff): """*aff*'s domain space is allowed to not match *space*.""" @@ -198,7 +213,10 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) if context is not None: reference = reference.intersect(context) + # {{{ find bounds that are also global bounds + for set, candidate_aff in pieces: + # gist can be time-consuming, try without first for use_gist in [False, True]: if use_gist: candidate_aff = candidate_aff.gist(set) @@ -209,6 +227,8 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) if reference <= set_method(pw_aff, candidate_aff): return candidate_aff + # }}} + raise ValueError("a static %s was not found for PwAff '%s'" % (what, pw_aff)) diff --git a/test/test_loopy.py b/test/test_loopy.py index 80ff20c5145c3cd7edf9c4117520118820562044..7957fc9cad3e55d295956df5efbe0ee1159ce319 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1624,7 +1624,7 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): ref_knl = knl - for outer_tag in ["for", "unr", "l.0"]: + for outer_tag in ["for", "g.0"]: knl = ref_knl knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr", outer_tag=outer_tag) @@ -1639,7 +1639,6 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): evt, _ = knl(queue, a=a_knl) assert (a_ref == a_knl).get().all() - 1/0 def test_multiple_writes_to_local_temporary(ctx_factory):