diff --git a/doc/tutorial.rst b/doc/tutorial.rst index fa6fcc95088198c28f17b2e383a54eb961419467..6c3175dc044aecef0989d69d77b0d67ed807e957 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -532,9 +532,8 @@ Consider this example: #define lid(N) ((int) get_local_id(N)) ... for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner) + a[16 * i_outer + i_inner] = 0.0f; ... By default, the new, split inames are named *OLD_outer* and *OLD_inner*, @@ -563,10 +562,9 @@ relation to loop nesting. For example, it's perfectly possible to request >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -791,17 +789,18 @@ enabling some cost savings: a[4 * i_outer + 3] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) - if (-1 + n >= 0) - { - a[4 * i_outer] = 0.0f; - if (-2 + -4 * i_outer + n >= 0) - a[4 * i_outer + 1] = 0.0f; - if (-3 + -4 * i_outer + n >= 0) - a[4 * i_outer + 2] = 0.0f; - if (4 + 4 * i_outer + -1 * n == 0) - a[4 * i_outer + 3] = 0.0f; - } + int const i_outer = -1 + n + -1 * (3 * n / 4); + + if (-1 + n >= 0) + { + a[4 * i_outer] = 0.0f; + if (-2 + -4 * i_outer + n >= 0) + a[4 * i_outer + 1] = 0.0f; + if (-3 + -4 * i_outer + n >= 0) + a[4 * i_outer + 2] = 0.0f; + if (4 + 4 * i_outer + -1 * n == 0) + a[4 * i_outer + 3] = 0.0f; + } ... .. }}} diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index fb254bd54480f716de54de96f6aab9a4bb427767..61a825fb9fef8c4d847e3e8f1310814e56e13a0a 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -62,23 +62,33 @@ def get_bounds_checks(domain, check_inames, implemented_domain, # {{{ on which inames may a conditional depend? def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import EnterLoop, LeaveLoop + from loopy.schedule import ( + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag - result = set() + result = find_active_inames_at(kernel, sched_index) + crosses_barrier = has_barrier_within(kernel, sched_index) - for i, sched_item in enumerate(kernel.schedule): - if i >= sched_index: - break - if isinstance(sched_item, EnterLoop): - result.add(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - result.remove(sched_item.iname) + # Find our containing subkernel, grab inames for all insns from there. - for iname in kernel.all_inames(): + subkernel_index = sched_index + from loopy.schedule import CallKernel + + while not isinstance(kernel.schedule[subkernel_index], CallKernel): + subkernel_index -= 1 + + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subkernel_index) + + inames_for_subkernel = ( + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)) + + for iname in inames_for_subkernel: tag = kernel.iname_to_tag.get(iname) - # Parallel inames are always defined, BUT: + # Parallel inames are defined within a subkernel, BUT: # # - local indices may not be used in conditionals that cross barriers. # @@ -87,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): if ( isinstance(tag, ParallelTag) - and not isinstance(tag, LocalIndexTagBase) + and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): result.add(iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 648c3fe6f5b748dcc47de5ac972bb82ce605a9a9..d0d68edecf3f3c64705d8caf2afb4bf0e96697ad 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -386,38 +386,29 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - from loopy.isl_helpers import ( - static_min_of_pw_aff, - static_max_of_pw_aff) - lbound = ( kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) ubound = ( kernel.cache_manager.dim_max( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) - static_lbound = static_min_of_pw_aff( - lbound, - constants_only=False) - static_ubound = static_max_of_pw_aff( - ubound, - constants_only=False) - # }}} # {{{ find implemented slab, build inner code - from loopy.isl_helpers import make_slab_from_bound_pwaffs + from loopy.isl_helpers import make_loop_bound_from_pwaffs # impl_slab may be overapproximated - impl_slab = make_slab_from_bound_pwaffs( + impl_slab = make_loop_bound_from_pwaffs( dom_and_slab.space, - loop_iname, static_lbound, static_ubound) + loop_iname, lbound, ubound) for iname in moved_inames: dt, idx = impl_slab.get_var_dict()[iname] @@ -438,17 +429,21 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): if cmt is not None: result.append(codegen_state.ast_builder.emit_comment(cmt)) - from loopy.symbolic import aff_to_expr - astb = codegen_state.ast_builder - if (static_ubound - static_lbound).plain_is_zero(): + zero = isl.PwAff.zero_on_domain( + isl.LocalSpace.from_space( + lbound.get_space()).domain()) + + from loopy.symbolic import pw_aff_to_expr + + if (ubound - lbound).plain_is_equal(zero): # single-trip, generate just a variable assignment, not a loop result.append(merge_codegen_results(codegen_state, [ astb.emit_initializer( codegen_state, kernel.index_dtype, loop_iname, - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"), is_const=True), astb.emit_blank_line(), inner, @@ -461,7 +456,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state, astb.emit_sequential_loop( codegen_state, loop_iname, kernel.index_dtype, - static_lbound, static_ubound, inner_ast))) + pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index e657beecbc5453ae5b2390da5a958d2fc9a70771..82441eb7f4b083810336916d477740693cceb012 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -102,7 +102,7 @@ def make_slab(space, iname, start, stop): return result -def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): +def make_loop_bound_from_pwaffs(space, iname, lbound, ubound): dt, pos = space.get_var_dict()[iname] iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos) @@ -111,10 +111,10 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): assert iname_pwaff.space == lbound.space assert iname_pwaff.space == ubound.space - return convexify( - iname_pwaff.ge_set(lbound) - & - iname_pwaff.le_set(ubound)) + return ( + iname_pwaff.ge_set(lbound) + & + iname_pwaff.le_set(ubound)) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5b5b2477651c4026cfb4b0618481fbb8b3710728..74bb5c1d1a917fd00edad3bfcd2d5ba241d1ff49 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1142,12 +1142,35 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return pw_aff pieces = pw_aff.get_pieces() + last_expr = aff_to_expr(pieces[-1][1]) - if len(pieces) != 1: - raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances") + # {{{ make exprs from set constraints - (set, aff), = pieces - return aff_to_expr(aff) + from pymbolic.primitives import LogicalAnd, LogicalOr + + def set_to_expr(isl_set): + constrs = [] + for isl_basicset in isl_set.get_basic_sets(): + constrs.append(basic_set_to_expr(isl_basicset)) + return LogicalOr(tuple(constrs)) + + def basic_set_to_expr(isl_basicset): + constrs = [] + for constr in isl_basicset.get_constraints(): + constrs.append(constraint_to_expr(constr)) + return LogicalAnd(tuple(constrs)) + + # }}} + + pairs = [(set_to_expr(constr_set), aff_to_expr(aff)) + for constr_set, aff in pieces[:-1]] + + from pymbolic.primitives import If + expr = last_expr + for condition, then_expr in reversed(pairs): + expr = If(condition, then_expr, expr) + + return expr # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6cfd8f0c7a8471004b8c4417bf63d3edd8a57f65..e28da7453427425e5db1e80f8dbfe80ae911bef0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -703,11 +703,9 @@ class CASTBuilder(ASTBuilderBase): CExpression(self.get_c_expression_to_code_mapper(), result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE @@ -716,12 +714,12 @@ class CASTBuilder(ASTBuilderBase): return For( InlineInitializer( POD(self, iname_dtype, iname), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( Comparison( var(iname), "<=", - aff_to_expr(static_ubound)), + ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 4194d445f3cf74df4eecd748fc674061d2befbd5..f07d5f20a40467eef6a5dfd3fc52eb318a2b64d1 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -469,10 +469,9 @@ class ISPCASTBuilder(CASTBuilder): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE @@ -483,9 +482,9 @@ class ISPCASTBuilder(CASTBuilder): return For( InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( - p.Comparison(var(iname), "<=", aff_to_expr(static_ubound)), + p.Comparison(var(iname), "<=", ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/python.py b/loopy/target/python.py index 036e60ab10d08d4f2825c7a8369260c351413ffa..a348cba83008794ee3d02a61ff90f6d81d1a9322 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -129,6 +129,19 @@ class ExpressionToPythonMapper(StringifyMapper): def map_local_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have local hw axes") + def map_if(self, expr, enclosing_prec): + # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the + # operator precedence hierarchy (right above "or"). + from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 + + return self.parenthesize_if_needed( + "{then} if {cond} else {else_}".format( + then=self.rec(expr.then, PREC_IFTHENELSE), + cond=self.rec(expr.condition, PREC_IFTHENELSE), + else_=self.rec(expr.else_, PREC_IFTHENELSE)), + enclosing_prec, PREC_NONE) + # }}} @@ -223,11 +236,9 @@ class PythonASTBuilderBase(ASTBuilderBase): return Suite def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic.mapper.stringifier import PREC_NONE from genpy import For @@ -235,8 +246,8 @@ class PythonASTBuilderBase(ASTBuilderBase): (iname,), "range(%s, %s + 1)" % ( - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), - ecm(aff_to_expr(static_ubound), PREC_NONE, "i"), + ecm(lbound, PREC_NONE, "i"), + ecm(ubound, PREC_NONE, "i"), ), inner) diff --git a/test/test_isl.py b/test/test_isl.py index 3bd3d221e54df685238cfd1532d2b32662aac99f..f793b1fa99f8768ff4e2fcfaa02aa87119ffcc92 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -44,6 +44,13 @@ def test_aff_to_expr_2(): assert aff_to_expr(x) == (-1)*i0 + 2*(i0 // 2) +def test_pw_aff_to_conditional_expr(): + from loopy.symbolic import pw_aff_to_expr + cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }") + expr = pw_aff_to_expr(cond) + assert str(expr) == "If(i == 0, 0, -1 + i)" + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_loopy.py b/test/test_loopy.py index 347c08d0d1f68544cca628b688767b3054630870..ae41779eb7f1ac2ea09b007ca2fabec927f95a3b 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1601,6 +1601,55 @@ def test_scalars_with_base_storage(ctx_factory): knl(queue, out_host=True) +def test_tight_loop_bounds(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + ["{ [i] : 0 <= i <= 5 }", + "[i] -> { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + out[j] = j + end + end + """, + silenced_warnings="write_race(insn)") + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + evt, (out,) = knl(queue, out_host=True) + + assert (out == np.arange(10)).all() + + +def test_tight_loop_bounds_codegen(): + knl = lp.make_kernel( + ["{ [i] : 0 <= i <= 5 }", + "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + out[j] = j + end + end + """, + silenced_warnings="write_race(insn)", + target=lp.OpenCLTarget()) + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + cgr = lp.generate_code_v2(knl) + #print(cgr.device_code()) + + for_loop = \ + "for (int j = " \ + "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ + "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" + + assert for_loop in cgr.device_code() + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) @@ -1608,4 +1657,4 @@ if __name__ == "__main__": from py.test.cmdline import main main([__file__]) -# vim: foldmethod=marker \ No newline at end of file +# vim: foldmethod=marker