diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ebdd2dd2975ada3622cbd0525372646b4c2d0be3..31dafac0f7b819a174d29df70f1cdee4885b869c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -540,9 +540,8 @@ Consider this example: #define lid(N) ((int) get_local_id(N)) ... for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-16 + n + -16 * i_outer >= 0 ? 15 : -1 + n + -16 * i_outer); ++i_inner) + a[16 * i_outer + i_inner] = 0.0f; ... By default, the new, split inames are named *OLD_outer* and *OLD_inner*, @@ -571,10 +570,9 @@ relation to loop nesting. For example, it's perfectly possible to request >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - for (int i_inner = 0; i_inner <= 15; ++i_inner) - if (-1 + -1 * i_inner + n >= 0) - for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[16 * i_outer + i_inner] = 0.0f; + for (int i_inner = 0; i_inner <= (-17 + n >= 0 ? 15 : -1 + n); ++i_inner) + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[16 * i_outer + i_inner] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -799,17 +797,18 @@ enabling some cost savings: a[4 * i_outer + 3] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) - if (-1 + n >= 0) - { - a[4 * i_outer] = 0.0f; - if (-2 + -4 * i_outer + n >= 0) - a[4 * i_outer + 1] = 0.0f; - if (-3 + -4 * i_outer + n >= 0) - a[4 * i_outer + 2] = 0.0f; - if (4 + 4 * i_outer + -1 * n == 0) - a[4 * i_outer + 3] = 0.0f; - } + int const i_outer = -1 + n + -1 * (3 * n / 4); + + if (-1 + n >= 0) + { + a[4 * i_outer] = 0.0f; + if (-2 + -4 * i_outer + n >= 0) + a[4 * i_outer + 1] = 0.0f; + if (-3 + -4 * i_outer + n >= 0) + a[4 * i_outer + 2] = 0.0f; + if (4 + 4 * i_outer + -1 * n == 0) + a[4 * i_outer + 3] = 0.0f; + } ... .. }}} diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index fb254bd54480f716de54de96f6aab9a4bb427767..61a825fb9fef8c4d847e3e8f1310814e56e13a0a 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -62,23 +62,33 @@ def get_bounds_checks(domain, check_inames, implemented_domain, # {{{ on which inames may a conditional depend? def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import EnterLoop, LeaveLoop + from loopy.schedule import ( + find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag - result = set() + result = find_active_inames_at(kernel, sched_index) + crosses_barrier = has_barrier_within(kernel, sched_index) - for i, sched_item in enumerate(kernel.schedule): - if i >= sched_index: - break - if isinstance(sched_item, EnterLoop): - result.add(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - result.remove(sched_item.iname) + # Find our containing subkernel, grab inames for all insns from there. - for iname in kernel.all_inames(): + subkernel_index = sched_index + from loopy.schedule import CallKernel + + while not isinstance(kernel.schedule[subkernel_index], CallKernel): + subkernel_index -= 1 + + insn_ids_for_subkernel = get_insn_ids_for_block_at( + kernel.schedule, subkernel_index) + + inames_for_subkernel = ( + iname + for insn in insn_ids_for_subkernel + for iname in kernel.insn_inames(insn)) + + for iname in inames_for_subkernel: tag = kernel.iname_to_tag.get(iname) - # Parallel inames are always defined, BUT: + # Parallel inames are defined within a subkernel, BUT: # # - local indices may not be used in conditionals that cross barriers. # @@ -87,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): if ( isinstance(tag, ParallelTag) - and not isinstance(tag, LocalIndexTagBase) + and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): result.add(iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 648c3fe6f5b748dcc47de5ac972bb82ce605a9a9..3d3095d535e67161ec833549cb4b1aa0dedd1eef 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -386,48 +386,39 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] - from loopy.isl_helpers import ( - static_min_of_pw_aff, - static_max_of_pw_aff) - lbound = ( kernel.cache_manager.dim_min( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) ubound = ( kernel.cache_manager.dim_max( dom_and_slab, loop_iname_idx) .gist(kernel.assumptions) + .gist(dom_and_slab.params()) .coalesce()) - static_lbound = static_min_of_pw_aff( - lbound, - constants_only=False) - static_ubound = static_max_of_pw_aff( - ubound, - constants_only=False) - # }}} - # {{{ find implemented slab, build inner code + # {{{ find implemented loop, build inner code - from loopy.isl_helpers import make_slab_from_bound_pwaffs + from loopy.isl_helpers import make_loop_bounds_from_pwaffs - # impl_slab may be overapproximated - impl_slab = make_slab_from_bound_pwaffs( + # impl_loop may be overapproximated + impl_loop = make_loop_bounds_from_pwaffs( dom_and_slab.space, - loop_iname, static_lbound, static_ubound) + loop_iname, lbound, ubound) for iname in moved_inames: - dt, idx = impl_slab.get_var_dict()[iname] - impl_slab = impl_slab.move_dims( - dim_type.set, impl_slab.dim(dim_type.set), + dt, idx = impl_loop.get_var_dict()[iname] + impl_loop = impl_loop.move_dims( + dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1) new_codegen_state = ( codegen_state - .intersect(impl_slab) + .intersect(impl_loop) .copy(kernel=intersect_kernel_with_slab( kernel, slab, iname))) @@ -438,17 +429,21 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): if cmt is not None: result.append(codegen_state.ast_builder.emit_comment(cmt)) - from loopy.symbolic import aff_to_expr - astb = codegen_state.ast_builder - if (static_ubound - static_lbound).plain_is_zero(): + zero = isl.PwAff.zero_on_domain( + isl.LocalSpace.from_space( + lbound.get_space()).domain()) + + from loopy.symbolic import pw_aff_to_expr + + if (ubound - lbound).plain_is_equal(zero): # single-trip, generate just a variable assignment, not a loop result.append(merge_codegen_results(codegen_state, [ astb.emit_initializer( codegen_state, kernel.index_dtype, loop_iname, - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), + ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"), is_const=True), astb.emit_blank_line(), inner, @@ -461,7 +456,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state, astb.emit_sequential_loop( codegen_state, loop_iname, kernel.index_dtype, - static_lbound, static_ubound, inner_ast))) + pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index e657beecbc5453ae5b2390da5a958d2fc9a70771..602830de38e457c5ff4a55d7685dc346a7b4de35 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -102,7 +102,7 @@ def make_slab(space, iname, start, stop): return result -def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): +def make_loop_bounds_from_pwaffs(space, iname, lbound, ubound): dt, pos = space.get_var_dict()[iname] iname_pwaff = isl.PwAff.var_on_domain(space, dt, pos) @@ -111,10 +111,10 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): assert iname_pwaff.space == lbound.space assert iname_pwaff.space == ubound.space - return convexify( - iname_pwaff.ge_set(lbound) - & - iname_pwaff.le_set(ubound)) + return ( + iname_pwaff.ge_set(lbound) + & + iname_pwaff.le_set(ubound)) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 027e98ae51c7992f98c7ab22f6574259bbca00c8..f93ea891fd7f1aa420c8c109c31b50fd8305a9e3 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -72,6 +72,24 @@ def prepare_for_caching(kernel): # }}} +# {{{ check for writes to predicates + +def check_for_writes_to_predicates(kernel): + from loopy.symbolic import get_dependencies + for insn in kernel.instructions: + pred_vars = ( + frozenset.union( + *(get_dependencies(pred) for pred in insn.predicates)) + if insn.predicates else frozenset()) + written_pred_vars = frozenset(insn.assignee_var_names()) & pred_vars + if written_pred_vars: + raise LoopyError("In instruction '%s': may not write to " + "variable(s) '%s' involved in the instruction's predicates" + % (insn.id, ", ".join(written_pred_vars))) + +# }}} + + # {{{ check reduction iname uniqueness def check_reduction_iname_uniqueness(kernel): @@ -876,6 +894,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_unknown_types(kernel, expect_completion=False) + check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) from loopy.kernel.creation import apply_single_writer_depencency_heuristic diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5b5b2477651c4026cfb4b0618481fbb8b3710728..74bb5c1d1a917fd00edad3bfcd2d5ba241d1ff49 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1142,12 +1142,35 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return pw_aff pieces = pw_aff.get_pieces() + last_expr = aff_to_expr(pieces[-1][1]) - if len(pieces) != 1: - raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances") + # {{{ make exprs from set constraints - (set, aff), = pieces - return aff_to_expr(aff) + from pymbolic.primitives import LogicalAnd, LogicalOr + + def set_to_expr(isl_set): + constrs = [] + for isl_basicset in isl_set.get_basic_sets(): + constrs.append(basic_set_to_expr(isl_basicset)) + return LogicalOr(tuple(constrs)) + + def basic_set_to_expr(isl_basicset): + constrs = [] + for constr in isl_basicset.get_constraints(): + constrs.append(constraint_to_expr(constr)) + return LogicalAnd(tuple(constrs)) + + # }}} + + pairs = [(set_to_expr(constr_set), aff_to_expr(aff)) + for constr_set, aff in pieces[:-1]] + + from pymbolic.primitives import If + expr = last_expr + for condition, then_expr in reversed(pairs): + expr = If(condition, then_expr, expr) + + return expr # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index aa54fd42365b2d91452b966f65529ba80ab492a8..513386a58d5b933c20dfc95879863d00e1f4dcb4 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -337,7 +337,7 @@ class CASTBuilder(ASTBuilderBase): index_dtype=kernel.index_dtype) decl = self.wrap_global_constant( self.get_temporary_decl( - kernel, schedule_index, tv, + codegen_state, schedule_index, tv, decl_info)) if tv.initializer is not None: @@ -706,11 +706,9 @@ class CASTBuilder(ASTBuilderBase): CExpression(self.get_c_expression_to_code_mapper(), result)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic import var from pymbolic.primitives import Comparison from pymbolic.mapper.stringifier import PREC_NONE @@ -719,12 +717,12 @@ class CASTBuilder(ASTBuilderBase): return For( InlineInitializer( POD(self, iname_dtype, iname), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( Comparison( var(iname), "<=", - aff_to_expr(static_ubound)), + ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index ffa2c4e629f25eb4c79966320d8e23202880b103..bd5a74782dc5dce7bf82985bea3a7c77404d9d26 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -148,6 +148,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ValueArg if isinstance(arg, ValueArg) and self.fortran_abi: postproc = lambda x: x[0] # noqa + elif expr.name in self.kernel.temporary_variables: + temporary = self.kernel.temporary_variables[expr.name] + if temporary.base_storage: + postproc = lambda x: x[0] # noqa result = self.kernel.mangle_symbol(self.codegen_state.ast_builder, expr.name) if result is not None: @@ -216,12 +220,14 @@ class ExpressionToCExpressionMapper(IdentityMapper): elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: - if isinstance(ary, GlobalArg) or isinstance(ary, ConstantArg): + if (isinstance(ary, (ConstantArg, GlobalArg)) or + (isinstance(ary, TemporaryVariable) and ary.base_storage)): # unsubscripted global args are pointers result = var(access_info.array_name)[0] else: # unsubscripted temp vars are scalars + # (unless they use base_storage) result = var(access_info.array_name) else: diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 230b284c5b04a61f5be566eeff3a52abf802a667..2c48fb902e746780599f039a44b2d0a5ea787b88 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -315,7 +315,7 @@ class ISPCASTBuilder(CASTBuilder): ecm = self.get_expression_to_code_mapper(codegen_state) temp_var_decl = ArrayOf( temp_var_decl, - ecm(p.flattened_product(decl_info.shape), + ecm(p.flattened_product(shape), prec=PREC_NONE, type_context="i")) return temp_var_decl @@ -469,23 +469,22 @@ class ISPCASTBuilder(CASTBuilder): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr from loopy.target.c import POD from pymbolic.mapper.stringifier import PREC_NONE - from cgen import For, Initializer + from cgen import For, InlineInitializer from cgen.ispc import ISPCUniform return For( - Initializer( + InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), - ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + ecm(lbound, PREC_NONE, "i")), ecm( - p.Comparison(var(iname), "<=", aff_to_expr(static_ubound)), + p.Comparison(var(iname), "<=", ubound), PREC_NONE, "i"), "++%s" % iname, inner) diff --git a/loopy/target/python.py b/loopy/target/python.py index 036e60ab10d08d4f2825c7a8369260c351413ffa..a348cba83008794ee3d02a61ff90f6d81d1a9322 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -129,6 +129,19 @@ class ExpressionToPythonMapper(StringifyMapper): def map_local_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have local hw axes") + def map_if(self, expr, enclosing_prec): + # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the + # operator precedence hierarchy (right above "or"). + from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 + + return self.parenthesize_if_needed( + "{then} if {cond} else {else_}".format( + then=self.rec(expr.then, PREC_IFTHENELSE), + cond=self.rec(expr.condition, PREC_IFTHENELSE), + else_=self.rec(expr.else_, PREC_IFTHENELSE)), + enclosing_prec, PREC_NONE) + # }}} @@ -223,11 +236,9 @@ class PythonASTBuilderBase(ASTBuilderBase): return Suite def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from loopy.symbolic import aff_to_expr - from pymbolic.mapper.stringifier import PREC_NONE from genpy import For @@ -235,8 +246,8 @@ class PythonASTBuilderBase(ASTBuilderBase): (iname,), "range(%s, %s + 1)" % ( - ecm(aff_to_expr(static_lbound), PREC_NONE, "i"), - ecm(aff_to_expr(static_ubound), PREC_NONE, "i"), + ecm(lbound, PREC_NONE, "i"), + ecm(ubound, PREC_NONE, "i"), ), inner) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index bf6a6e1b98e6abbc4b483383f4bb9cf8b06bed1a..c35b5064365293ac78cdd01af537c9d28bd67193 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -79,8 +79,8 @@ __doc__ = """ def set_loop_priority(kernel, loop_priority): from warnings import warn - warn("set_loop_priority is deprecated. Use prioritize_loops instead." - "Attention: A call to set_loop_priority will overwrite any previously" + warn("set_loop_priority is deprecated. Use prioritize_loops instead. " + "Attention: A call to set_loop_priority will overwrite any previously " "set priorities!", DeprecationWarning, stacklevel=2) if isinstance(loop_priority, str): diff --git a/test/test_apps.py b/test/test_apps.py index 790a44f6acac72e4fa6fe04a45f32813e6204bb9..9eab3fdb1fbc152b65344362d39766793d372d90 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -502,6 +502,112 @@ def test_lbm(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20}) +def test_fd_demo(): + knl = lp.make_kernel( + "{[i,j]: 0<=i,j 1: exec(sys.argv[1]) diff --git a/test/test_isl.py b/test/test_isl.py index 3bd3d221e54df685238cfd1532d2b32662aac99f..f793b1fa99f8768ff4e2fcfaa02aa87119ffcc92 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -44,6 +44,13 @@ def test_aff_to_expr_2(): assert aff_to_expr(x) == (-1)*i0 + 2*(i0 // 2) +def test_pw_aff_to_conditional_expr(): + from loopy.symbolic import pw_aff_to_expr + cond = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }") + expr = pw_aff_to_expr(cond) + assert str(expr) == "If(i == 0, 0, -1 + i)" + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_loopy.py b/test/test_loopy.py index 1d29fd4dc2c0dec21d7d2291d74003018dff5587..9f1b6ea8e93a62f147aaa67f70b563137766c404 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1615,18 +1615,68 @@ def test_header_extract(ctx_factory): oclknl.target = lp.PyOpenCLTarget() assert lp.generate_header(oclknl) == '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict T);' -def test_base_storage_decl(): +def test_scalars_with_base_storage(ctx_factory): + """ Regression test for !50 """ + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + knl = lp.make_kernel( - "{ [i]: 0<=i { [j] : 2 * i - 2 < j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + out[j] = j + end + end + """, + silenced_warnings="write_race(insn)") + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + evt, (out,) = knl(queue, out_host=True) + + assert (out == np.arange(10)).all() + + +def test_tight_loop_bounds_codegen(): + knl = lp.make_kernel( + ["{ [i] : 0 <= i <= 5 }", + "[i] -> { [j] : 2 * i - 2 <= j <= 2 * i and 0 <= j <= 9 }"], + """ + for i + for j + out[j] = j + end + end + """, + silenced_warnings="write_race(insn)", + target=lp.OpenCLTarget()) + + knl = lp.split_iname(knl, "i", 5, inner_tag="l.0", outer_tag="g.0") + + cgr = lp.generate_code_v2(knl) + #print(cgr.device_code()) - lp.generate_code_v2(knl) + for_loop = \ + "for (int j = " \ + "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ + "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" + assert for_loop in cgr.device_code() if __name__ == "__main__": if len(sys.argv) > 1: diff --git a/test/test_reduction.py b/test/test_reduction.py index 3a68fbd947c7e6422123ab3f068c2a9f3aeeb6a8..fec72138da24d41cc14da357ebe596b189b5d2f4 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -393,112 +393,6 @@ def test_double_sum_made_unique(ctx_factory): assert b.get() == ref -def test_fd_demo(): - knl = lp.make_kernel( - "{[i,j]: 0<=i,j 1: exec(sys.argv[1])