diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0110a06095fa0bd690045f050136027d7bed3a28..afb23fe4e3af4b9ac8191ed5d7cbda14bc287dd9 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -410,17 +410,21 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # {{{ find implemented loop, build inner code - from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr - impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound) - impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound) + loop_nonempty_params = lbound.params() & ubound.params() + has_nonempty_check = False + + if not loop_nonempty_params.plain_is_universe(): + has_nonempty_check = True + from loopy.symbolic import set_to_cond_expr + loop_nonempty_cond = set_to_cond_expr(loop_nonempty_params) # impl_loop may be overapproximated from loopy.isl_helpers import make_loop_bounds_from_pwaffs impl_loop = make_loop_bounds_from_pwaffs( dom_and_slab.space, loop_iname, - impl_lbound, - impl_ubound) + lbound, + ubound) for moved_iname in moved_inames: # move moved_iname to 'set' dim_type in impl_loop @@ -446,6 +450,16 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.symbolic import pw_aff_to_expr + def add_nonempty_check_maybe(result): + if has_nonempty_check: + from loopy.codegen.result import wrap_in_if + return wrap_in_if( + codegen_state, + [loop_nonempty_cond], + result) + else: + return result + if ubound.is_equal(lbound): # single-trip, generate just a variable assignment, not a loop inner = merge_codegen_results(codegen_state, [ @@ -458,10 +472,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): inner, ]) result.append( + add_nonempty_check_maybe( inner.with_new_ast( codegen_state, astb.ast_block_scope_class( - inner.current_ast(codegen_state)))) + inner.current_ast(codegen_state))))) else: inner_ast = inner.current_ast(codegen_state) @@ -469,13 +484,14 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.isl_helpers import simplify_pw_aff result.append( - inner.with_new_ast( - codegen_state, - astb.emit_sequential_loop( - codegen_state, loop_iname, kernel.index_dtype, - pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), - pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), - inner_ast))) + add_nonempty_check_maybe( + inner.with_new_ast( + codegen_state, + astb.emit_sequential_loop( + codegen_state, loop_iname, kernel.index_dtype, + pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), + pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), + inner_ast)))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f1a494f30d469511817d204c0476ff79abe00e3b..3779e37d83e196260ad60bb3f8baa09a285bcecc 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1190,23 +1190,6 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return expr - -def pw_aff_to_pw_aff_implemented_by_expr(pw_aff): - pieces = pw_aff.get_pieces() - - rest = isl.Set.universe(pw_aff.space.params()) - aff_set, aff = pieces[0] - impl_pw_aff = isl.PwAff.alloc(aff_set, aff) - rest = rest.intersect_params(aff_set.complement()) - - for aff_set, aff in pieces[1:-1]: - impl_pw_aff = impl_pw_aff.union_max( - isl.PwAff.alloc(aff_set, aff)) - rest = rest.intersect_params(aff_set.complement()) - - _, aff = pieces[-1] - return impl_pw_aff.union_max(isl.PwAff.alloc(rest, aff)).coalesce() - # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 21db62610f3a3160bcc3069c3e480e85cc4712f8..c1739e151e69ad7b63fd1c0a37bc392f077cb12e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2298,6 +2298,45 @@ def test_struct_assignment(ctx_factory): knl(queue, N=200) +def test_hoisting_of_conditionals_for_nonempty_loop(): + # This checks that conditionals that are redundant when the loop is + # non-empty aren't generated by the bounds check code inside the loop. + + dtype = np.float32 + + # group size + by = 16 + bx = 24 + + # define kernel + knl = lp.make_kernel( + "{[i,k,j]: 0<=i= 0 && -17 + m" + " + -16 * k_outer >= 0 ? 15 : -1 + m + -16 * k_outer); ++k_inner)" + "\n" + " acc_k_outer_k_inner = acc_k_outer_k_inner + ") + + assert expected_substr in cgr.device_code() + + def test_inames_conditional_generation(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel(