From 4e92cab99445cad6d19f48eb566bed01c82936eb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 25 Apr 2017 21:41:47 -0500 Subject: [PATCH 1/2] Sequential loop generation: hoist the non-empty condition out of the loop. This allows for using a more precise implemented domain inside the loop, eliminating the need for redundant checks when the loop is non-empty (closes #64). Also fixes a wraparound issue in the admissible inames finder. --- loopy/codegen/bounds.py | 19 ++++++++++++----- loopy/codegen/loop.py | 47 ++++++++++++++++++++++++++++------------- loopy/symbolic.py | 17 --------------- test/test_loopy.py | 40 +++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 37 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 7cc381f11..ae91abd56 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -65,11 +65,20 @@ def get_usable_inames_for_conditional(kernel, sched_index): # Find our containing subkernel, grab inames for all insns from there. - subkernel_index = sched_index - from loopy.schedule import CallKernel - - while not isinstance(kernel.schedule[subkernel_index], CallKernel): - subkernel_index -= 1 + within_subkernel = False + + for prev_sched_index, sched_item in enumerate(kernel.schedule): + if prev_sched_index == sched_index: + if not within_subkernel: + # Outside all subkernels - use only inames available to device. + return frozenset(result) + + from loopy.schedule import CallKernel, ReturnFromKernel + if isinstance(sched_item, CallKernel): + within_subkernel = True + subkernel_index = prev_sched_index + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False insn_ids_for_subkernel = get_insn_ids_for_block_at( kernel.schedule, subkernel_index) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0110a0609..70ee89d75 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -410,17 +410,21 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # {{{ find implemented loop, build inner code - from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr - impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound) - impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound) + loop_nonempty_params = lbound.params() & ubound.params() + has_nonempty_check = False + + if not loop_nonempty_params.plain_is_universe(): + has_nonempty_check = True + from loopy.symbolic import set_to_cond_expr + loop_nonempty_cond = set_to_cond_expr(loop_nonempty_params) # impl_loop may be overapproximated from loopy.isl_helpers import make_loop_bounds_from_pwaffs impl_loop = make_loop_bounds_from_pwaffs( dom_and_slab.space, loop_iname, - impl_lbound, - impl_ubound) + lbound, + ubound) for moved_iname in moved_inames: # move moved_iname to 'set' dim_type in impl_loop @@ -446,6 +450,16 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.symbolic import pw_aff_to_expr + def add_nonempty_check_maybe(result): + if has_nonempty_check: + from loopy.codegen.result import wrap_in_if + return wrap_in_if( + codegen_state, + [loop_nonempty_cond], + result) + else: + return result + if ubound.is_equal(lbound): # single-trip, generate just a variable assignment, not a loop inner = merge_codegen_results(codegen_state, [ @@ -458,10 +472,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): inner, ]) result.append( + add_nonempty_check_maybe( inner.with_new_ast( codegen_state, astb.ast_block_scope_class( - inner.current_ast(codegen_state)))) + inner.current_ast(codegen_state))))) else: inner_ast = inner.current_ast(codegen_state) @@ -469,15 +484,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.isl_helpers import simplify_pw_aff result.append( - inner.with_new_ast( - codegen_state, - astb.emit_sequential_loop( - codegen_state, loop_iname, kernel.index_dtype, - pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), - pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), - inner_ast))) - - return merge_codegen_results(codegen_state, result) + add_nonempty_check_maybe( + inner.with_new_ast( + codegen_state, + astb.emit_sequential_loop( + codegen_state, loop_iname, kernel.index_dtype, + pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), + pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), + inner_ast)))) + + res = merge_codegen_results(codegen_state, result) + return res # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f1a494f30..3779e37d8 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1190,23 +1190,6 @@ def pw_aff_to_expr(pw_aff, int_ok=False): return expr - -def pw_aff_to_pw_aff_implemented_by_expr(pw_aff): - pieces = pw_aff.get_pieces() - - rest = isl.Set.universe(pw_aff.space.params()) - aff_set, aff = pieces[0] - impl_pw_aff = isl.PwAff.alloc(aff_set, aff) - rest = rest.intersect_params(aff_set.complement()) - - for aff_set, aff in pieces[1:-1]: - impl_pw_aff = impl_pw_aff.union_max( - isl.PwAff.alloc(aff_set, aff)) - rest = rest.intersect_params(aff_set.complement()) - - _, aff = pieces[-1] - return impl_pw_aff.union_max(isl.PwAff.alloc(rest, aff)).coalesce() - # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 4bb6a2726..723be51c7 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2231,6 +2231,46 @@ def test_struct_assignment(ctx_factory): knl(queue, N=200) +def test_hoisting_of_conditionals_for_nonempty_loop(): + # This checks that conditionals that are redundant when the loop is + # non-empty aren't generated by the bounds check code inside the loop. + + param_dict = dict(n=512, m=512, l=512) + dtype = np.float32 + + # group size + by = 16 + bx = 24 + + # define kernel + knl = lp.make_kernel( + "{[i,k,j]: 0<=i= 0 && -17 + m" + " + -16 * k_outer >= 0 ? 15 : -1 + m + -16 * k_outer); ++k_inner)" + "\n" + " acc_k_outer_k_inner = acc_k_outer_k_inner + ") + + assert expected_substr in cgr.device_code() + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 06db894fee70d282da559927a273a50a8c783381 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 25 Apr 2017 21:45:19 -0500 Subject: [PATCH 2/2] remove debugging code --- loopy/codegen/loop.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 70ee89d75..afb23fe4e 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -493,8 +493,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), inner_ast)))) - res = merge_codegen_results(codegen_state, result) - return res + return merge_codegen_results(codegen_state, result) # }}} -- GitLab