diff --git a/MEMO b/MEMO index d6300e1dc6cb51351d31cd5e394365dbb673e000..ebd410f4f9f333041fafb805a743cc18b6a87b56 100644 --- a/MEMO +++ b/MEMO @@ -42,11 +42,25 @@ To-do - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) +- "No schedule found" debug help: + + - Find longest dead-end + - Automatically report on what hinders progress there + +- When duplicating, use iname aliases to relieve burden on isl + +- Differentiate ilp.unr from ilp.seq + +- Expose iname-duplicate-and-rename as a primitive. + - Fix all tests Future ideas ^^^^^^^^^^^^ +- How is intra-instruction ordering of ILP loops going to be determined? + (taking into account that it could vary even per-instruction?) + - Barriers for data exchanged via global vars? - Float4 joining on fetch/store? diff --git a/doc/reference.rst b/doc/reference.rst index c8844f4b82b5caf577bfa6b36680fd2cabb78b5e..71062604e2e2d6fe8462e327b07bbf3e0444a520 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -114,6 +114,11 @@ Precomputation and Prefetching Uses :func:`extract_subst` and :func:`precompute`. +Manipulating Reductions +----------------------- + +.. 
autofunction:: realize_reduction + Finishing up ------------ diff --git a/loopy/__init__.py b/loopy/__init__.py index 912532ae104c3566c8b792cbb84a33db9dc3e4c7..94616f25ddab2f60d6a1b2170ec728268d71f4be 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -25,7 +25,7 @@ from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel from loopy.subst import extract_subst, apply_subst from loopy.cse import precompute -from loopy.preprocess import preprocess_kernel +from loopy.preprocess import preprocess_kernel, realize_reduction from loopy.schedule import generate_loop_schedules from loopy.codegen import generate_code from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_ref diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index fb94fbecaf819537d829249197bf1c9756914207..d1ffea7a395722c477b4fca0d6771adde4924443 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -149,7 +149,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): # }}} -# {{{ parallel loop +# {{{ hw-parallel loop def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None): from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a9e39c9fbd5581b5349833a40ef07888fe2a4345..43e93ddd0e2e77e8c4cd57b3efa8f4b71c74cd64 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -110,7 +110,25 @@ def duplicate_reduction_inames(kernel): # {{{ rewrite reduction to imperative form -def realize_reduction(kernel): +def realize_reduction(kernel, insn_id_filter=None): + """Rewrites reductions into their imperative form. With *insn_id_filter* specified, + operate only on the instruction with an instruction id matching insn_id_filter. 
+ + If *insn_id_filter* is given, only the outermost level of reductions will be + expanded; inner reductions will be left alone (because they end up in a new + instruction with a different ID, which doesn't match the filter). + + If *insn_id_filter* is not given, all reductions in all instructions will + be realized. + + This routine also implicitly performs (global) reduction iname duplication, + if requested by '@' prefixes on any reduction iname. + """ + + # Reduction iname duplication needs to happen beforehand, and it is + # idempotent. So just call it now. + kernel = duplicate_reduction_inames(kernel) + new_insns = [] new_temporary_variables = kernel.temporary_variables.copy() @@ -130,7 +148,9 @@ def realize_reduction(kernel): ilp_iname_lengths = [] for iname in ilp_inames: - # original kernel ok here--we're not messing with inames + # Using the original kernel is ok here. Nothing in realize_reduction + # messes with inames. This is useful because it takes advantage + # of bounds caching. bounds = kernel.get_iname_bounds(iname) from loopy.symbolic import pw_aff_to_expr @@ -206,6 +226,10 @@ def realize_reduction(kernel): insn = insn_queue.pop(0) + if insn_id_filter is not None and insn.id != insn_id_filter: + new_insns.append(insn) + continue + # Run reduction expansion. 
new_expression = cb_mapper(insn.expression) @@ -681,9 +705,8 @@ def preprocess_kernel(kernel): from loopy.subst import apply_subst kernel = apply_subst(kernel) - kernel = mark_local_temporaries(kernel) - kernel = duplicate_reduction_inames(kernel) kernel = realize_reduction(kernel) + kernel = mark_local_temporaries(kernel) kernel = assign_automatic_axes(kernel) kernel = add_boostability_and_automatic_dependencies(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/schedule.py b/loopy/schedule.py index 64204f20da7f76800a17ce68edef2a04626d1384..9bdf9ca5c8fb6bf285a631509eea4fbd4850440d 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -267,6 +267,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # {{{ decide about debug mode debug_mode = False + #if len(schedule) == 15: + #debug_mode = True + if debug is not None: if (debug.debug_length is not None and len(schedule) >= debug.debug_length): @@ -313,6 +316,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b # nested loop without harm. if allow_boost: + # Note that the inames in 'insn.boostable_into' necessarily won't + # be contained in 'want'. have = have - insn.boostable_into if want != have: @@ -330,10 +335,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b if (not schedule_now and have <= want): reachable_insn_ids.add(insn_id) - else: - if debug_mode: - print (" '%s' also not reachable because it won't work under '%s'" - % (insn.id, ",".join(have-want))) # }}}