Skip to content
Snippets Groups Projects
Commit 8ba555dc authored by Andreas Klöckner
Browse files

Expose reduction realization to the user. Add more notes.

parent c8064886
No related branches found
No related tags found
No related merge requests found
......@@ -42,11 +42,25 @@ To-do
- What if no universally valid precompute base index expression is found?
(test_intel_matrix_mul with n = 6*16, e.g.?)
- "No schedule found" debug help:
- Find longest dead-end
- Automatically report on what hinders progress there
- When duplicating, use iname aliases to relieve burden on isl
- Differentiate ilp.unr from ilp.seq
- Expose iname-duplicate-and-rename as a primitive.
- Fix all tests
Future ideas
^^^^^^^^^^^^
- How is intra-instruction ordering of ILP loops going to be determined?
(taking into account that it could vary even per-instruction?)
- Barriers for data exchanged via global vars?
- Float4 joining on fetch/store?
......
......@@ -114,6 +114,11 @@ Precomputation and Prefetching
Uses :func:`extract_subst` and :func:`precompute`.
Manipulating Reductions
-----------------------
.. autofunction:: realize_reduction
Finishing up
------------
......
......@@ -25,7 +25,7 @@ from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg
from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph, LoopKernel
from loopy.subst import extract_subst, apply_subst
from loopy.cse import precompute
from loopy.preprocess import preprocess_kernel
from loopy.preprocess import preprocess_kernel, realize_reduction
from loopy.schedule import generate_loop_schedules
from loopy.codegen import generate_code
from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_ref
......
......@@ -149,7 +149,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
# }}}
# {{{ parallel loop
# {{{ hw-parallel loop
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag
......
......@@ -110,7 +110,25 @@ def duplicate_reduction_inames(kernel):
# {{{ rewrite reduction to imperative form
def realize_reduction(kernel):
def realize_reduction(kernel, insn_id_filter=None):
"""Rewrites reductions into their imperative form. With *insn_id_filter* specified,
operate only on the instruction whose instruction id matches *insn_id_filter*.
If *insn_id_filter* is given, only the outermost level of reductions will be
expanded; inner reductions will be left alone (because they end up in a new
instruction with a different ID, which doesn't match the filter).
If *insn_id_filter* is not given, all reductions in all instructions will
be realized.
This routine also implicitly performs (global) reduction iname duplication,
if requested by '@' prefixes on any reduction iname.
"""
# Reduction iname duplication needs to happen beforehand, and it is
# idempotent. So just call it now.
kernel = duplicate_reduction_inames(kernel)
new_insns = []
new_temporary_variables = kernel.temporary_variables.copy()
......@@ -130,7 +148,9 @@ def realize_reduction(kernel):
ilp_iname_lengths = []
for iname in ilp_inames:
# original kernel ok here--we're not messing with inames
# Using the original kernel is ok here. Nothing in realize_reduction
# messes with inames. This is useful because it takes advantage
# of bounds caching.
bounds = kernel.get_iname_bounds(iname)
from loopy.symbolic import pw_aff_to_expr
......@@ -206,6 +226,10 @@ def realize_reduction(kernel):
insn = insn_queue.pop(0)
if insn_id_filter is not None and insn.id != insn_id_filter:
new_insns.append(insn)
continue
# Run reduction expansion.
new_expression = cb_mapper(insn.expression)
......@@ -681,9 +705,8 @@ def preprocess_kernel(kernel):
from loopy.subst import apply_subst
kernel = apply_subst(kernel)
kernel = mark_local_temporaries(kernel)
kernel = duplicate_reduction_inames(kernel)
kernel = realize_reduction(kernel)
kernel = mark_local_temporaries(kernel)
kernel = assign_automatic_axes(kernel)
kernel = add_boostability_and_automatic_dependencies(kernel)
kernel = limit_boostability(kernel)
......
......@@ -267,6 +267,9 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
# {{{ decide about debug mode
debug_mode = False
#if len(schedule) == 15:
#debug_mode = True
if debug is not None:
if (debug.debug_length is not None
and len(schedule) >= debug.debug_length):
......@@ -313,6 +316,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
# nested loop without harm.
if allow_boost:
# Note that the inames in 'insn.boostable_into' necessarily won't
# be contained in 'want'.
have = have - insn.boostable_into
if want != have:
......@@ -330,10 +335,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b
if (not schedule_now and have <= want):
reachable_insn_ids.add(insn_id)
else:
if debug_mode:
print (" '%s' also not reachable because it won't work under '%s'"
% (insn.id, ",".join(have-want)))
# }}}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment