diff --git a/MEMO b/MEMO index 815d874235c804dfa3d7d8a3a8dc038cbc8902bf..0e3d65aad124a443f4e99260fac8ff931f954584 100644 --- a/MEMO +++ b/MEMO @@ -53,14 +53,15 @@ Things to consider other inames -> Is that reasonable? +- Parallel dimension splitting/merging via tags + -> unnecessary? + TODO ^^^^ - implemented_domain may end up being smaller than requested in cse evaluations--check that! -- Parallel dimension splitting/merging via tags - - FIXME: Deal with insns losing a seq iname dep in a CSE realization a <- cse(reduce(stuff)) @@ -75,6 +76,8 @@ TODO - Sharing of checks across ILP instances +- Slab decomposition for ILP + - Some things involving CSEs might be impossible to schedule a[i,j] = cse(b[i]) * cse(c[j]) @@ -82,17 +85,15 @@ TODO - How should we implement the dim shuffling for odd-size prefetches? -- Slab decomposition for ILP - - Better for loop bound generation -> Try a triangular loop -- Implement condition hoisting - (needed, e.g., by slab decomposition) - Dealt with ^^^^^^^^^^ +- Implement condition hoisting + (needed, e.g., by slab decomposition) + - Check for non-use of hardware axes - Slab decomposition for parallel dimensions diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index f9c29a08a1453d27d2a5c9dedfb0b2d13a96c631..e3cd775a435d67831a9bf6355bfa97d186b3bcdd 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -69,7 +69,43 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): +def remove_inames_for_shared_hw_axes(kernel, cond_inames): + """ + See if cond_inames contains references to two (or more) inames that + boil down to the same tag. If so, exclude them. (We shouldn't be writing + conditionals for such inames because we would be implicitly restricting + the other inames as well.) + """ + + tag_key_use_count = {} + + from loopy.kernel import HardwareParallelTag + + for iname in cond_inames: + tag = kernel.iname_to_tag.get(iname) + + if isinstance(tag, HardwareParallelTag): + tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1 + + multi_use_keys = set( + key for key, count in tag_key_use_count.iteritems() + if count > 1) + + multi_use_inames = set() + for iname in cond_inames: + tag = kernel.iname_to_tag.get(iname) + if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: + multi_use_inames.add(iname) + + return cond_inames - multi_use_inames + + + + def build_loop_nest(kernel, sched_index, codegen_state): + # Most of the complexity of this function goes towards finding groups of + # instructions that can be nested inside a shared conditional. + assert isinstance(codegen_state, CodeGenerationState) from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, @@ -101,10 +137,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # }}} - # {{{ pass 2: find admissible conditional inames - - # FIXME: See if another inner insn relies on a different iname - # boiling down to the same tag. If so, exclude that. + # {{{ pass 2: find admissible conditional inames for each schedule item admissible_cond_inames = [ get_admissible_conditional_inames_for(kernel, sched_index) @@ -112,7 +145,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # }}} - # {{{ pass 3: greedily group instructions that share admissible conditionals + # {{{ pass 3: greedily group schedule items that share admissible inames def build_insn_group(sched_indices_and_cond_inames, codegen_state, min_iname_count=1): @@ -124,7 +157,10 @@ def build_loop_nest(kernel, sched_index, codegen_state): sched_index, cond_inames = sched_indices_and_cond_inames[0] - # {{{ keep growing instruction group as long as shared inames exist + # {{{ grow schedule item group + + # Keep growing schedule item group as long as group fulfills minimum + # size requirement. current_iname_set = cond_inames @@ -146,14 +182,20 @@ def build_loop_nest(kernel, sched_index, codegen_state): # Success: found a big enough group of inames for a conditional. # See if there are bounds checks available for that set. + # {{{ see which inames were actually used in group + + # And only generate conditionals for those. from loopy.schedule import find_used_inames_within used_inames = set() for subsched_index, _ in sched_indices_and_cond_inames[0:idx]: used_inames |= find_used_inames_within(kernel, subsched_index) + # }}} + from loopy.codegen.bounds import generate_bounds_checks bounds_checks = generate_bounds_checks(kernel.domain, - current_iname_set & used_inames, + remove_inames_for_shared_hw_axes(kernel, + current_iname_set & used_inames), codegen_state.implemented_domain) else: bounds_checks = []