From 85194627bf520b3feaa4829c9773a8da3cc6842a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Tue, 25 Oct 2011 01:46:41 -0400 Subject: [PATCH] Revamp conditional hoist algorithm. The previous algorithm would miss hoist opportunities if a larger group of candidate inames existed which then in turn did not lead to a viable hoisted condition. --- MEMO | 3 - loopy/codegen/control.py | 66 ++++++----- loopy/codegen/dispatch.py | 242 -------------------------------------- test/test_matmul.py | 50 ++++---- 4 files changed, 62 insertions(+), 299 deletions(-) delete mode 100644 loopy/codegen/dispatch.py diff --git a/MEMO b/MEMO index b16d1c3ac..56bc50c2a 100644 --- a/MEMO +++ b/MEMO @@ -6,10 +6,7 @@ For writeup: TODO: Reimplement forced lengths TODO: Try, fix reg. prefetch (DG example) / CSEs ILP and reg. prefetch interact! -TODO: Functions -TODO: ILP arrays FIXME: support non-reductive dimensions (what did I mean here?) -FIXME: write names should be assigned during scheduling FIXME: screwy lower bounds in ILP FIXME: Leading syncthreads elimination diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 2b6767db9..788625e0e 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -80,7 +80,7 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames): the other inames as well.) """ - tag_key_use_count = {} + tag_key_uses = {} from loopy.kernel import HardwareParallelTag @@ -88,11 +88,11 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames): tag = kernel.iname_to_tag.get(iname) if isinstance(tag, HardwareParallelTag): - tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1 + tag_key_uses.setdefault(tag.key, []).append(iname) multi_use_keys = set( - key for key, count in tag_key_use_count.iteritems() - if count > 1) + key for key, user_inames in tag_key_uses.iteritems() + if len(user_inames) > 1) multi_use_inames = set() for iname in cond_inames: @@ -150,8 +150,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # {{{ pass 3: greedily group schedule items that share admissible inames - def build_insn_group(sched_indices_and_cond_inames, codegen_state, - min_iname_count=1): + def build_insn_group(sched_indices_and_cond_inames, codegen_state, done_group_lengths=set()): # min_iname_count serves to prevent infinite recursion by imposing a # bigger and bigger minimum size on the group of shared inames found. @@ -167,41 +166,46 @@ def build_loop_nest(kernel, sched_index, codegen_state): current_iname_set = cond_inames - idx = 1 - while (len(current_iname_set) >= min_iname_count - and idx < len(sched_indices_and_cond_inames)): - other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx] - new_iname_set = current_iname_set & other_cond_inames + found_hoists = [] - if len(new_iname_set) >= min_iname_count: - idx += 1 - current_iname_set = new_iname_set - else: - break + candidate_group_length = 1 + while candidate_group_length <= len(sched_indices_and_cond_inames): + if candidate_group_length in done_group_lengths: + candidate_group_length += 1 + continue - # }}} - - if len(current_iname_set) >= min_iname_count: - # Success: found a big enough group of inames for a conditional. - # See if there are bounds checks available for that set. 
+ other_sched_index, other_cond_inames = sched_indices_and_cond_inames[candidate_group_length-1] + current_iname_set = current_iname_set & other_cond_inames - # {{{ see which inames were actually used in group + # {{{ see which inames are actually used in group # And only generate conditionals for those. from loopy.schedule import find_used_inames_within used_inames = set() - for subsched_index, _ in sched_indices_and_cond_inames[0:idx]: + for subsched_index, _ in sched_indices_and_cond_inames[0:candidate_group_length]: used_inames |= find_used_inames_within(kernel, subsched_index) # }}} from loopy.codegen.bounds import generate_bounds_checks + only_unshared_inames = remove_inames_for_shared_hw_axes(kernel, + current_iname_set & used_inames) + bounds_checks = generate_bounds_checks(kernel.domain, remove_inames_for_shared_hw_axes(kernel, - current_iname_set & used_inames), + only_unshared_inames), codegen_state.implemented_domain) - else: - bounds_checks = [] + + if bounds_checks or candidate_group_length == 1: + # length-1 must always be an option to reach the recursion base case below + found_hoists.append((candidate_group_length, bounds_checks)) + + candidate_group_length += 1 + + # }}} + + # pick largest such group + group_length, bounds_checks = max(found_hoists) if bounds_checks: check_set = isl.BasicSet.universe(kernel.space) @@ -212,13 +216,15 @@ def build_loop_nest(kernel, sched_index, codegen_state): else: new_codegen_state = codegen_state - if idx == 1: + if group_length == 1: # group only contains starting schedule item result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)] else: # recurse with a bigger minimum iname count - result = build_insn_group(sched_indices_and_cond_inames[0:idx], - new_codegen_state, len(current_iname_set)+1) + result = build_insn_group( + sched_indices_and_cond_inames[0:group_length], + new_codegen_state, + done_group_lengths=done_group_lengths | set([group_length])) if bounds_checks: from loopy.codegen import wrap_in_if @@ -228,7 +234,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): gen_code_block(result))] return result + build_insn_group( - sched_indices_and_cond_inames[idx:], codegen_state) + sched_indices_and_cond_inames[group_length:], codegen_state) # }}} diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py deleted file mode 100644 index d5ee62839..000000000 --- a/loopy/codegen/dispatch.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Loop nest build top-level dispatch.""" -from __future__ import division - -from loopy.codegen import CodeGenerationState, gen_code_block -import islpy as isl - - - - -def get_admissible_conditional_inames_for(kernel, sched_index): - """This function disallows conditionals on local-idx tagged - inames if there is a barrier nested somewhere within. 
- """ - - from loopy.kernel import LocalIndexTag, HardwareParallelTag - - from loopy.schedule import find_active_inames_at, has_barrier_within - result = find_active_inames_at(kernel, sched_index) - - has_barrier = has_barrier_within(kernel, sched_index) - - for iname, tag in kernel.iname_to_tag.iteritems(): - if isinstance(tag, HardwareParallelTag): - if not has_barrier or not isinstance(tag, LocalIndexTag): - result.add(iname) - - return result - - - - -def generate_code_for_sched_index(kernel, sched_index, codegen_state): - from loopy.schedule import (EnterLoop, RunInstruction, Barrier) - - sched_item = kernel.schedule[sched_index] - - if isinstance(sched_item, EnterLoop): - tag = kernel.iname_to_tag.get(sched_item.iname) - - from loopy.codegen.loop import ( - generate_unroll_loop, - generate_sequential_loop_dim_code) - - from loopy.kernel import UnrollTag, SequentialTag - if isinstance(tag, UnrollTag): - func = generate_unroll_loop - elif tag is None or isinstance(tag, SequentialTag): - func = generate_sequential_loop_dim_code - else: - raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'" - % (sched_item.iname, tag)) - - return func(kernel, sched_index, codegen_state) - - elif isinstance(sched_item, Barrier): - from loopy.codegen import GeneratedInstruction - from cgen import Statement as S - return GeneratedInstruction( - ast=S("barrier(CLK_LOCAL_MEM_FENCE)"), - implemented_domain=None) - - elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] - - from loopy.codegen.instruction import generate_instruction_code - return generate_instruction_code(kernel, insn, codegen_state) - - else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) - - - - -def remove_inames_for_shared_hw_axes(kernel, cond_inames): - """ - See if cond_inames contains references to two (or more) inames that - boil down to the same tag. If so, exclude them. (We shouldn't be writing - conditionals for such inames because we would be implicitly restricting - the other inames as well.) - """ - - tag_key_use_count = {} - - from loopy.kernel import HardwareParallelTag - - for iname in cond_inames: - tag = kernel.iname_to_tag.get(iname) - - if isinstance(tag, HardwareParallelTag): - tag_key_use_count[tag.key] = tag_key_use_count.get(tag.key, 0) + 1 - - multi_use_keys = set( - key for key, count in tag_key_use_count.iteritems() - if count > 1) - - multi_use_inames = set() - for iname in cond_inames: - tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: - multi_use_inames.add(iname) - - return cond_inames - multi_use_inames - - - - -def build_loop_nest(kernel, sched_index, codegen_state): - # Most of the complexity of this function goes towards finding groups of - # instructions that can be nested inside a shared conditional. 
- - assert isinstance(codegen_state, CodeGenerationState) - - from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, - gather_schedule_subloop) - - # {{{ pass 1: pre-scan schedule for my schedule items' indices - - my_sched_indices = [] - - while sched_index < len(kernel.schedule): - sched_item = kernel.schedule[sched_index] - - if isinstance(sched_item, LeaveLoop): - break - - my_sched_indices.append(sched_index) - - if isinstance(sched_item, EnterLoop): - _, sched_index = gather_schedule_subloop( - kernel.schedule, sched_index) - elif isinstance(sched_item, Barrier): - sched_index += 1 - - elif isinstance(sched_item, RunInstruction): - sched_index += 1 - else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) - - # }}} - - # {{{ pass 2: find admissible conditional inames for each schedule item - - admissible_cond_inames = [ - get_admissible_conditional_inames_for(kernel, sched_index) - for sched_index in my_sched_indices] - - # }}} - - # {{{ pass 3: greedily group schedule items that share admissible inames - - def build_insn_group(sched_indices_and_cond_inames, codegen_state, - min_iname_count=1): - # min_iname_count serves to prevent infinite recursion by imposing a - # bigger and bigger minimum size on the group of shared inames found. - - if not sched_indices_and_cond_inames: - return [] - - sched_index, cond_inames = sched_indices_and_cond_inames[0] - - # {{{ grow schedule item group - - # Keep growing schedule item group as long as group fulfills minimum - # size requirement. - - current_iname_set = cond_inames - - idx = 1 - while (len(current_iname_set) >= min_iname_count - and idx < len(sched_indices_and_cond_inames)): - other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx] - new_iname_set = current_iname_set & other_cond_inames - - if len(new_iname_set) >= min_iname_count: - idx += 1 - current_iname_set = new_iname_set - else: - break - - # }}} - - if len(current_iname_set) >= min_iname_count: - # Success: found a big enough group of inames for a conditional. - # See if there are bounds checks available for that set. - - # {{{ see which inames were actually used in group - - # And only generate conditionals for those. 
- from loopy.schedule import find_used_inames_within - used_inames = set() - for subsched_index, _ in sched_indices_and_cond_inames[0:idx]: - used_inames |= find_used_inames_within(kernel, subsched_index) - - # }}} - - from loopy.codegen.bounds import generate_bounds_checks - bounds_checks = generate_bounds_checks(kernel.domain, - remove_inames_for_shared_hw_axes(kernel, - current_iname_set & used_inames), - codegen_state.implemented_domain) - else: - bounds_checks = [] - - if bounds_checks: - check_set = isl.BasicSet.universe(kernel.space) - for cns in bounds_checks: - check_set = check_set.add_constraint(cns) - - new_codegen_state = codegen_state.intersect(check_set) - else: - new_codegen_state = codegen_state - - if idx == 1: - # group only contains starting schedule item - result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)] - else: - # recurse with a bigger minimum iname count - result = build_insn_group(sched_indices_and_cond_inames[0:idx], - new_codegen_state, len(current_iname_set)+1) - - if bounds_checks: - from loopy.codegen import wrap_in_if - from loopy.codegen.bounds import constraint_to_code - result = [wrap_in_if( - [constraint_to_code(codegen_state.c_code_mapper, cns) for cns in bounds_checks], - gen_code_block(result))] - - return result + build_insn_group( - sched_indices_and_cond_inames[idx:], codegen_state) - - # }}} - - return gen_code_block( - build_insn_group(zip( - my_sched_indices, admissible_cond_inames), codegen_state)) - - - - -# vim: foldmethod=marker diff --git a/test/test_matmul.py b/test/test_matmul.py index e54893cb6..bc2b0c65b 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -361,7 +361,7 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): knl = lp.LoopKernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = a[i, k]*b[k, j]" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), @@ -374,16 +374,15 @@ def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): j_reg = 2 i_chunks = 16 j_chunks = 16 - knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) - knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) - knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "k", 16, no_slabs=True) - knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner"]) - assert knl.get_problems({})[0] <= 2 + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") + knl = lp.split_dimension(knl, "k", 16) + knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"]) - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl)) + kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) @@ -411,12 +410,12 @@ def test_intel_matrix_mul(ctx_factory): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = 
6*16*16 + n = 6*16 knl = lp.LoopKernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - "c[i, j] = a[i, k]*b[k, j]" + "c[i, j] = sum_float32(k, a[i, k]*b[k, j])" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), @@ -429,20 +428,23 @@ def test_intel_matrix_mul(ctx_factory): j_reg = 4 i_chunks = 16 j_chunks = 16 - knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0", no_slabs=True) - knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1", no_slabs=True) - knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp", no_slabs=True) - knl = lp.split_dimension(knl, "k", 16, no_slabs=True) + knl = lp.split_dimension(knl, "i", i_reg*i_chunks, outer_tag="g.0") + knl = lp.split_dimension(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp") + knl = lp.split_dimension(knl, "j", j_reg*j_chunks, outer_tag="g.1") + knl = lp.split_dimension(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp") + knl = lp.split_dimension(knl, "k", 16) #knl = lp.split_dimension(knl, "k_inner", 8, outer_tag="unr") - knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) - knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) - assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl, - hints=["k_outer", "k_inner_outer", "k_inner_inner"] - )) + knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"]) + + # FIXME: Grouped prefetch + #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")]) + #knl = lp.add_prefetch(knl, 'b', ["k_inner", ("j_inner_inner", "j_inner_outer"),]) + + kernel_gen = lp.generate_loop_schedules(knl) + #hints=["k_outer", "k_inner_outer", "k_inner_inner"] + kernel_gen = lp.check_kernels(kernel_gen, dict(n=n), kill_level_min=5) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) -- GitLab
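
As an aside, the group-selection idea this commit introduces in build_insn_group() can be summarized in a small standalone sketch. This is not loopy code: viable() stands in for generate_bounds_checks(), pick_hoist_group() for the prefix-group search, and the example items below are invented.

# Minimal standalone sketch of the revised hoist-group selection described in
# the commit message; NOT loopy code.

def pick_hoist_group(items_and_inames, viable, done_group_lengths=frozenset()):
    """Return (group_length, shared_inames) for the largest prefix group of
    schedule items whose shared conditional inames pass viable().  A length-1
    group always counts as a candidate (mirroring the base case in the patch),
    and lengths listed in done_group_lengths are never re-tried.
    """
    shared = set(items_and_inames[0][1])

    found = []
    for length in range(1, len(items_and_inames) + 1):
        if length > 1:
            # Shrink the shared iname set by the next item in the prefix.
            shared &= items_and_inames[length - 1][1]
        if length in done_group_lengths:
            continue
        if length == 1 or viable(shared):
            found.append((length, frozenset(shared)))

    # Largest viable group wins; a failed larger candidate no longer hides
    # smaller viable ones.
    return max(found)


if __name__ == "__main__":
    # Hypothetical example: the first two items share {i, j}, which is viable;
    # adding the third shrinks the shared set to {i}, which is not.
    items = [("insn0", {"i", "j"}), ("insn1", {"i", "j"}), ("insn2", {"i"})]
    length, inames = pick_hoist_group(items, viable=lambda s: s == {"i", "j"})
    print(length, sorted(inames))  # prints: 2 ['i', 'j']

The done_group_lengths set replaces the old min_iname_count mechanism: instead of demanding an ever-larger shared-iname set on each recursion, the caller simply never re-tries a group length it has already handled, so a large candidate group that fails to yield a viable hoisted condition no longer hides a smaller group that would.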
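
In the same spirit, here is a minimal sketch of the rule implemented by remove_inames_for_shared_hw_axes() (which this commit changes to track the user inames per tag key rather than a bare count). The helper name, the string axis keys, and the example tags are hypothetical and not part of loopy.

# Standalone sketch (not loopy code): if two or more conditional inames map
# onto the same hardware axis, a conditional on one would implicitly restrict
# the others, so all of them are dropped from the candidate set.

def drop_shared_hw_axis_inames(cond_inames, iname_to_hw_key):
    """iname_to_hw_key maps an iname to its hardware-axis key (a hypothetical
    stand-in for HardwareParallelTag.key); non-hardware inames are absent."""
    uses = {}
    for iname in cond_inames:
        key = iname_to_hw_key.get(iname)
        if key is not None:
            uses.setdefault(key, []).append(iname)

    multi_use = set()
    for user_inames in uses.values():
        if len(user_inames) > 1:
            multi_use.update(user_inames)

    return cond_inames - multi_use


if __name__ == "__main__":
    # Hypothetical tags: i_inner and j_inner both live on local axis 0.
    tags = {"i_inner": "l.0", "j_inner": "l.0", "k_outer": "g.0"}
    print(sorted(drop_shared_hw_axis_inames({"i_inner", "j_inner", "k_outer"}, tags)))
    # prints: ['k_outer']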