From e606dd4639711a0e712fb7c247e28c563cf6e5b1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Wed, 12 Oct 2011 18:31:52 -0400 Subject: [PATCH] First cut of conditional hoisting. --- MEMO | 27 ++++-- loopy/codegen/__init__.py | 46 +++++++-- loopy/codegen/bounds.py | 12 +-- loopy/codegen/dispatch.py | 182 +++++++++++++++++++++++++++++------ loopy/codegen/instruction.py | 44 ++------- loopy/codegen/loop.py | 132 ++++++++++++------------- loopy/isl.py | 20 +--- loopy/kernel.py | 6 +- loopy/schedule.py | 45 +++++++-- test/test_matmul.py | 2 +- 10 files changed, 333 insertions(+), 183 deletions(-) diff --git a/MEMO b/MEMO index 8eb2fd5ee..24ddf973c 100644 --- a/MEMO +++ b/MEMO @@ -49,6 +49,13 @@ Things to consider - Loop bounds currently may not depend on parallel dimensions Does it make sense to relax this? +- Why do CSEs necessarily have to duplicate the inames? + -> because that would be necessary for a sequential prefetch + +- Cannot do slab decomposition on inames that share a tag with + other inames + -> Is that reasonable? + TODO ^^^^ @@ -69,10 +76,6 @@ TODO -> Reduction -> CSEs? -- Slab decomposition for parallel dimensions - - implement at the outermost nesting level regardless - - bound *all* tagged inames - - Sharing of checks across ILP instances - Some things involving CSEs might be impossible to schedule @@ -80,12 +83,24 @@ TODO - Flag, exploit idempotence -- Implement insert_parallel_dim_check_points - (but first: find a kernel that needs it) +- How should we implement the dim shuffling for odd-size prefetches? + +- Slab decomposition for ILP + +- Better for loop bound generation + -> Try a triangular loop + +- Implement condition hoisting + (needed, e.g., by slab decomposition) Dealt with ^^^^^^^^^^ +- Slab decomposition for parallel dimensions + - implement at the outermost nesting level regardless + - bound *all* tagged inames + - can't slab inames that share tags with other inames (for now) + - Make syntax for iname dependencies - make syntax for insn dependencies diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index b15178dc0..f022e6098 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -127,6 +127,39 @@ class CodeGenerationState(object): # }}} +# {{{ initial assignments + +def make_initial_assignments(kernel): + assignments = {} + + global_size, local_size = kernel.get_grid_sizes() + + from loopy.kernel import TAG_LOCAL_IDX, TAG_GROUP_IDX + from pymbolic import var + + for iname in kernel.all_inames(): + tag = kernel.iname_to_tag.get(iname) + + if isinstance(tag, TAG_LOCAL_IDX): + hw_axis_expr = var("lid")(tag.axis) + hw_axis_size = local_size[tag.axis] + + elif isinstance(tag, TAG_GROUP_IDX): + hw_axis_expr = var("gid")(tag.axis) + hw_axis_size = global_size[tag.axis] + + else: + continue + + bounds = kernel.get_iname_bounds(iname) + + from loopy.symbolic import pw_aff_to_expr + assignments[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff) + hw_axis_expr + + return assignments + +# }}} + # {{{ main code generation entrypoint def generate_code(kernel): @@ -138,9 +171,8 @@ def generate_code(kernel): CLLocal, CLImage, CLConstant) from loopy.symbolic import LoopyCCodeMapper - ccm = LoopyCCodeMapper(kernel) - - # {{{ build top-level + ccm = LoopyCCodeMapper(kernel).copy_and_assign_many( + make_initial_assignments(kernel)) mod = Module() @@ -244,12 +276,12 @@ def generate_code(kernel): # }}} - from loopy.codegen.dispatch import build_loop_nest - from islpy import align_spaces initial_implemented_domain = 
align_spaces(kernel.assumptions, kernel.domain) - gen_code = build_loop_nest(kernel, 0, - CodeGenerationState(initial_implemented_domain, c_code_mapper=ccm)) + codegen_state = CodeGenerationState(initial_implemented_domain, c_code_mapper=ccm) + + from loopy.codegen.loop import set_up_hw_parallel_loops + gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) body.append(Line()) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 84f130e2d..a7a5cedee 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -143,23 +143,23 @@ def filter_necessary_constraints(implemented_domain, constraints): if not implemented_domain.is_subset( isl.Set.universe(space).add_constraint(cns))] -def generate_bounds_checks(domain, check_vars, implemented_domain): +def generate_bounds_checks(domain, check_inames, implemented_domain): domain_bset, = (domain - .eliminate_except(check_vars, [dim_type.set]) + .eliminate_except(check_inames, [dim_type.set]) .coalesce() .get_basic_sets()) return filter_necessary_constraints( implemented_domain, domain_bset.get_constraints()) -def generate_bounds_checks_code(ccm, domain, check_vars, implemented_domain): +def generate_bounds_checks_code(ccm, domain, check_inames, implemented_domain): return [constraint_to_code(ccm, cns) for cns in - generate_bounds_checks(domain, check_vars, implemented_domain)] + generate_bounds_checks(domain, check_inames, implemented_domain)] -def wrap_in_bounds_checks(ccm, domain, check_vars, implemented_domain, stmt): +def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): from loopy.codegen import wrap_in_if return wrap_in_if( - generate_bounds_checks_code(ccm, domain, check_vars, + generate_bounds_checks_code(ccm, domain, check_inames, implemented_domain), stmt) diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index 77b2f718e..9d1f0261a 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -2,6 +2,69 @@ from __future__ import division from loopy.codegen import CodeGenerationState, gen_code_block +import islpy as isl + + + + +def get_admissible_conditional_inames_for(kernel, sched_index): + """This function disallows conditionals on local-idx tagged + inames if there is a barrier nested somewhere within. 
+ """ + + from loopy.kernel import TAG_LOCAL_IDX, ParallelTag + + from loopy.schedule import find_active_inames_at, has_barrier_within + result = find_active_inames_at(kernel, sched_index) + + has_barrier = has_barrier_within(kernel, sched_index) + + for iname, tag in kernel.iname_to_tag.iteritems(): + if isinstance(tag, ParallelTag): + if not has_barrier or not isinstance(tag, TAG_LOCAL_IDX): + result.add(iname) + + return result + + + + +def generate_code_for_sched_index(kernel, sched_index, codegen_state): + from loopy.schedule import (EnterLoop, RunInstruction, Barrier) + + sched_item = kernel.schedule[sched_index] + + if isinstance(sched_item, EnterLoop): + tag = kernel.iname_to_tag[sched_item.iname] + + from loopy.codegen.loop import ( + generate_unroll_loop, + generate_sequential_loop_dim_code) + + from loopy.kernel import TAG_UNROLL, SequentialTag + if isinstance(tag, TAG_UNROLL): + func = generate_unroll_loop + elif tag is None or isinstance(tag, SequentialTag): + func = generate_sequential_loop_dim_code + else: + raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'" + % (sched_item.iname, tag)) + + return func(kernel, sched_index, codegen_state) + + elif isinstance(sched_item, Barrier): + from cgen import Statement as S + return S("barrier(CLK_LOCAL_MEM_FENCE)") + + elif isinstance(sched_item, RunInstruction): + insn = kernel.id_to_insn[sched_item.insn_id] + + from loopy.codegen.instruction import generate_instruction_code + return generate_instruction_code(kernel, insn, codegen_state) + + else: + raise RuntimeError("unexpected schedule item type: %s" + % type(sched_item)) @@ -11,9 +74,10 @@ def build_loop_nest(kernel, sched_index, codegen_state): from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, gather_schedule_subloop) - from cgen import Statement as S - result = [] + # {{{ pass 1: pre-scan schedule for my schedule items' indices + + my_sched_indices = [] while sched_index < len(kernel.schedule): sched_item = kernel.schedule[sched_index] @@ -21,49 +85,105 @@ def build_loop_nest(kernel, sched_index, codegen_state): if isinstance(sched_item, LeaveLoop): break - elif isinstance(sched_item, EnterLoop): - tag = kernel.iname_to_tag[sched_item.iname] - - from loopy.codegen.loop import ( - generate_unroll_or_ilp_code, - generate_parallel_loop_dim_code, - generate_sequential_loop_dim_code) - - from loopy.kernel import (TAG_UNROLL, TAG_ILP, - ParallelTagWithAxis) - if isinstance(tag, (TAG_UNROLL, TAG_ILP)): - func = generate_unroll_or_ilp_code - elif isinstance(tag, ParallelTagWithAxis): - func = generate_parallel_loop_dim_code - else: - func = generate_sequential_loop_dim_code - - result.append(func(kernel, sched_index, codegen_state)) + my_sched_indices.append(sched_index) + if isinstance(sched_item, EnterLoop): _, sched_index = gather_schedule_subloop( kernel.schedule, sched_index) - elif isinstance(sched_item, Barrier): - result.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - sched_index += 1 elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] + sched_index += 1 + else: + raise RuntimeError("unexpected schedule item type: %s" + % type(sched_item)) - from loopy.codegen.instruction import generate_instruction_code + # }}} - result.append( - generate_instruction_code(kernel, insn, codegen_state)) + # {{{ pass 2: find admissible conditional inames - sched_index += 1 + # FIXME: See if another inner insn relies on a different iname + # boiling down to the same tag. If so, exclude that. 
+ + admissible_cond_inames = [ + get_admissible_conditional_inames_for(kernel, sched_index) + for sched_index in my_sched_indices] + + # }}} + + # {{{ pass 3: greedily group instructions that share admissible conditionals + + def build_insn_group(sched_indices_and_cond_inames, codegen_state, + min_iname_count=1): + # min_iname_count serves to prevent infinite recursion by imposing a + # bigger and bigger minimum size on the group of shared inames found. + + if not sched_indices_and_cond_inames: + return [] + + sched_index, cond_inames = sched_indices_and_cond_inames[0] + + # {{{ keep growing instruction group as long as shared inames exist + + current_iname_set = cond_inames + idx = 1 + while (len(current_iname_set) >= min_iname_count + and idx < len(sched_indices_and_cond_inames)): + other_sched_index, other_cond_inames = sched_indices_and_cond_inames[idx] + new_iname_set = current_iname_set & other_cond_inames + + if len(new_iname_set) >= min_iname_count: + idx += 1 + current_iname_set = new_iname_set + else: + break + + # }}} + + if len(current_iname_set) >= min_iname_count: + # Success: found a big enough group of inames for a conditional. + # See if there are bounds checks available for that set. + + from loopy.codegen.bounds import generate_bounds_checks + bounds_checks = generate_bounds_checks(kernel.domain, current_iname_set, + codegen_state.implemented_domain) else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) + bounds_checks = [] + + if bounds_checks: + check_set = isl.BasicSet.universe(kernel.space) + for cns in bounds_checks: + check_set = check_set.add_constraint(cns) + + new_codegen_state = codegen_state.intersect(check_set) + else: + new_codegen_state = codegen_state + + if idx == 1: + # group only contains starting schedule item + result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)] + else: + # recurse with a bigger iname count + result = build_insn_group(sched_indices_and_cond_inames[0:idx], + new_codegen_state, len(current_iname_set)+1) + + if bounds_checks: + from loopy.codegen import wrap_in_if + from loopy.codegen.bounds import constraint_to_code + result = [wrap_in_if( + [constraint_to_code(codegen_state.c_code_mapper, cns) for cns in bounds_checks], + gen_code_block(result))] + + return result + build_insn_group( + sched_indices_and_cond_inames[idx:], codegen_state) + # }}} - return gen_code_block(result) + return gen_code_block( + build_insn_group(zip( + my_sched_indices, admissible_cond_inames), codegen_state)) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index e9ddcf2c0..1d1bc55d5 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -41,45 +41,11 @@ class ILPInstance(Record): def generate_ilp_instances(kernel, insn, codegen_state): - assignments = {} impl_domain = codegen_state.implemented_domain - from loopy.kernel import (TAG_ILP, - TAG_LOCAL_IDX, TAG_GROUP_IDX) + from loopy.kernel import TAG_ILP - from pymbolic import var - - # {{{ pass 1: assign all hw-parallel dimensions - - global_size, local_size = kernel.get_grid_sizes() - - for iname in insn.all_inames(): - tag = kernel.iname_to_tag.get(iname) - - if isinstance(tag, TAG_LOCAL_IDX): - hw_axis_expr = var("lid")(tag.axis) - hw_axis_size = local_size[tag.axis] - - elif isinstance(tag, TAG_GROUP_IDX): - hw_axis_expr = var("gid")(tag.axis) - hw_axis_size = global_size[tag.axis] - - else: - continue - - bounds = kernel.get_iname_bounds(iname) - - from loopy.isl import make_slab - slab = 
make_slab(impl_domain.get_space(), iname, - bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size) - impl_domain = impl_domain.intersect(slab) - - from loopy.symbolic import pw_aff_to_expr - assignments[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff) + hw_axis_expr - - # }}} - - result = [ILPInstance(impl_domain, assignments, frozenset())] + result = [ILPInstance(impl_domain, {}, frozenset())] # {{{ pass 2: treat all ILP dimensions @@ -118,3 +84,9 @@ def generate_instruction_code(kernel, insn, codegen_state): from loopy.codegen import gen_code_block return gen_code_block(result) + + + + + +# vim: foldmethod=marker diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index e667a9d76..20405c0ff 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -12,7 +12,6 @@ from loopy.codegen.dispatch import build_loop_nest def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): - from loopy.isl import cast_constraint_to_space from loopy.codegen.bounds import get_bounds_constraints, get_defined_inames lower_constraints_orig, upper_constraints_orig, equality_constraints_orig = \ get_bounds_constraints(kernel.domain, iname, @@ -32,16 +31,15 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition -def get_slab_decomposition(kernel, sched_index, exec_domain): +def get_slab_decomposition(kernel, iname, sched_index, codegen_state): from loopy.isl import block_shift_constraint, negate_constraint - ccm = exec_domain.c_code_mapper + ccm = codegen_state.c_code_mapper space = kernel.space - iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname, - exec_domain.implemented_domain) + codegen_state.implemented_domain) lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0)) @@ -83,11 +81,10 @@ def get_slab_decomposition(kernel, sched_index, exec_domain): # {{{ unrolled/ILP loops -def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): +def generate_unroll_loop(kernel, sched_index, codegen_state): from loopy.isl import block_shift_constraint - from loopy.codegen.bounds import solve_constraint_for_bound - from cgen import (POD, Assign, Line, Statement as S, Initializer, Const) + from cgen import (POD, Line) ccm = codegen_state.c_code_mapper space = kernel.space @@ -97,12 +94,6 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): lower_cns, upper_cns = get_simple_loop_bounds(kernel, sched_index, iname, codegen_state.implemented_domain) - lower_kind, lower_bound = solve_constraint_for_bound(lower_cns, iname) - upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname) - - assert lower_kind == ">=" - assert upper_kind == "<" - bounds = kernel.get_iname_bounds(iname) from loopy.isl import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr @@ -111,7 +102,8 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): lower_bound_pw_aff_pieces = bounds.lower_bound_pw_aff.coalesce().get_pieces() if len(lower_bound_pw_aff_pieces) > 1: - raise NotImplementedError("lower bound for ILP/unroll needed conditional") + raise NotImplementedError("lower bound for unroll needs conditional/" + "has more than one piece") (_, lower_bound_aff), = lower_bound_pw_aff_pieces @@ -122,7 +114,7 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): block_shift_constraint( lower_cns, iname, -i, as_equality=True))) - from 
loopy.kernel import TAG_ILP, TAG_UNROLL + from loopy.kernel import TAG_UNROLL if isinstance(tag, TAG_UNROLL): result = [POD(np.int32, iname), Line()] @@ -134,23 +126,6 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): return gen_code_block(result) - elif isinstance(tag, TAG_ILP): - new_ilp_instances = [] - for ilpi in codegen_state.ilp_instances: - for i in range(length): - idx_aff = lower_bound_aff + i - new_ilp_instances.append(ilpi.fix(iname, idx_aff)) - - overall_slab = (isl.Set.universe(kernel.space) - .add_constraint(lower_cns) - .add_constraint(upper_cns)) - - return build_loop_nest(kernel, sched_index+1, - CodeGenerationState( - codegen_state.implemented_domain.intersect(overall_slab), - codegen_state.c_code_mapper, - new_ilp_instances)) - else: raise RuntimeError("unexpected tag") @@ -158,36 +133,61 @@ def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state): # {{{ parallel loop -def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain): - from loopy.isl import make_slab +def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None): + from loopy.kernel import UniqueTag, HardwareParallelTag, TAG_LOCAL_IDX, TAG_GROUP_IDX - ccm = exec_domain.c_code_mapper - space = kernel.space - iname = kernel.schedule[sched_index].iname + if hw_inames_left is None: + hw_inames_left = [iname + for iname in kernel.all_inames() + if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] + + from loopy.codegen.dispatch import build_loop_nest + if not hw_inames_left: + return build_loop_nest(kernel, sched_index, codegen_state) + + global_size, local_size = kernel.get_grid_sizes() + + iname = hw_inames_left.pop() tag = kernel.iname_to_tag.get(iname) - lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition( - kernel, sched_index, exec_domain) + assert isinstance(tag, UniqueTag) - # For a parallel loop dimension, the global loop bounds are - # automatically obeyed--simply because no work items are launched - # outside the requested grid. - # - # For a forced length, this is implemented by an if below. 
+ other_inames_with_same_tag = [ + other_iname for other_iname in kernel.all_inames() + if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag) + and kernel.iname_to_tag.get(other_iname).key == tag.key + and other_iname != iname] - if tag.forced_length is None: - exec_domain = exec_domain.intersect( - isl.Set.universe(kernel.space) - .add_constraint(lb_cns_orig) - .add_constraint(ub_cns_orig)) + # {{{ 'implement' hardware axis boundaries + + if isinstance(tag, TAG_LOCAL_IDX): + hw_axis_size = local_size[tag.axis] + elif isinstance(tag, TAG_GROUP_IDX): + hw_axis_size = global_size[tag.axis] else: - impl_len = tag.forced_length - start, _, _ = kernel.get_bounds(iname, (iname,), allow_parameters=True) - exec_domain = exec_domain.intersect( - make_slab(kernel.space, iname, start, start+impl_len)) + raise RuntimeError("unknown hardware parallel tag") + + result = [] + + bounds = kernel.get_iname_bounds(iname) + + from loopy.isl import make_slab + slab = make_slab(kernel.space, iname, + bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size) + codegen_state = codegen_state.intersect(slab) + + # }}} + + lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition( + kernel, iname, sched_index, codegen_state) + + if other_inames_with_same_tag and len(slabs) > 1: + raise RuntimeError("cannot do slab decomposition on inames that share " + "a tag with other inames") + + ccm = codegen_state.c_code_mapper result = [] - nums_of_conditionals = [] from loopy.codegen import add_comment @@ -196,11 +196,10 @@ def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain): if len(slabs) == 1: cmt = None - new_kernel = kernel.copy( - domain=kernel.domain.intersect(slab)) - result.append( - add_comment(cmt, - build_loop_nest(new_kernel, sched_index+1, exec_domain))) + new_kernel = kernel.copy(domain=kernel.domain.intersect(slab)) + inner = set_up_hw_parallel_loops( + new_kernel, sched_index, codegen_state, hw_inames_left) + result.append(add_comment(cmt, inner)) from loopy.codegen import gen_code_block return gen_code_block(result, is_alternatives=True) @@ -209,31 +208,28 @@ def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain): # {{{ sequential loop -def generate_sequential_loop_dim_code(kernel, sched_index, exec_domain): - - ccm = exec_domain.c_code_mapper +def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): + ccm = codegen_state.c_code_mapper space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition( - kernel, sched_index, exec_domain) + kernel, iname, sched_index, codegen_state) result = [] - nums_of_conditionals = [] for slab_name, slab in slabs: cmt = "%s slab for '%s'" % (slab_name, iname) if len(slabs) == 1: cmt = None - new_exec_domain = exec_domain.intersect(slab) + new_codegen_state = codegen_state.intersect(slab) inner = build_loop_nest(kernel, sched_index+1, - new_exec_domain) + new_codegen_state) from loopy.codegen.bounds import wrap_in_for_from_constraints - # regular loop if cmt is not None: from cgen import Comment result.append(Comment(cmt)) diff --git a/loopy/isl.py b/loopy/isl.py index ef1b6cf51..055b65655 100644 --- a/loopy/isl.py +++ b/loopy/isl.py @@ -9,21 +9,6 @@ from islpy import dim_type -def cast_constraint_to_space(cns, new_space, as_equality=None): - 1/0 # bad routine, shouldn't be used - - if as_equality is None: - as_equality = cns.is_equality() - - if as_equality: - factory = isl.Constraint.eq_from_names - else: - factory 
= isl.Constraint.ineq_from_names - return factory(new_space, cns.get_coefficients_by_name()) - - - - def block_shift_constraint(cns, type, pos, multiple, as_equality=None): if as_equality != cns.is_equality(): if as_equality: @@ -107,10 +92,13 @@ def pw_aff_to_aff(pw_aff): -def dump_local_space(ls): +def dump_space(ls): return " ".join("%s: %d" % (dt, ls.dim(getattr(dim_type, dt))) for dt in dim_type.names) + + + def make_slab(space, iname, start, stop): zero = isl.Aff.zero_on_domain(space) diff --git a/loopy/kernel.py b/loopy/kernel.py index 2a8256ec4..0da1d5671 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -35,7 +35,7 @@ class UniqueTag(IndexTag): def key(self): return type(self) -class ParallelTagWithAxis(ParallelTag, UniqueTag): +class HardwareParallelTag(ParallelTag, UniqueTag): __slots__ = ["axis"] def __init__(self, axis): @@ -56,10 +56,10 @@ class ParallelTagWithAxis(ParallelTag, UniqueTag): -class TAG_GROUP_IDX(ParallelTagWithAxis): +class TAG_GROUP_IDX(HardwareParallelTag): print_name = "g" -class TAG_LOCAL_IDX(ParallelTagWithAxis): +class TAG_LOCAL_IDX(HardwareParallelTag): print_name = "l" class TAG_AUTO_LOCAL_IDX(ParallelTag): diff --git a/loopy/schedule.py b/loopy/schedule.py index 5fef6bcfb..0dbb8af74 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -667,13 +667,6 @@ def insert_barriers(kernel, schedule, level=0): -def insert_parallel_dim_check_points(kernel, schedule): - # FIXME: Unimplemented - return kernel - - - - def generate_loop_schedules(kernel): kernel = realize_reduction(kernel) @@ -701,12 +694,46 @@ def generate_loop_schedules(kernel): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) assert not owed_barriers - schedule = insert_parallel_dim_check_points(kernel, gen_sched) - yield kernel.copy(schedule=gen_sched) +# {{{ schedule utilities + +def find_active_inames_at(kernel, sched_index): + active_inames = [] + + from loopy.schedule import EnterLoop, LeaveLoop + for sched_item in kernel.schedule[:sched_index]: + if isinstance(sched_item, EnterLoop): + active_inames.append(sched_item.iname) + if isinstance(sched_item, LeaveLoop): + active_inames.pop() + + return set(active_inames) + + + + +def has_barrier_within(kernel, sched_index): + sched_item = kernel.schedule[sched_index] + + if isinstance(sched_item, EnterLoop): + loop_contents, _ = gather_schedule_subloop( + kernel.schedule, sched_index) + from pytools import any + return any(isinstance(subsched_item, Barrier) + for subsched_item in loop_contents) + elif isinstance(sched_item, Barrier): + return True + else: + return False + +# }}} + + + + # vim: foldmethod=marker diff --git a/test/test_matmul.py b/test/test_matmul.py index 6a54e5864..42ebf7e9e 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -216,7 +216,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1", no_slabs=True) knl = lp.split_dimension(knl, "j", 16, - outer_tag="g.1", inner_tag="l.0", no_slabs=True) + outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 16) knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) -- GitLab
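
A note on the two pieces this patch adds (illustration only, not part of the patch itself):

First, get_admissible_conditional_inames_for() refuses to offer local-id-tagged inames for hoisted conditionals whenever a barrier is nested inside the scheduled region. That restriction is what keeps the generated OpenCL legal: a barrier must be reached by every work item in the work-group, so it may not sit under a guard whose truth value differs between work items.

Second, build_insn_group() in loopy/codegen/dispatch.py does the actual hoisting: it greedily grows a run of consecutive schedule items whose admissible conditional inames overlap, emits the shared bounds check once around the whole run, and recurses inside the run with a larger minimum so that still-bigger shared sets get their own nested guard. The sketch below is a simplified, self-contained rendering of that grouping strategy only; it stands in plain strings and frozensets for loopy's schedule indices, isl sets and CodeGenerationState, and the names it uses (Item, wrap, the "run ..." output) are invented for the example:

    from typing import FrozenSet, List, Tuple

    # Each schedule item carries the set of inames on which it may legally
    # be guarded (its "admissible conditional inames").
    Item = Tuple[str, FrozenSet[str]]

    def wrap(cond_inames: FrozenSet[str], body: List[str]) -> List[str]:
        # Emit one hoisted guard around the whole body.
        if not cond_inames:
            return body
        guard = "if (%s in bounds):" % ", ".join(sorted(cond_inames))
        return [guard] + ["    " + line for line in body]

    def build_insn_group(items: List[Item], min_iname_count: int = 1) -> List[str]:
        # Greedily absorb consecutive items while they still share at least
        # min_iname_count admissible inames, emit one guard for the group,
        # then continue with the rest of the schedule.
        if not items:
            return []

        shared = items[0][1]
        idx = 1
        while idx < len(items) and len(shared) >= min_iname_count:
            new_shared = shared & items[idx][1]
            if len(new_shared) >= min_iname_count:
                shared, idx = new_shared, idx + 1
            else:
                break

        if idx == 1:
            # Group contains only its first item: generate it directly.
            body = ["run %s" % items[0][0]]
        else:
            # Look for still-larger shared sets inside the group; raising the
            # minimum count is what makes the recursion terminate.
            body = build_insn_group(items[:idx], len(shared) + 1)

        result = wrap(shared, body) if len(shared) >= min_iname_count else body
        return result + build_insn_group(items[idx:], min_iname_count)

    if __name__ == "__main__":
        items = [
            ("insn_a", frozenset({"i", "j"})),
            ("insn_b", frozenset({"i"})),
            ("insn_c", frozenset()),
        ]
        print("\n".join(build_insn_group(items)))

One simplification to be aware of: the real pass intersects each group's bounds checks into the implemented domain before recursing, so a nested group does not repeat conditions its enclosing group already guarantees; the toy above skips that filtering, which is why its inner guard re-states the outer one.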