diff --git a/MEMO b/MEMO
index f88800f525cb85dfd71d9876763ecf83f799d9d9..20f7895ec6cb60f181409b2e0da813c6acebb433 100644
--- a/MEMO
+++ b/MEMO
@@ -66,6 +66,13 @@ Things to consider
 - multiple insns could fight over which iname gets local axis 0
   -> complicated optimization problem
 
+- How to determine which variables need to be duplicated for ILP?
+  -> Only reduction
+
+- Slab decomposition for parallel dimensions
+
+- Sharing of checks across ILP instances
+
 Dealt with
 ^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 606d579c666c5b0503dcf467395deb5914190e2f..add477149517a64ecf588b8f58477edfbbd27050 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -123,7 +123,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
             .copy(domain=new_domain,
                 assumptions=new_assumptions,
                 iname_slab_increments=iname_slab_increments,
-                name_to_dim=None,
+                iname_to_dim=None,
                 instructions=new_insns))
 
     return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag})
@@ -338,41 +338,49 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
     # {{{ build new domain, duplicating each constraint on duplicated inames
 
-    start_idx = kernel.domain.dim(dim_type.set)
-    new_domain = kernel.domain.insert_dims(
-            dim_type.set, start_idx,
-            len(duplicate_inames))
-    new_name_to_dim = kernel.name_to_dim.copy()
-    for i, iname in enumerate(new_inames):
-        new_idx = start_idx+i
-        new_domain = new_domain.set_dim_name(
-                dim_type.set, new_idx, iname)
-        new_name_to_dim[iname] = (dim_type.set, new_idx)
-
-    dup_iname_dims = [kernel.name_to_dim[iname]
+    dup_iname_dims = [kernel.iname_to_dim[iname]
             for iname in duplicate_inames]
     old_to_new = dict((old_iname, new_iname)
             for old_iname, new_iname in zip(duplicate_inames, new_inames))
 
-    new_domain_bs, = new_domain.get_basic_sets()
+    def realize_duplication(set):
+        start_idx = set.dim(dim_type.set)
+        result = set.insert_dims(
+                dim_type.set, start_idx,
+                len(duplicate_inames))
 
-    for cns in new_domain_bs.get_constraints():
-        if any(cns.involves_dims(*dim+(1,)) for dim in dup_iname_dims):
-            assert not cns.is_div_constraint()
-            if cns.is_equality():
-                new_cns = cns.equality_alloc(new_domain.get_space())
-            else:
-                new_cns = cns.inequality_alloc(new_domain.get_space())
+        new_iname_to_dim = kernel.iname_to_dim.copy()
+        for i, iname in enumerate(new_inames):
+            new_idx = start_idx+i
+            result = result.set_dim_name(
+                    dim_type.set, new_idx, iname)
+            new_iname_to_dim[iname] = (dim_type.set, new_idx)
+
+
+        set_bs, = set.get_basic_sets()
 
-            new_coeffs = {}
-            for key, val in cns.get_coefficients_by_name().iteritems():
-                if key in old_to_new:
-                    new_coeffs[old_to_new[key]] = val
+        for cns in set_bs.get_constraints():
+            if any(cns.involves_dims(*dim+(1,)) for dim in dup_iname_dims):
+                assert not cns.is_div_constraint()
+                if cns.is_equality():
+                    new_cns = cns.equality_alloc(result.get_space())
                 else:
-                    new_coeffs[key] = val
+                    new_cns = cns.inequality_alloc(result.get_space())
+
+                new_coeffs = {}
+                for key, val in cns.get_coefficients_by_name().iteritems():
+                    if key in old_to_new:
+                        new_coeffs[old_to_new[key]] = val
+                    else:
+                        new_coeffs[key] = val
+
+                new_cns = new_cns.set_coefficients_by_name(new_coeffs)
+                result = result.add_constraint(new_cns)
+
+        return result, new_iname_to_dim
 
-            new_cns = new_cns.set_coefficients_by_name(new_coeffs)
-            new_domain = new_domain.add_constraint(new_cns)
+    new_domain, new_iname_to_dim = realize_duplication(kernel.domain)
+    new_assumptions, _ = realize_duplication(kernel.assumptions)
 
     # }}}
 
@@ -382,8 +390,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
     target_var_shape = []
 
     for iname in new_inames:
-        lower_bound_pw_aff = new_domain.dim_min(new_name_to_dim[iname][1])
-        upper_bound_pw_aff = new_domain.dim_max(new_name_to_dim[iname][1])
+        lower_bound_pw_aff = new_domain.dim_min(new_iname_to_dim[iname][1])
+        upper_bound_pw_aff = new_domain.dim_max(new_iname_to_dim[iname][1])
 
         from loopy.isl import static_max_of_pw_aff
         from loopy.symbolic import pw_aff_to_expr
@@ -409,9 +417,10 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
     return kernel.copy(
             domain=new_domain,
+            assumptions=new_assumptions,
             instructions=new_insns,
             temporary_variables=new_temporary_variables,
-            name_to_dim=new_name_to_dim,
+            iname_to_dim=new_iname_to_dim,
             iname_to_tag=new_iname_to_tag)
 
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 1c0e358e4df93caa8b0046b4078c190fce69f15a..65a07c6d0a96939cffa5b9604618a687bf78605c 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -2,6 +2,7 @@ from __future__ import division
 
 from pytools import Record
 import numpy as np
+import islpy as isl
 
 
 
@@ -92,65 +93,46 @@ def add_comment(cmt, code):
 
 # }}}
 
-# {{{ main code generation entrypoint
-
-class ExecutionSubdomain(Record):
-    __slots__ = ["implemented_domain", "c_code_mapper"]
+# {{{ code generation state
 
+class CodeGenerationState(object):
     def __init__(self, implemented_domain, c_code_mapper):
-        Record.__init__(self,
-                implemented_domain=implemented_domain,
-                c_code_mapper=c_code_mapper)
-
-    def intersect(self, set):
-        return ExecutionSubdomain(
-                self.implemented_domain.intersect(set),
-                self.c_code_mapper)
-
-class ExecutionDomain(object):
-    def __init__(self, implemented_domain, c_code_mapper, subdomains=None):
         """
         :param implemented_domain: The entire implemented domain, i.e.
             all constraints that have been enforced so far.
-        :param subdomains: a list of :class:`ExecutionSubdomain`
-            instances.
-
-            The point of this being a list is the implementation of
-            ILP, and each entry represents a 'fake-parallel' trip through the
-            ILP'd loop, with the requisite implemented_domain
-            and a C code mapper that realizes the necessary assignments.
         :param c_code_mapper: A C code mapper that does not take per-ILP
             assignments into account.
         """
         self.implemented_domain = implemented_domain
-        if subdomains is None:
-            self.subdomains = [
-                    ExecutionSubdomain(implemented_domain, c_code_mapper)]
-        else:
-            self.subdomains = subdomains
         self.c_code_mapper = c_code_mapper
 
     def intersect(self, set):
-        return ExecutionDomain(
+        return CodeGenerationState(
                 self.implemented_domain.intersect(set),
-                self.c_code_mapper,
-                [subd.intersect(set) for subd in self.subdomains])
+                self.c_code_mapper)
 
-    def get_the_one_domain(self):
-        assert len(self.subdomains) == 1
-        return self.implemented_domain
+    def fix(self, iname, aff):
+        dt, pos = aff.get_space().get_var_dict()[iname]
+        iname_plus_lb_aff = aff.add_coefficient(
+                dt, pos, -1)
 
+        from loopy.symbolic import pw_aff_to_expr
+        cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff)
+        expr = pw_aff_to_expr(aff)
 
+        return CodeGenerationState(
+                self.implemented_domain.add_constraint(cns),
+                self.c_code_mapper.copy_and_assign(iname, expr))
 
+# }}}
 
-def generate_code(kernel):
-    from loopy.codegen.prefetch import preprocess_prefetch
-    kernel = preprocess_prefetch(kernel)
 
+# {{{ main code generation entrypoint
 
+def generate_code(kernel):
     from cgen import (FunctionBody, FunctionDeclaration,
             POD, Value, ArrayOf, Module, Block,
-            Define, Line, Const, LiteralLines, Initializer)
+            Line, Const, LiteralLines, Initializer)
 
     from cgen.opencl import (CLKernel, CLGlobal,
             CLRequiredWorkGroupSize, CLLocal, CLImage, CLConstant)
@@ -184,7 +166,7 @@ def generate_code(kernel):
         if isinstance(arg, ArrayArg):
             arg_decl = restrict_ptr_if_not_nvidia(
                     POD(arg.dtype, arg.name))
-            if arg_decl.name in kernel.input_vectors():
+            if arg_decl.name not in kernel.get_written_variables():
                 if arg.constant_mem:
                     arg_decl = CLConstant(Const(arg_decl))
                 else:
@@ -238,51 +220,38 @@ def generate_code(kernel):
                 """),
             Line()])
 
-    # {{{ symbolic names for group and local indices
+    # {{{ build lmem array declarators for prefetches
 
-    from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX
-    for what_cls, func in [
-            (TAG_GROUP_IDX, "get_group_id"),
-            (TAG_LOCAL_IDX, "get_local_id")]:
-        for iname in kernel.ordered_inames_by_tag_type(what_cls):
-            lower, upper, equality = kernel.get_bounds(iname, (iname,), allow_parameters=True)
-            assert not equality
-            mod.append(Define(iname, "(%s + (int) %s(%d)) /* [%s, %s) */"
-                % (ccm(lower),
-                    func,
-                    kernel.iname_to_tag[iname].axis,
-                    ccm(lower),
-                    ccm(upper))))
+    for tv in kernel.temporary_variables.itervalues():
+        temp_var_decl = POD(tv.dtype, tv.name)
 
-    mod.append(Line())
+        try:
+            storage_shape = tv.storage_shape
+        except AttributeError:
+            storage_shape = tv.shape
 
-    # }}}
+        from loopy.symbolic import pw_aff_to_expr
+        for l in storage_shape:
+            temp_var_decl = ArrayOf(temp_var_decl, int(pw_aff_to_expr(l)))
 
-    # {{{ build lmem array declarators for prefetches
+        if tv.is_local:
+            temp_var_decl = CLLocal(temp_var_decl)
 
-    for pf in kernel.prefetch.itervalues():
-        smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name)
-        for l in pf.dim_storage_lengths:
-            smem_pf_array = ArrayOf(smem_pf_array, l)
-        body.append(CLLocal(smem_pf_array))
+        body.append(temp_var_decl)
 
     # }}}
 
     from loopy.codegen.dispatch import build_loop_nest
     gen_code = build_loop_nest(kernel, 0,
-            ExecutionDomain(
-                kernel.assumptions, c_code_mapper=ccm))
+            CodeGenerationState(kernel.assumptions, c_code_mapper=ccm))
     body.extend([Line(), gen_code.ast])
 
-    #print "# conditionals: %d" % gen_code.num_conditionals
-    from loopy.kernel import TAG_LOCAL_IDX
+    from loopy.symbolic import pw_aff_to_expr
     mod.append(
         FunctionBody(
             CLRequiredWorkGroupSize(
-                    tuple(dim_length
-                        for dim_length in kernel.tag_type_lengths(
-                            TAG_LOCAL_IDX,
-                            allow_parameters=False)),
+                    tuple(pw_aff_to_expr(sz) for sz in kernel.fix_grid_sizes()[1]),
                 CLKernel(FunctionDeclaration(
                     Value("void", kernel.name), args))),
             body))
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 5d89b10485551d1aa853d6140433a8abab56be1c..0b97179b5aed6c0c3bc82f6ecac89da181d697a2 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -20,10 +20,7 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters):
     if not allow_parameters:
         proj_type.append(dim_type.param)
 
-    set = (set
-            .project_out_except(admissible_inames, proj_type)
-            .compute_divs()
-            .remove_divs_of_dim_type(dim_type.set))
+    set = set.eliminate_except(admissible_inames, proj_type)
 
     basic_sets = set.get_basic_sets()
     if len(basic_sets) > 1:
@@ -34,6 +31,10 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters):
 
     bset, = basic_sets
 
+    # FIXME: hackety hack--elimination leaves the set in an
+    # invalid ('non-final'?) state
+    bset = bset.intersect(isl.BasicSet.universe(bset.get_space()))
+
     # FIXME perhaps use some form of hull here if there's more than one
     # basic set?
 
@@ -65,7 +66,7 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters):
 
 def solve_constraint_for_bound(cns, iname):
     from warnings import warn
-    warn("deprecated")
+    warn("solve_constraint_for_bound deprecated?")
 
     from loopy.symbolic import constraint_to_expr
     rhs, iname_coeff = constraint_to_expr(cns, except_name=iname)
@@ -147,32 +148,21 @@ def filter_necessary_constraints(implemented_domain, constraints):
     return [cns
             for cns in constraints
             if not implemented_domain.is_subset(
-                isl.Set.universe(space)
-                .add_constraint(cns))]
+                isl.Set.universe(space).add_constraint(cns))]
 
 def generate_bounds_checks(domain, check_vars, implemented_domain):
-    projected_domain_bset, = (domain
-            .project_out_except(check_vars, [dim_type.set])
-            .compute_divs()
-            .remove_divs_of_dim_type(dim_type.set)
+    domain_bset, = (domain
+            .eliminate_except(check_vars, [dim_type.set])
             .coalesce()
             .get_basic_sets())
 
-    space = domain.get_space()
-
-    cast_constraints = []
-
-    from loopy.isl import cast_constraint_to_space
-
-    def examine_constraint(cns):
-        assert not cns.is_div_constraint()
-        cast_constraints.append(
-                cast_constraint_to_space(cns, space))
-
-    projected_domain_bset.foreach_constraint(examine_constraint)
+    # FIXME: hackety hack--elimination leaves the set in an
+    # invalid ('non-final'?) state
+    domain_bset = domain_bset.intersect(
+            isl.BasicSet.universe(domain_bset.get_space()))
 
     return filter_necessary_constraints(
-            implemented_domain, cast_constraints)
+            implemented_domain, domain_bset.get_constraints())
 
 def generate_bounds_checks_code(ccm, domain, check_vars, implemented_domain):
     return [constraint_to_code(ccm, cns) for cns in
@@ -258,53 +248,35 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt):
 
 # {{{ on which variables may a conditional depend?
 
-def get_defined_vars(kernel, sched_index, allow_ilp, exclude_tag_classes=()):
+def get_defined_inames(kernel, sched_index, allow_ilp, exclude_tag_classes=()):
     """
     :param exclude_tags: a tuple of tag classes to exclude
     """
+    from loopy.schedule import EnterLoop, LeaveLoop
 
-    if not allow_ilp:
-        from loopy.kernel import TAG_ILP
-        exclude_tag_classes = exclude_tag_classes + (TAG_ILP,)
-
-    from loopy.schedule import ScheduledLoop
-    defined_vars = set(
-            sched_item.iname
-            for sched_item in kernel.schedule[:sched_index]
-            if isinstance(sched_item, ScheduledLoop))
+    result = set()
 
-    defined_vars = set(
-            iname
-            for iname in defined_vars
-            if not isinstance(
-                kernel.iname_to_tag.get(iname),
-                exclude_tag_classes))
+    for i, sched_item in enumerate(kernel.schedule):
+        if i >= sched_index:
+            break
+        if isinstance(sched_item, EnterLoop):
+            result.add(sched_item.iname)
+        elif isinstance(sched_item, LeaveLoop):
+            result.remove(sched_item.iname)
 
-    return defined_vars
+    from loopy.kernel import TAG_ILP, ParallelTagWithAxis
+    for iname in kernel.all_inames():
+        tag = kernel.iname_to_tag.get(iname)
 
-def get_valid_check_vars(kernel, sched_index, allow_ilp, exclude_tag_classes=()):
-    """
-    :param exclude_tags: a tuple of tag classes to exclude
-    """
-
-    allowed_vars = get_defined_vars(kernel, sched_index, allow_ilp, exclude_tag_classes)
-
-    from pytools import any
-    from loopy.prefetch import LocalMemoryPrefetch
-    all_lmem_prefetches_scheduled = not any(
-            isinstance(sched_item, LocalMemoryPrefetch)
-            for sched_item in kernel.schedule[sched_index:])
-
-    if not all_lmem_prefetches_scheduled:
-        # Lmem prefetches use barriers. Barriers are only allowed if *all* work
-        # items in a work group hit them. Therefore, as long as not all lmem
-        # prefetches are scheduled, we may not check work item indices
-        # (and thereby conceivably mask out some work items).
+        if isinstance(tag, exclude_tag_classes):
+            continue
 
-        from loopy.kernel import TAG_LOCAL_IDX
-        allowed_vars -= set(kernel.inames_by_tag_type(TAG_LOCAL_IDX))
+        if isinstance(tag, ParallelTagWithAxis):
+            result.add(iname)
+        elif isinstance(tag, TAG_ILP) and allow_ilp:
+            result.add(iname)
 
-    return allowed_vars
+    return result
 
 # }}}
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
index e1f688a051fd3c1456813f05965e6c84fe34191f..ad6d4e523b8dbc976be88b0f3f5f846143a4313a 100644
--- a/loopy/codegen/dispatch.py
+++ b/loopy/codegen/dispatch.py
@@ -1,12 +1,74 @@
 """Loop nest build top-level dispatch."""
 from __future__ import division
 
-from loopy.codegen import ExecutionDomain, gen_code_block
+from loopy.codegen import CodeGenerationState, gen_code_block
 
 
 
 
-def build_loop_nest(kernel, sched_index, exec_domain, no_conditional_check=False):
+def build_loop_nest(kernel, sched_index, codegen_state):
+    assert isinstance(codegen_state, CodeGenerationState)
+
+    from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier,
+            gather_schedule_subloop)
+    from cgen import Statement as S
+
+    result = []
+
+    while sched_index < len(kernel.schedule):
+        sched_item = kernel.schedule[sched_index]
+
+        if isinstance(sched_item, LeaveLoop):
+            break
+
+        elif isinstance(sched_item, EnterLoop):
+            tag = kernel.iname_to_tag[sched_item.iname]
+
+            from loopy.codegen.loop import (
+                    generate_unroll_or_ilp_code,
+                    generate_parallel_loop_dim_code,
+                    generate_sequential_loop_dim_code)
+
+            from loopy.kernel import (TAG_UNROLL, TAG_ILP,
+                    ParallelTagWithAxis)
+            if isinstance(tag, (TAG_UNROLL, TAG_ILP)):
+                func = generate_unroll_or_ilp_code
+            elif isinstance(tag, ParallelTagWithAxis):
+                func = generate_parallel_loop_dim_code
+            else:
+                func = generate_sequential_loop_dim_code
+
+            result.append(func(kernel, sched_index, codegen_state))
+
+            _, sched_index = gather_schedule_subloop(
+                    kernel.schedule, sched_index)
+
+        elif isinstance(sched_item, Barrier):
+            result.append(S("barrier(CLK_LOCAL_MEM_FENCE)"))
+
+            sched_index += 1
+
+        elif isinstance(sched_item, RunInstruction):
+            insn = kernel.id_to_insn[sched_item.insn_id]
+
+            from loopy.codegen.instruction import generate_instruction_code
+
+            result.append(
+                    generate_instruction_code(kernel, insn, codegen_state))
+
+            sched_index += 1
+
+        else:
+            raise RuntimeError("unexpected schedule item type: %s"
+                    % type(sched_item))
+
+
+    return gen_code_block(result)
+
+
+
+
+def build_loop_nest_old(kernel, sched_index, codegen_state, no_conditional_check=False):
     assert isinstance(exec_domain, ExecutionDomain)
     ccm = exec_domain.c_code_mapper
 
@@ -84,16 +146,16 @@ def build_loop_nest(kernel, sched_index, exec_domain, no_conditional_check=False
     from loopy.codegen.bounds import wrap_in_bounds_checks
 
     if isinstance(sched_item, ScheduledLoop):
-        from loopy.codegen.loop_dim import (
+        from loopy.codegen.loop import (
                 generate_unroll_or_ilp_code,
                 generate_parallel_loop_dim_code,
                 generate_sequential_loop_dim_code)
 
-        from loopy.kernel import (BaseUnrollTag, TAG_ILP,
+        from loopy.kernel import (TAG_UNROLL, TAG_ILP,
                 ParallelTagWithAxis)
         tag = kernel.iname_to_tag.get(sched_item.iname)
 
-        if isinstance(tag, (BaseUnrollTag, TAG_ILP)):
+        if isinstance(tag, (TAG_UNROLL, TAG_ILP)):
             func = generate_unroll_or_ilp_code
         elif isinstance(tag, ParallelTagWithAxis):
             func = generate_parallel_loop_dim_code
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
new file mode 100644
index 0000000000000000000000000000000000000000..5651a24d5f5df198de3aa66f7ce0be332b74f6a2
--- /dev/null
+++ b/loopy/codegen/instruction.py
@@ -0,0 +1,120 @@
+"""Code generation for Instruction objects."""
+from __future__ import division
+
+from pytools import Record
+import islpy as isl
+
+
+
+
+# {{{ ILP instance
+
+class ILPInstance(Record):
+    """
+    :ivar ilp_key: a frozenset of tuples (iname, assignment)
+    """
+    __slots__ = ["implemented_domain", "assignments", "ilp_key"]
+
+    def __init__(self, implemented_domain, assignments, ilp_key):
+        Record.__init__(self,
+                implemented_domain=implemented_domain,
+                assignments=assignments,
+                ilp_key=ilp_key)
+
+    def fix(self, iname, aff):
+        dt, pos = aff.get_space().get_var_dict()[iname]
+        iname_plus_lb_aff = aff.add_coefficient(
+                dt, pos, -1)
+
+        from loopy.symbolic import pw_aff_to_expr
+        cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff)
+        expr = pw_aff_to_expr(aff)
+
+        return ILPInstance(
+                implemented_domain=self.implemented_domain.add_constraint(cns),
+                c_code_mapper=self.c_code_mapper.copy_and_assign(iname, expr),
+                ilp_key=self.ilp_key | frozenset([(iname, expr)]))
+
+# }}}
+
+
+
+
+def generate_ilp_instances(kernel, insn, codegen_state):
+    assignments = {}
+    impl_domain = codegen_state.implemented_domain
+
+    from loopy.kernel import (TAG_ILP,
+            TAG_LOCAL_IDX, TAG_GROUP_IDX)
+
+    from pymbolic import var
+
+    # {{{ pass 1: assign all hw-parallel dimensions
+
+    global_size, local_size = kernel.get_grid_sizes()
+
+    for iname in insn.all_inames():
+        tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(tag, TAG_LOCAL_IDX):
+            hw_axis_expr = var("(int) get_local_id")(tag.axis)
+            hw_axis_size = local_size[tag.axis]
+
+        elif isinstance(tag, TAG_GROUP_IDX):
+            hw_axis_expr = var("(int) get_group_id")(tag.axis)
+            hw_axis_size = global_size[tag.axis]
+
+        else:
+            continue
+
+        bounds = kernel.get_iname_bounds(iname)
+
+        from loopy.isl import make_slab
+        impl_domain = impl_domain.intersect(
+                make_slab(impl_domain.get_space(), iname,
+                    bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size))
+
+        from loopy.symbolic import pw_aff_to_expr
+        assignments[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff + hw_axis_expr)
+
+    # }}}
+
+    result = [ILPInstance(impl_domain, assignments, frozenset())]
+
+    # {{{ pass 2: treat all ILP dimensions
+
+    for iname in insn.all_inames():
+        tag = kernel.iname_to_tag.get(iname)
+
+        if not isinstance(tag, TAG_ILP):
+            continue
+
+        from warnings import warn
+        warn("implement ILP instance generation")
+
+    # }}}
+
+    return result
+
+
+
+
+def generate_instruction_code(kernel, insn, codegen_state):
+    result = []
+
+    for ilpi in generate_ilp_instances(kernel, insn, codegen_state):
+        ccm = codegen_state.c_code_mapper.copy_and_assign_many(ilpi.assignments)
+
+        # FIXME we should probably share some checks across ILP instances
+
+        from cgen import Assign
+        insn_code = Assign(ccm(insn.assignee), ccm(insn.expression))
+        from loopy.codegen.bounds import wrap_in_bounds_checks
+        insn_code = wrap_in_bounds_checks(
+                ccm, kernel.domain, insn.all_inames(), ilpi.implemented_domain,
+                insn_code)
+
+        result.append(insn_code)
+
+    from loopy.codegen import gen_code_block
+    return gen_code_block(result)
diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop.py
similarity index 72%
rename from loopy/codegen/loop_dim.py
rename to loopy/codegen/loop.py
index 888682e5aa76169879c9bc0e1ea1d27b089eac27..0260ca858c5108f9e53ca07315166441b191f251 100644
--- a/loopy/codegen/loop_dim.py
+++ b/loopy/codegen/loop.py
@@ -1,7 +1,7 @@
 from __future__ import division
 
 import numpy as np
-from loopy.codegen import ExecutionDomain, gen_code_block
+from loopy.codegen import CodeGenerationState, gen_code_block
 from pytools import Record
 import islpy as isl
 from islpy import dim_type
@@ -13,11 +13,11 @@ from loopy.codegen.dispatch import build_loop_nest
 
 def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
     from loopy.isl import cast_constraint_to_space
-    from loopy.codegen.bounds import get_bounds_constraints, get_defined_vars
+    from loopy.codegen.bounds import get_bounds_constraints, get_defined_inames
     lower_constraints_orig, upper_constraints_orig, equality_constraints_orig = \
             get_bounds_constraints(kernel.domain, iname,
                     frozenset([iname])
-                    | frozenset(get_defined_vars(kernel, sched_index+1, allow_ilp=False)),
+                    | frozenset(get_defined_inames(kernel, sched_index+1, allow_ilp=False)),
                     allow_parameters=True)
 
     assert not equality_constraints_orig
@@ -25,9 +25,6 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
     lb_cns_orig = pick_simple_constraint(lower_constraints_orig, iname)
     ub_cns_orig = pick_simple_constraint(upper_constraints_orig, iname)
 
-    lb_cns_orig = cast_constraint_to_space(lb_cns_orig, kernel.space)
-    ub_cns_orig = cast_constraint_to_space(ub_cns_orig, kernel.space)
-
     return lb_cns_orig, ub_cns_orig
 
 # {{{ conditional-minimizing slab decomposition
@@ -47,6 +44,8 @@ def get_slab_decomposition(kernel, sched_index, exec_domain):
 
     # {{{ build slabs
 
+    iname_tp, iname_idx = kernel.iname_to_dim[iname]
+
     slabs = []
     if lower_incr:
         slabs.append(("initial", isl.Set.universe(kernel.space)
@@ -55,14 +54,14 @@ def get_slab_decomposition(kernel, sched_index, exec_domain):
                 .add_constraint(
                     negate_constraint(
                         block_shift_constraint(
-                            lb_cns_orig, iname, -lower_incr)))))
+                            lb_cns_orig, iname_tp, iname_idx, -lower_incr)))))
 
-    slabs.append(("bulk", 
+    slabs.append(("bulk",
         (isl.Set.universe(kernel.space)
             .add_constraint(
-                block_shift_constraint(lb_cns_orig, iname, -lower_incr))
+                block_shift_constraint(lb_cns_orig, iname_tp, iname_idx, -lower_incr))
             .add_constraint(
-                block_shift_constraint(ub_cns_orig, iname, -upper_incr)))))
+                block_shift_constraint(ub_cns_orig, iname_tp, iname_idx, -upper_incr)))))
 
     if upper_incr:
         slabs.append(("final", isl.Set.universe(kernel.space)
@@ -71,7 +70,7 @@ def get_slab_decomposition(kernel, sched_index, exec_domain):
                 .add_constraint(
                     negate_constraint(
                         block_shift_constraint(
-                            ub_cns_orig, iname, -upper_incr)))))
+                            ub_cns_orig, iname_tp, iname_idx, -upper_incr)))))
 
     # }}}
 
@@ -81,19 +80,19 @@ def get_slab_decomposition(kernel, sched_index, exec_domain):
 
 # {{{ unrolled/ILP loops
 
-def generate_unroll_or_ilp_code(kernel, sched_index, exec_domain):
+def generate_unroll_or_ilp_code(kernel, sched_index, codegen_state):
     from loopy.isl import block_shift_constraint
     from loopy.codegen.bounds import solve_constraint_for_bound
     from cgen import (POD, Assign, Line, Statement as S, Initializer, Const)
 
-    ccm = exec_domain.c_code_mapper
+    ccm = codegen_state.c_code_mapper
     space = kernel.space
     iname = kernel.schedule[sched_index].iname
     tag = kernel.iname_to_tag.get(iname)
 
     lower_cns, upper_cns = get_simple_loop_bounds(kernel, sched_index, iname,
-            exec_domain.implemented_domain)
+            codegen_state.implemented_domain)
 
     lower_kind, lower_bound = solve_constraint_for_bound(lower_cns, iname)
     upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname)
 
@@ -101,13 +100,17 @@ def generate_unroll_or_ilp_code(kernel, sched_index, exec_domain):
     assert lower_kind == ">="
     assert upper_kind == "<"
 
-    proj_domain = (kernel.domain
-            .project_out_except([iname], [dim_type.set])
-            .project_out_except([], [dim_type.param])
-            .remove_divs())
-    assert proj_domain.is_bounded()
-    success, length = proj_domain.count()
-    assert success == 0
+    bounds = kernel.get_iname_bounds(iname)
+    from loopy.isl import static_max_of_pw_aff
+    from loopy.symbolic import pw_aff_to_expr
+
+    length = int(pw_aff_to_expr(static_max_of_pw_aff(bounds.length)))
+    lower_bound_pw_aff_pieces = bounds.lower_bound_pw_aff.coalesce().get_pieces()
+
+    if len(lower_bound_pw_aff_pieces) > 1:
+        raise NotImplementedError("lower bound for ILP/unroll needed conditional")
+
+    (_, lower_bound_aff), = lower_bound_pw_aff_pieces
 
     def generate_idx_eq_slabs():
         for i in xrange(length):
@@ -116,45 +119,37 @@ def generate_unroll_or_ilp_code(kernel, sched_index, exec_domain):
                     block_shift_constraint(
                         lower_cns, iname, -i, as_equality=True)))
 
-    from loopy.kernel import BaseUnrollTag, TAG_ILP, TAG_UNROLL_STATIC, TAG_UNROLL_INCR
-    if isinstance(tag, BaseUnrollTag):
+    from loopy.kernel import TAG_ILP, TAG_UNROLL
+    if isinstance(tag, TAG_UNROLL):
         result = [POD(np.int32, iname), Line()]
 
-        for i, slab in generate_idx_eq_slabs():
-            new_exec_domain = exec_domain.intersect(slab)
-            inner = build_loop_nest(kernel, sched_index+1, new_exec_domain)
-
-            if isinstance(tag, TAG_UNROLL_STATIC):
-                result.extend([
-                    Assign(iname, ccm(lower_bound+i)),
-                    Line(), inner])
-            elif isinstance(tag, TAG_UNROLL_INCR):
-                result.append(S("++%s" % iname))
+        for i in range(length):
+            idx_aff = lower_bound_aff + i
+            new_codegen_state = codegen_state.fix(iname, idx_aff)
+            result.append(
+                    build_loop_nest(kernel, sched_index+1, new_codegen_state))
 
         return gen_code_block(result)
 
     elif isinstance(tag, TAG_ILP):
-        new_subdomains = []
-        for subd in exec_domain.subdomains:
-            for i, single_slab in generate_idx_eq_slabs():
-                from loopy.codegen import ExecutionSubdomain
-                new_subdomains.append(
-                        ExecutionSubdomain(
-                            subd.implemented_domain.intersect(single_slab),
-                            subd.c_code_mapper.copy_and_assign(
-                                iname, lower_bound+i)))
+        new_ilp_instances = []
+        for ilpi in codegen_state.ilp_instances:
+            for i in range(length):
+                idx_aff = lower_bound_aff + i
+                new_ilp_instances.append(ilpi.fix(iname, idx_aff))
 
         overall_slab = (isl.Set.universe(kernel.space)
                 .add_constraint(lower_cns)
                 .add_constraint(upper_cns))
 
         return build_loop_nest(kernel, sched_index+1,
-                ExecutionDomain(
-                    exec_domain.implemented_domain.intersect(overall_slab),
-                    exec_domain.c_code_mapper,
-                    new_subdomains))
+                CodeGenerationState(
+                    codegen_state.implemented_domain.intersect(overall_slab),
+                    codegen_state.c_code_mapper,
+                    new_ilp_instances))
+
     else:
-        assert False, "not supposed to get here"
+        raise RuntimeError("unexpected tag")
 
 # }}}
 
diff --git a/loopy/isl.py b/loopy/isl.py
index 6af301a9f445fa685e602cd346e00b7e4e6a2b97..fac42ecf60793aa8205005a7c533c4c815d43789 100644
--- a/loopy/isl.py
+++ b/loopy/isl.py
@@ -35,6 +35,7 @@ def block_shift_constraint(cns, type, pos, multiple, as_equality=None):
 
     cns = cns.set_constant(cns.get_constant()
             + cns.get_coefficient(type, pos)*multiple)
+
     return cns
 
 
@@ -85,17 +86,37 @@ def make_index_map(set, index_expr):
 
 
 
+def pw_aff_to_aff(pw_aff):
+    assert isinstance(pw_aff, isl.PwAff)
+    pieces = pw_aff.get_pieces()
+
+    if len(pieces) != 1:
+        raise NotImplementedError("only single-piece PwAff instances are supported here")
+
+    return pieces[0][1]
+
+
+
+
 def make_slab(space, iname, start, stop):
-    from loopy.symbolic import ineq_constraint_from_expr
-    from pymbolic import var
-    var_iname = var(iname)
+    if isinstance(start, isl.PwAff): start = pw_aff_to_aff(start)
+    if isinstance(stop, isl.PwAff): stop = pw_aff_to_aff(stop)
+
+    zero = isl.Aff.zero_on_domain(space)
+
+    if isinstance(start, int): start = zero + start
+    if isinstance(stop, int): stop = zero + stop
+
+    iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
+    iname_aff = zero.add_coefficient(iname_dt, iname_idx, 1)
+
     return (isl.Set.universe(space)
             # start <= inner
-            .add_constraint(ineq_constraint_from_expr(
-                space, var_iname -start))
+            .add_constraint(isl.Constraint.inequality_from_aff(
+                iname_aff - start))
             # inner < stop
-            .add_constraint(ineq_constraint_from_expr(
-                space, stop-1 - var_iname)))
+            .add_constraint(isl.Constraint.inequality_from_aff(
+                stop-1 - iname_aff)))
 
diff --git a/loopy/kernel.py b/loopy/kernel.py
index c2527cf07b71b86bbd12b4412075934ff957dc4b..1dcfd43cd931cb9ae267ef01187f4ed91e14e2e7 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -70,17 +70,10 @@ class TAG_ILP(ParallelTag):
     def __str__(self):
         return "ilp"
 
-class BaseUnrollTag(IndexTag):
-    pass
-
-class TAG_UNROLL_STATIC(BaseUnrollTag):
+class TAG_UNROLL(IndexTag):
     def __str__(self):
         return "unr"
 
-class TAG_UNROLL_INCR(BaseUnrollTag):
-    def __str__(self):
-        return "unri"
-
 def parse_tag(tag):
     if tag is None:
         return tag
@@ -91,10 +84,8 @@ def parse_tag(tag):
     if not isinstance(tag, str):
         raise ValueError("cannot parse tag: %s" % tag)
 
-    if tag in ["unrs", "unr"]:
-        return TAG_UNROLL_STATIC()
-    elif tag == "unri":
-        return TAG_UNROLL_INCR()
+    if tag in ["unr"]:
+        return TAG_UNROLL()
     elif tag == "ilp":
         return TAG_ILP()
     elif tag.startswith("g."):
@@ -188,15 +179,19 @@ class TemporaryVariable(Record):
     :ivar name:
     :ivar dtype:
     :ivar shape:
+    :ivar storage_shape:
     :ivar base_indices:
     :ivar is_local:
     """
 
-    def __init__(self, name, dtype, shape, is_local, base_indices=None):
+    def __init__(self, name, dtype, shape, is_local, base_indices=None,
+            storage_shape=None):
         if base_indices is None:
             base_indices = (0,) * len(shape)
 
-        Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local)
+        Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local,
+                base_indices=base_indices,
+                storage_shape=storage_shape)
 
     @property
     def nbytes(self):
@@ -208,8 +203,6 @@ class TemporaryVariable(Record):
 # {{{ instruction
 
 class Instruction(Record):
-    #:ivar kernel: handle to the :class:`LoopKernel` of which this instruction
-    #is a part. (not yet)
     """
     :ivar id: An (otherwise meaningless) identifier that is unique within
         a :class:`LoopKernel`.
@@ -392,7 +385,7 @@ class LoopKernel(Record):
         upper_incr) tuples that will be separated out in the execution to generate
         'bulk' slabs with fewer conditionals.
     :ivar temporary_variables:
-    :ivar name_to_dim: A lookup table from inames to ISL-style
+    :ivar iname_to_dim: A lookup table from inames to ISL-style
         (dim_type, index) tuples
     :ivar iname_to_tag:
     """
@@ -403,7 +396,7 @@ class LoopKernel(Record):
             iname_slab_increments={},
             temporary_variables={},
             workgroup_size=None,
-            name_to_dim=None,
+            iname_to_dim=None,
             iname_to_tag={}):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
@@ -434,10 +427,10 @@ class LoopKernel(Record):
 
         if isinstance(domain, str):
             ctx = isl.Context()
-            domain = isl.Set.read_from_str(ctx, domain, nparam=-1)
+            domain = isl.Set.read_from_str(ctx, domain)
 
-        if name_to_dim is None:
-            name_to_dim = domain.get_space().get_var_dict()
+        if iname_to_dim is None:
+            iname_to_dim = domain.get_space().get_var_dict()
 
         insns = []
         for insn in instructions:
@@ -484,8 +477,7 @@ class LoopKernel(Record):
                         for i in range(s.size(dim_type.param))),
                     ",".join(s.get_name(dim_type.set, i)
                         for i in range(s.size(dim_type.set))),
-                    assumptions),
-                nparam=-1)
+                    assumptions))
 
         Record.__init__(self,
                 device=device, domain=domain, instructions=insns,
@@ -497,7 +489,7 @@ class LoopKernel(Record):
                 iname_slab_increments=iname_slab_increments,
                 temporary_variables=temporary_variables,
                 workgroup_size=workgroup_size,
-                name_to_dim=name_to_dim,
+                iname_to_dim=iname_to_dim,
                 iname_to_tag=iname_to_tag)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
@@ -511,11 +503,17 @@ class LoopKernel(Record):
             if id_str not in used_ids:
                 return id_str
 
+    @memoize_method
+    def get_written_variables(self):
+        return set(
+            insn.get_assignee_var_name()
+            for insn in self.instructions)
+
     def make_unique_var_name(self, based_on="var", extra_used_vars=set()):
         used_vars = (
                 set(self.temporary_variables.iterkeys())
                 | set(arg.name for arg in self.args)
-                | set(self.name_to_dim.keys())
+                | set(self.iname_to_dim.keys())
                 | extra_used_vars)
 
         from loopy.tools import generate_unique_possibilities
@@ -523,16 +521,11 @@ class LoopKernel(Record):
             if var_name not in used_vars:
                 return var_name
 
-    @property
-    @memoize_method
-    def arg_dict(self):
-        return dict((arg.name, arg) for arg in self.args)
-
     @property
     @memoize_method
     def dim_to_name(self):
         from pytools import reverse_dict
-        return reverse_dict(self.name_to_dim)
+        return reverse_dict(self.iname_to_dim)
 
     @property
     @memoize_method
@@ -571,8 +564,12 @@ class LoopKernel(Record):
 
     @memoize_method
     def get_iname_bounds(self, iname):
-        lower_bound_pw_aff = self.domain.dim_min(self.name_to_dim[iname][1])
-        upper_bound_pw_aff = self.domain.dim_max(self.name_to_dim[iname][1])
+        lower_bound_pw_aff = (self.domain
+                .dim_min(self.iname_to_dim[iname][1])
+                .coalesce())
+        upper_bound_pw_aff = (self.domain
+                .dim_max(self.iname_to_dim[iname][1])
+                .coalesce())
 
         class BoundsRecord(Record):
             pass
@@ -584,12 +581,12 @@ class LoopKernel(Record):
                 upper_bound_pw_aff=upper_bound_pw_aff,
                 size=size)
 
-    def fix_grid_sizes(kernel):
+    def get_grid_sizes(self):
         all_inames_by_insns = set()
-        for insn in kernel.instructions:
+        for insn in self.instructions:
             all_inames_by_insns |= insn.all_inames()
 
-        if all_inames_by_insns != kernel.all_inames():
+        if all_inames_by_insns != self.all_inames():
             raise RuntimeError("inames collected from instructions "
                     "do not match domain inames")
 
@@ -600,8 +597,8 @@ class LoopKernel(Record):
                 TAG_GROUP_IDX, TAG_LOCAL_IDX,
                 TAG_AUTO_LOCAL_IDX)
 
-        for iname in kernel.all_inames():
-            tag = kernel.iname_to_tag.get(iname)
+        for iname in self.all_inames():
+            tag = self.iname_to_tag.get(iname)
 
             if isinstance(tag, TAG_GROUP_IDX):
                 tgt_dict = global_sizes
@@ -618,7 +615,7 @@ class LoopKernel(Record):
             if tgt_dict is None:
                 continue
 
-            bounds = kernel.get_iname_bounds(iname)
+            bounds = self.get_iname_bounds(iname)
 
             size = bounds.size
 
@@ -633,7 +630,7 @@ class LoopKernel(Record):
             else:
                 tgt_dict[tag.axis] = size
 
-        max_dims = kernel.device.max_work_item_dimensions
+        max_dims = self.device.max_work_item_dimensions
 
         def to_dim_tuple(size_dict, which):
             size_list = []
@@ -642,6 +639,7 @@ class LoopKernel(Record):
                 cur_axis = sorted_axes.pop(0)
                 while cur_axis > len(size_list):
                     from loopy import LoopyAdvisory
+                    from warnings import warn
                     warn("%s axis %d unassigned--assuming length 1"
                             % len(size_list), LoopyAdvisory)
                     size_list.append(1)
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 8ec6393dc61599925c6cf3709b482d2f19c07919..7371b5ae8fb9f53c67fda2ec7d46d99577900042 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -1,6 +1,8 @@
 from __future__ import division
 
 from pytools import Record
+import pyopencl as cl
+import pyopencl.characterize as cl_char
 
 
 
@@ -43,9 +45,68 @@ def check_double_use_of_hw_dimensions(kernel):
 
 
 def adjust_local_temp_var_storage(kernel):
-    from warnings import warn
-    warn("adjust_local_temp_var_storage is unimplemented")
-    return kernel
+    new_temp_vars = {}
+
+    lmem_size = cl_char.usable_local_mem_size(kernel.device)
+    for temp_var in kernel.temporary_variables.itervalues():
+        other_loctemp_nbytes = [tv.nbytes for tv in kernel.temporary_variables.itervalues()
+                if tv.is_local and tv.name != temp_var.name]
+
+        storage_shape = temp_var.storage_shape
+        if storage_shape is None:
+            storage_shape = temp_var.shape
+
+        # sizes of all dims except the last one, which we may change
+        # below to avoid bank conflicts
+        from pytools import product
+        other_dim_sizes = (tv.dtype.itemsize
+                * product(storage_shape[:-1]))
+
+        if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL:
+            # FIXME: could try to avoid cache associativity disasters
+            new_storage_shape = storage_shape
+
+        elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL:
+            min_mult = cl_char.local_memory_bank_count(kernel.device)
+            good_incr = None
+            new_storage_shape = storage_shape
+            min_why_not = None
+
+            for increment in range(storage_shape[-1]//2):
+
+                test_storage_shape = storage_shape[:]
+                test_storage_shape[-1] = test_storage_shape[-1] + increment
+                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
+                        kernel.device, temp_var.dtype.itemsize,
+                        temp_var.shape, test_storage_shape)
+
+                # will choose smallest increment 'automatically'
+                if new_mult < min_mult:
+                    new_lmem_use = (other_loctemp_nbytes
+                            + temp_var.dtype.itemsize*product(test_storage_shape))
+                    if new_lmem_use < lmem_size:
+                        new_storage_shape = test_storage_shape
+                        min_mult = new_mult
+                        min_why_not = why_not
+                        good_incr = increment
+
+            if min_mult != 1:
+                from warnings import warn
+                from loopy import LoopyAdvisory
+                warn("could not find a conflict-free mem layout "
+                        "for local variable '%s' "
+                        "(currently: %dx conflict, increment: %d, reason: %s)"
+                        % (temp_var.name, min_mult, good_incr, min_why_not),
+                        LoopyAdvisory)
+        else:
+            from warnings import warn
+            warn("unknown type of local memory")
+
+            new_storage_shape = storage_shape
+
+        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+
+    return kernel.copy(temporary_variables=new_temp_vars)
 
 
 
@@ -133,7 +194,6 @@ def guess_good_iname_for_axis_0(kernel, insn):
 
     if isinstance(insn.assignee, Subscript):
         ary_acc_exprs.append(insn.assignee)
 
-    print ary_acc_exprs
 
     # }}}
 
@@ -533,6 +593,14 @@ def insert_barriers(kernel, schedule, level=0):
 
 
 
+def insert_parallel_dim_check_points(kernel, schedule):
+    from warnings import warn
+    warn("insert_parallel_dim_check_points is unimplemented")
+    return kernel
+
+
+
+
 def generate_loop_schedules(kernel):
     from loopy import realize_reduction
     kernel = realize_reduction(kernel)
@@ -561,14 +629,9 @@ def generate_loop_schedules(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
         assert not owed_barriers
 
-        print gen_sched
-
-        if False:
-            schedule = insert_parallel_dim_check_points(schedule=gen_sched)
-        yield kernel.copy(schedule=gen_sched)
-
+        schedule = insert_parallel_dim_check_points(kernel, gen_sched)
 
-        1/0
+        yield kernel.copy(schedule=gen_sched)
 
 
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index cde52d38a047499635d8bb9e6a37e1cf43985feb..1f15e3fc065fc7a295e2dc56df4d7eb685207268 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -252,8 +252,7 @@ class ArrayAccessFinder(CombineMapper):
 # {{{ C code mapper
 
 class LoopyCCodeMapper(CCodeMapper):
-    def __init__(self, kernel, no_prefetch=False, cse_name_list=[],
-            var_subst_map={}):
+    def __init__(self, kernel, cse_name_list=[], var_subst_map={}):
         def constant_mapper(c):
             if isinstance(c, float):
                 # FIXME: type-variable
@@ -267,14 +266,12 @@ class LoopyCCodeMapper(CCodeMapper):
 
         self.var_subst_map = var_subst_map.copy()
 
-        self.no_prefetch = no_prefetch
-
     def copy(self, var_subst_map=None, cse_name_list=None):
         if var_subst_map is None:
             var_subst_map = self.var_subst_map
         if cse_name_list is None:
             cse_name_list = self.cse_name_list
-        return LoopyCCodeMapper(self.kernel, no_prefetch=self.no_prefetch,
+        return LoopyCCodeMapper(self.kernel,
                 cse_name_list=cse_name_list, var_subst_map=var_subst_map)
 
     def copy_and_assign(self, name, value):
@@ -296,22 +293,10 @@ class LoopyCCodeMapper(CCodeMapper):
 
     def map_subscript(self, expr, enclosing_prec):
         from pymbolic.primitives import Variable
-        if (not self.no_prefetch
-                and isinstance(expr.aggregate, Variable)
-                and expr.aggregate.name in self.kernel.input_vectors()):
-            try:
-                pf = self.kernel.prefetch[expr.aggregate.name, expr.index]
-            except KeyError:
-                pass
-            else:
-                from pymbolic import var
-                return pf.name+"".join(
-                        "[%s]" % self.rec(
-                            var(iname) - pf.dim_bounds_by_iname[iname][0],
-                            PREC_NONE)
-                        for iname in pf.all_inames())
-
-        if isinstance(expr.aggregate, Variable):
+        if not isinstance(expr.aggregate, Variable):
+            return CCodeMapper.map_subscript(self, expr, enclosing_prec)
+
+        if expr.aggregate.name in self.kernel.arg_dict:
             arg = self.kernel.arg_dict[expr.aggregate.name]
 
             from loopy.kernel import ImageArg
@@ -349,7 +334,12 @@ class LoopyCCodeMapper(CCodeMapper):
                             stride*expr_i for stride, expr_i in zip(
                                 ary_strides, index_expr))),
                         enclosing_prec)
-        return CCodeMapper.map_subscript(self, expr, enclosing_prec)
+
+        if expr.aggregate.name in self.kernel.temporary_variables:
+            temp_var = self.kernel.temporary_variables[expr.aggregate.name]
+
+            return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE)
+                for idx in expr.index))
 
     def map_floor_div(self, expr, prec):
         if isinstance(expr.denominator, int) and expr.denominator > 0:
@@ -380,24 +370,42 @@ class LoopyCCodeMapper(CCodeMapper):
 
 # {{{ aff -> expr conversion
 
-def aff_to_expr(aff):
+def aff_to_expr(aff, except_name=None, error_on_name=None):
+    if except_name is not None and error_on_name is not None:
+        raise ValueError("except_name and error_on_name may not be specified "
+                "at the same time")
+
     from pymbolic import var
 
+    except_coeff = 0
+
     result = int(aff.get_constant())
     for dt in [dim_type.in_, dim_type.param]:
-        for i in xrange(aff.dim(dim_type.in_)):
+        for i in xrange(aff.dim(dt)):
             coeff = int(aff.get_coefficient(dt, i))
             if coeff:
-                result += coeff*var(aff.get_dim_name(dt, i))
+                dim_name = aff.get_dim_name(dt, i)
+                if dim_name == except_name:
+                    except_coeff += coeff
+                elif dim_name == error_on_name:
+                    raise RuntimeError("'%s' occurred in this subexpression--"
+                            "this is not allowed" % dim_name)
+                else:
+                    result += coeff*var(dim_name)
+
+    error_on_name = error_on_name or except_name
 
     for i in xrange(aff.dim(dim_type.div)):
         coeff = int(aff.get_coefficient(dim_type.div, i))
         if coeff:
-            result += coeff*aff_to_expr(aff.get_div(i))
+            result += coeff*aff_to_expr(aff.get_div(i), error_on_name=error_on_name)
 
-    denom = aff.get_denominator()
-    if denom == 1:
-        return result
+    denom = int(aff.get_denominator())
+    if except_name is not None:
+        if except_coeff % denom != 0:
+            raise RuntimeError("coefficient of '%s' is not divisible by "
+                    "aff denominator" % except_name)
+
+        return result // denom, except_coeff // denom
     else:
         return result // denom
@@ -438,7 +446,7 @@ def ineq_constraint_from_expr(space, expr):
     return isl.Constraint.inequality_from_aff(aff_from_expr(space,expr))
 
 def constraint_to_expr(cns, except_name=None):
-    return aff_to_expr(cns.get_aff())
+    return aff_to_expr(cns.get_aff(), except_name=except_name)
 
 # }}}