diff --git a/MEMO b/MEMO index 213114e595096637530bf78800862f287c85306d..f88800f525cb85dfd71d9876763ecf83f799d9d9 100644 --- a/MEMO +++ b/MEMO @@ -1,10 +1,6 @@ TODO list ^^^^^^^^^ -Immediately: ------------- -TODO: Imitate codegen bulk slab handling in bulk slab trials - For writeup: ------------ TODO: Reimplement forced lengths @@ -61,7 +57,14 @@ Things to consider - Syntax to declare insn deps -- Make CSE tags replaceable by array names if the access is unique +- reimplement add_prefetch + +- user interface for dim length prescription + +- make syntax for explicit loop dependencies + +- multiple insns could fight over which iname gets local axis 0 + -> complicated optimization problem Dealt with ^^^^^^^^^^ diff --git a/loopy/__init__.py b/loopy/__init__.py index bb6bb6a48c9c2828d7e6faa7fa58b99450d5ac91..606d579c666c5b0503dcf467395deb5914190e2f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -177,9 +177,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non parallel_inames = duplicate_inames dup_iname_to_tag = dup_iname_to_tag.copy() - from loopy.kernel import TAG_AUTO_WORK_ITEM_IDX + from loopy.kernel import TAG_AUTO_LOCAL_IDX for piname in parallel_inames: - dup_iname_to_tag[piname] = TAG_AUTO_WORK_ITEM_IDX() + dup_iname_to_tag[piname] = TAG_AUTO_LOCAL_IDX() for diname in duplicate_inames: dup_iname_to_tag.setdefault(diname, None) @@ -211,10 +211,10 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non target_var_name = kernel.make_unique_var_name(cse_tag) - from loopy.kernel import (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX, + from loopy.kernel import (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX, TAG_GROUP_IDX) target_var_is_local = any( - isinstance(tag, (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX)) + isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)) for tag in dup_iname_to_tag.itervalues()) cse_lookup_table = {} @@ -254,7 +254,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non else: tag = kernel.iname_to_tag[iname] - if isinstance(tag, (TAG_WORK_ITEM_IDX, TAG_AUTO_WORK_ITEM_IDX)): + if isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)): kind = "l" elif isinstance(tag, TAG_GROUP_IDX): kind = "g" @@ -507,7 +507,7 @@ def get_problems(kernel, parameters, emit_warnings=True): msgs.append((severity, s)) glens = kernel.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True) - llens = kernel.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False) + llens = kernel.tag_type_lengths(TAG_LOCAL_IDX, allow_parameters=False) from pymbolic import evaluate glens = evaluate(glens, parameters) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 6b9cb966fbd2aafbe94617e16edf6626e773def3..1c0e358e4df93caa8b0046b4078c190fce69f15a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -240,10 +240,10 @@ def generate_code(kernel): # {{{ symbolic names for group and local indices - from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX + from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX for what_cls, func in [ (TAG_GROUP_IDX, "get_group_id"), - (TAG_WORK_ITEM_IDX, "get_local_id")]: + (TAG_LOCAL_IDX, "get_local_id")]: for iname in kernel.ordered_inames_by_tag_type(what_cls): lower, upper, equality = kernel.get_bounds(iname, (iname,), allow_parameters=True) assert not equality @@ -275,13 +275,13 @@ def generate_code(kernel): body.extend([Line(), gen_code.ast]) #print "# conditionals: %d" % gen_code.num_conditionals - from loopy.kernel import TAG_WORK_ITEM_IDX + from loopy.kernel import TAG_LOCAL_IDX mod.append( FunctionBody( CLRequiredWorkGroupSize( tuple(dim_length for dim_length in kernel.tag_type_lengths( - TAG_WORK_ITEM_IDX, + TAG_LOCAL_IDX, allow_parameters=False)), CLKernel(FunctionDeclaration( Value("void", kernel.name), args))), diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index e2cc6f0f3e966cf8630fc6c53284081527d0167f..5d89b10485551d1aa853d6140433a8abab56be1c 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -301,8 +301,8 @@ def get_valid_check_vars(kernel, sched_index, allow_ilp, exclude_tag_classes=()) # prefetches are scheduled, we may not check work item indices # (and thereby conceivably mask out some work items). - from loopy.kernel import TAG_WORK_ITEM_IDX - allowed_vars -= set(kernel.inames_by_tag_type(TAG_WORK_ITEM_IDX)) + from loopy.kernel import TAG_LOCAL_IDX + allowed_vars -= set(kernel.inames_by_tag_type(TAG_LOCAL_IDX)) return allowed_vars diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index 1b7206a48cf593824fb3b775847965277cb96c8c..c35a11fcb77ebc473ddd9ad165b4f12e0c389b67 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -207,8 +207,8 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, pf_idx_expr = 0 for realiz_iname, length in zip(realiz_inames, realiz_lengths): tag = flnd.kernel.iname_to_tag[realiz_iname] - from loopy.kernel import TAG_WORK_ITEM_IDX - assert isinstance(tag, TAG_WORK_ITEM_IDX) + from loopy.kernel import TAG_LOCAL_IDX + assert isinstance(tag, TAG_LOCAL_IDX) pf_idx_expr = (pf_idx_expr*length + var("(int) get_local_id(%d)" % tag.axis)) @@ -301,8 +301,8 @@ def generate_prefetch_code(kernel, sched_index, exec_domain): # {{{ first, fix the user-specified fetch dims - from loopy.kernel import TAG_WORK_ITEM_IDX - knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX) + from loopy.kernel import TAG_LOCAL_IDX + knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_LOCAL_IDX) used_kernel_work_item_inames = [] for realization_dim_idx, loc_fetch_axis_list in \ @@ -390,7 +390,7 @@ def generate_prefetch_code(kernel, sched_index, exec_domain): from loopy.codegen.bounds import get_valid_check_vars valid_index_vars = get_valid_check_vars(kernel, sched_index, allow_ilp=True, - exclude_tag_classes=(TAG_WORK_ITEM_IDX,)) + exclude_tag_classes=(TAG_LOCAL_IDX,)) from loopy.symbolic import LoopyCCodeMapper flnd = FetchLoopNestData(prefetch=pf, diff --git a/loopy/compiled.py b/loopy/compiled.py index a2b4210d6105f4c458f19766f8677b887619edb8..864683bdd13605d8d619a76e14ea13f77d3c7874 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -48,11 +48,11 @@ class CompiledKernel: else: self.size_args = size_args - from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX + from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX gsize_expr = tuple(self.kernel.tag_type_lengths( TAG_GROUP_IDX, allow_parameters=True)) lsize_expr = tuple(self.kernel.tag_type_lengths( - TAG_WORK_ITEM_IDX, allow_parameters=False)) + TAG_LOCAL_IDX, allow_parameters=False)) if not gsize_expr: gsize_expr = (1,) if not lsize_expr: lsize_expr = (1,) diff --git a/loopy/isl.py b/loopy/isl.py index bf7b5a064733fcebfeaf9a1e879d41246e6865b0..6af301a9f445fa685e602cd346e00b7e4e6a2b97 100644 --- a/loopy/isl.py +++ b/loopy/isl.py @@ -110,12 +110,21 @@ def set_is_universe(set): -def static_max_of_pw_aff(pw_aff): - for set, aff in pw_aff.get_pieces(): - candidate_pw_aff = isl.PwAff.from_aff(aff) +def static_min_of_pw_aff(pw_aff): + for set, candidate_aff in pw_aff.get_pieces(): + if set_is_universe(candidate_aff.le_set(pw_aff)): + return candidate_aff + + raise ValueError("a static minimum was not found for PwAff '%s'" + % pw_aff) + - if set_is_universe(candidate_pw_aff.ge_set(pw_aff)): - return aff + + +def static_max_of_pw_aff(pw_aff): + for set, candidate_aff in pw_aff.get_pieces(): + if set_is_universe(candidate_aff.ge_set(pw_aff)): + return candidate_aff raise ValueError("a static maximum was not found for PwAff '%s'" % pw_aff) diff --git a/loopy/kernel.py b/loopy/kernel.py index 4767f58d4e87cca70204a0798797eb18657854b3..c2527cf07b71b86bbd12b4412075934ff957dc4b 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -50,13 +50,19 @@ class ParallelTagWithAxis(ParallelTag, UniqueTag): return "%s.%d" % ( self.print_name, self.axis) +#class MultiTag(IndexTag): + +#class SplitTag(IndexTag): + + + class TAG_GROUP_IDX(ParallelTagWithAxis): print_name = "g" -class TAG_WORK_ITEM_IDX(ParallelTagWithAxis): +class TAG_LOCAL_IDX(ParallelTagWithAxis): print_name = "l" -class TAG_AUTO_WORK_ITEM_IDX(ParallelTag): +class TAG_AUTO_LOCAL_IDX(ParallelTag): def __str__(self): return "l.auto" @@ -94,7 +100,7 @@ def parse_tag(tag): elif tag.startswith("g."): return TAG_GROUP_IDX(int(tag[2:])) elif tag.startswith("l."): - return TAG_WORK_ITEM_IDX(int(tag[2:])) + return TAG_LOCAL_IDX(int(tag[2:])) else: raise ValueError("cannot parse tag: %s" % tag) @@ -386,7 +392,6 @@ class LoopKernel(Record): upper_incr) tuples that will be separated out in the execution to generate 'bulk' slabs with fewer conditionals. :ivar temporary_variables: - :ivar workgroup_size: :ivar name_to_dim: A lookup table from inames to ISL-style (dim_type, index) tuples :ivar iname_to_tag: @@ -518,6 +523,11 @@ class LoopKernel(Record): if var_name not in used_vars: return var_name + @property + @memoize_method + def arg_dict(self): + return dict((arg.name, arg) for arg in self.args) + @property @memoize_method def dim_to_name(self): @@ -534,14 +544,6 @@ class LoopKernel(Record): def space(self): return self.domain.get_space() - @property - @memoize_method - def tag_key_to_iname(self): - return dict( - (tag.key, iname) - for iname, tag in self.iname_to_tag.iteritems() - if isinstance(tag, UniqueTag)) - @property @memoize_method def arg_dict(self): @@ -567,57 +569,93 @@ class LoopKernel(Record): return [iname for iname in self.all_inames() if isinstance(self.iname_to_tag.get(iname), tag_type)] - def ordered_inames_by_tag_type(self, tag_type): - result = [] - from itertools import count - for i in count(): - try: - dim = self.tag_key_to_iname[tag_type(i).key] - except KeyError: - return result + @memoize_method + def get_iname_bounds(self, iname): + lower_bound_pw_aff = self.domain.dim_min(self.name_to_dim[iname][1]) + upper_bound_pw_aff = self.domain.dim_max(self.name_to_dim[iname][1]) + + class BoundsRecord(Record): + pass + + size = upper_bound_pw_aff - lower_bound_pw_aff + 1 + + return BoundsRecord( + lower_bound_pw_aff=lower_bound_pw_aff, + upper_bound_pw_aff=upper_bound_pw_aff, + size=size) + + def fix_grid_sizes(kernel): + all_inames_by_insns = set() + for insn in kernel.instructions: + all_inames_by_insns |= insn.all_inames() + + if all_inames_by_insns != kernel.all_inames(): + raise RuntimeError("inames collected from instructions " + "do not match domain inames") + + global_sizes = {} + local_sizes = {} + + from loopy.kernel import ( + TAG_GROUP_IDX, TAG_LOCAL_IDX, + TAG_AUTO_LOCAL_IDX) + + for iname in kernel.all_inames(): + tag = kernel.iname_to_tag.get(iname) + + if isinstance(tag, TAG_GROUP_IDX): + tgt_dict = global_sizes + elif isinstance(tag, TAG_LOCAL_IDX): + tgt_dict = local_sizes + elif isinstance(tag, TAG_AUTO_LOCAL_IDX): + #raise RuntimeError("cannot find grid sizes if AUTO_LOCAL_IDX tags are " + #"present") + pass + tgt_dict = None else: - result.append(dim) + tgt_dict = None - @memoize_method - def get_bounds_constraints(self, iname, admissible_vars, allow_parameters): - """Get an overapproximation of the loop bounds for the variable *iname*.""" + if tgt_dict is None: + continue - from loopy.codegen.bounds import get_bounds_constraints - return get_bounds_constraints(self.domain, iname, admissible_vars, - allow_parameters) + bounds = kernel.get_iname_bounds(iname) - @memoize_method - def get_bounds(self, iname, admissible_vars, allow_parameters): - """Get an overapproximation of the loop bounds for the variable *iname*.""" + size = bounds.size - from loopy.codegen.bounds import get_bounds - return get_bounds(self.domain, iname, admissible_vars, allow_parameters) + from loopy.isl import static_max_of_pw_aff + try: + size = static_max_of_pw_aff(size) + except ValueError: + pass - def tag_type_lengths(self, tag_cls, allow_parameters): - def get_length(iname): - tag = self.iname_to_tag[iname] - if tag.forced_length is not None: - return tag.forced_length + if tag.axis in tgt_dict: + tgt_dict[tag.axis] = tgt_dict[tag.axis].max(size) + else: + tgt_dict[tag.axis] = size - lower, upper, equality = self.get_bounds(iname, (iname,), - allow_parameters=allow_parameters) - return upper-lower + max_dims = kernel.device.max_work_item_dimensions - return [get_length(iname) - for iname in self.ordered_inames_by_tag_type(tag_cls)] + def to_dim_tuple(size_dict, which): + size_list = [] + sorted_axes = sorted(size_dict.iterkeys()) + while sorted_axes: + cur_axis = sorted_axes.pop(0) + while cur_axis > len(size_list): + from loopy import LoopyAdvisory + warn("%s axis %d unassigned--assuming length 1" % len(size_list), + LoopyAdvisory) + size_list.append(1) - def tag_or_iname_to_iname(self, s): - try: - tag = parse_tag(s) - except ValueError: - pass - else: - return self.tag_key_to_iname[tag.key] + size_list.append(size_dict[cur_axis]) + + if len(size_list) > max_dims: + raise ValueError("more %s dimensions assigned than supported " + "by hardware (%d > %d)" % (which, len(size_list), max_dims)) - if s not in self.all_inames(): - raise RuntimeError("invalid index name '%s'" % s) + return tuple(size_list) - return s + return (to_dim_tuple(global_sizes, "global"), + to_dim_tuple(local_sizes, "local")) def local_mem_use(self): return sum(lv.nbytes for lv in self.temporary_variables.itervalues() diff --git a/loopy/schedule.py b/loopy/schedule.py index 1756b8e66223ddd1bef353f3863202c9f1b9fc52..8ec6393dc61599925c6cf3709b482d2f19c07919 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -24,14 +24,6 @@ class Barrier(Record): -def fix_grid_sizes(kernel): - from warnings import warn - warn("fix_grid_sizes is unimplemented") - return kernel - - - - def check_double_use_of_hw_dimensions(kernel): from loopy.kernel import UniqueTag @@ -124,6 +116,176 @@ def add_automatic_dependencies(kernel): +def guess_good_iname_for_axis_0(kernel, insn): + from loopy.kernel import ImageArg, ScalarArg + + approximate_arg_values = dict( + (arg.name, arg.approximately) + for arg in kernel.args + if isinstance(arg, ScalarArg)) + + # {{{ find all array accesses in insn + + from loopy.symbolic import ArrayAccessFinder + ary_acc_exprs = list(ArrayAccessFinder()(insn.expression)) + + from pymbolic.primitives import Subscript + + if isinstance(insn.assignee, Subscript): + ary_acc_exprs.append(insn.assignee) + print ary_acc_exprs + + # }}} + + # {{{ filter array accesses to only the global ones + + global_ary_acc_exprs = [] + + for aae in ary_acc_exprs: + ary_name = aae.aggregate.name + arg = kernel.arg_dict.get(ary_name) + if arg is None: + continue + + if isinstance(arg, ImageArg): + continue + + global_ary_acc_exprs.append(aae) + + # }}} + + # {{{ figure out which iname should get mapped to local axis 0 + + # maps inames to vote counts + vote_count_for_l0 = {} + + from loopy.symbolic import CoefficientCollector + + from pytools import argmin2, argmax2 + + for aae in global_ary_acc_exprs: + index_expr = aae.index + if not isinstance(index_expr, tuple): + index_expr = (index_expr,) + + ary_name = aae.aggregate.name + arg = kernel.arg_dict.get(ary_name) + + ary_strides = arg.strides + if ary_strides is None and len(index_expr) == 1: + ary_strides = (1,) + + iname_to_stride = {} + for iexpr_i, stride in zip(index_expr, ary_strides): + coeffs = CoefficientCollector()(iexpr_i) + for var_name, coeff in coeffs.iteritems(): + if var_name != 1: + new_stride = coeff*stride + old_stride = iname_to_stride.get(var_name, None) + if old_stride is None or new_stride < old_stride: + iname_to_stride[var_name] = new_stride + + from pymbolic import evaluate + least_stride_iname, least_stride = argmin2(( + (iname, + evaluate(iname_to_stride[iname], approximate_arg_values)) + for iname in iname_to_stride), + return_value=True) + + if least_stride == 1: + vote_strength = 1 + else: + vote_strength = 0.5 + + vote_count_for_l0[least_stride_iname] = ( + vote_count_for_l0.get(least_stride_iname, 0) + + vote_strength) + + return argmax2(vote_count_for_l0.iteritems()) + + # }}} + + + + + +def find_inadmissible_tag_keys(kernel, iname, iname_to_tag=None): + if iname_to_tag is None: + iname_to_tag = kernel.iname_to_tag + + result = set() + + from loopy.kernel import UniqueTag + + for insn in kernel.instructions: + if iname in insn.all_inames(): + for insn_iname in insn.all_inames(): + if insn_iname == iname: + continue + + tag = iname_to_tag.get(insn_iname) + if isinstance(tag, UniqueTag): + result.add(tag.key) + + return result + + + + +def assign_automatic_axes(kernel): + from loopy.kernel import ( + TAG_AUTO_LOCAL_IDX, TAG_LOCAL_IDX) + + new_iname_to_tag = kernel.iname_to_tag + + # first assign each insn's axis 0, then the rest + for only_axis_0 in [True, False]: + + for insn in kernel.instructions: + auto_axis_inames = [ + iname + for iname in insn.all_inames() + if isinstance(new_iname_to_tag.get(iname), TAG_AUTO_LOCAL_IDX)] + + if not auto_axis_inames: + continue + + local_assigned_axes = set() + + for iname in insn.all_inames(): + tag = new_iname_to_tag.get(iname) + if isinstance(tag, TAG_LOCAL_IDX): + local_assigned_axes.add(tag.axis) + + if 0 not in local_assigned_axes: + axis0_iname = guess_good_iname_for_axis_0(kernel, insn) + + axis0_iname_tag = new_iname_to_tag.get(axis0_iname) + ax0_tag = TAG_LOCAL_IDX(0) + if (isinstance(axis0_iname_tag, TAG_AUTO_LOCAL_IDX) + and ax0_tag.key not in find_inadmissible_tag_keys( + kernel, axis0_iname, new_iname_to_tag)): + new_iname_to_tag[axis0_iname] = ax0_tag + local_assigned_axes.add(0) + auto_axis_inames.remove(axis0_iname) + + if only_axis_0: + continue + + next_axis = 0 + while auto_axis_inames: + iname = auto_axis_inames.pop() + while next_axis in local_assigned_axes: + next_axis += 1 + + new_iname_to_tag[iname] = TAG_LOCAL_IDX(next_axis) + local_assigned_axes.add(next_axis) + + return kernel.copy(iname_to_tag=new_iname_to_tag) + + + + def generate_loop_schedules_internal(kernel, schedule=[]): all_insn_ids = set(insn.id for insn in kernel.instructions) @@ -391,21 +553,9 @@ def generate_loop_schedules(kernel): # }}} - kernel = fix_grid_sizes(kernel) - - if 0: - loop_dep_graph = generate_loop_dep_graph(kernel) - for k, v in loop_dep_graph.iteritems(): - print "%s: %s" % (k, ",".join(v)) - 1/0 - kernel = add_automatic_dependencies(kernel) - print kernel - - #grid_size, group_size = find_known_grid_and_group_sizes(kernel) - - #kernel = assign_grid_and_group_indices(kernel) + kernel = assign_automatic_axes(kernel) for gen_sched in generate_loop_schedules_internal(kernel): gen_sched, owed_barriers = insert_barriers(kernel, gen_sched) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e3e599e8bb6856bf4d6ed64f619c1e3a84317a16..cde52d38a047499635d8bb9e6a37e1cf43985feb 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -224,7 +224,7 @@ class CoefficientCollector(RecursiveMapper): # {{{ variable index expression collector -class VariableIndexExpressionCollector(CombineMapper): +class ArrayAccessFinder(CombineMapper): def __init__(self, tgt_vector_name=None): self.tgt_vector_name = tgt_vector_name @@ -243,7 +243,7 @@ class VariableIndexExpressionCollector(CombineMapper): assert isinstance(expr.aggregate, Variable) if self.tgt_vector_name is None or expr.aggregate.name == self.tgt_vector_name: - return set([expr.index]) | self.rec(expr.index) + return set([expr]) | self.rec(expr.index) else: return CombineMapper.map_subscript(self, expr)