diff --git a/loopy/__init__.py b/loopy/__init__.py index 65e0a22cc707961c8eee6b6ead96313768abdc41..9b6361ecb89c83be84400245de4168f05771f1d0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -5,12 +5,20 @@ from pytools import Record, memoize_method from pymbolic.mapper.dependency import DependencyMapper from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE -from pymbolic.mapper import CombineMapper, RecursiveMapper +from pymbolic.mapper import IdentityMapper, CombineMapper, RecursiveMapper import pyopencl as cl import islpy as isl from islpy import dim_type +def register_mpz_with_pymbolic(): + from pymbolic.primitives import register_constant_class + import gmpy + mpz_type = type(gmpy.mpz(1)) + register_constant_class(mpz_type) + +register_mpz_with_pymbolic() + @@ -76,7 +84,7 @@ class IndexTag(object): return hash(type(self)) ^ hash(self.axis) -class GROUP_IDX_TAG(IndexTag): +class TAG_GROUP_IDX(IndexTag): def __repr__(self): if self.axis is None: return "GROUP_IDX" @@ -84,7 +92,7 @@ class GROUP_IDX_TAG(IndexTag): return "GROUP_IDX(%d)" % self.axis -class WORK_ITEM_IDX_TAG(IndexTag): +class TAG_WORK_ITEM_IDX(IndexTag): def __repr__(self): if self.axis is None: return "WORK_ITEM_IDX" @@ -102,9 +110,9 @@ def parse_tag(tag): raise ValueError("cannot parse tag: %s" % tag) if tag.startswith("g."): - return GROUP_IDX_TAG(int(tag[2:])) + return TAG_GROUP_IDX(int(tag[2:])) if tag.startswith("l."): - return WORK_ITEM_IDX_TAG(int(tag[2:])) + return TAG_WORK_ITEM_IDX(int(tag[2:])) else: raise ValueError("cannot parse tag: %s" % tag) @@ -172,9 +180,6 @@ class LoopDomain(Record): return i raise KeyError("invalid tag: %s" % tag) - def tag_to_dim(self, tag): - return self.dims[self.tag_to_idx(tag)] - def indices_by_tag_type(self, tag_type): return [i for i, dim in enumerate(self.dims) if isinstance(dim.tag, tag_type)] @@ -183,12 +188,12 @@ class LoopDomain(Record): return [dim for dim in self.dims if isinstance(dim.tag, tag_type)] - def ordered_dims_by_tag_type(self, tag_type): + def ordered_inames_by_tag_type(self, tag_type): result = [] from itertools import count for i in count(): try: - dim = self.tag_to_dim(tag_type(i)) + dim = self.tag_to_iname[tag_type(i)] except KeyError: return result else: @@ -264,7 +269,7 @@ class LoopKernel(LoopDomain): # possible attributes: # - device, a PyOpenCL target device # - domain - # - tags + # - iname_to_tag # - instructions # - args # - prefetch @@ -275,12 +280,12 @@ class LoopKernel(LoopDomain): def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, register_prefetch=None, name="loopy_kernel", - tags={}, + iname_to_tag={}, is_divisible=lambda dividend, divisor: False, preamble=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" - :arg tags: a map from loop domain variables to subclasses of :class:`IndexTag` + :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` """ from pymbolic import parse @@ -303,7 +308,9 @@ class LoopKernel(LoopDomain): device=device, args=args, domain=domain, instructions=insns, prefetch=prefetch, schedule=schedule, register_prefetch=register_prefetch, name=name, - preamble=preamble, tags=tags) + iname_to_tag=iname_to_tag, + is_divisible=is_divisible, + preamble=preamble) # FIXME: assert empyt intersection of loop vars and args # FIXME: assert that isl knows about loop parameters @@ -313,6 +320,12 @@ class LoopKernel(LoopDomain): def space(self): return self.domain.get_dim() + @property + @memoize_method + def tag_to_iname(self): + from pytools import reverse_dictionary + return reverse_dictionary(self.iname_to_tag) + @property @memoize_method def arg_dict(self): @@ -327,7 +340,7 @@ class LoopKernel(LoopDomain): @memoize_method def all_indices(self): - return set(dim.name for dim in self.dims) - self.scalar_args() + return set(self.space.get_var_dict(dim_type.set).iterkeys()) @memoize_method def output_indices(self): @@ -349,41 +362,88 @@ class LoopKernel(LoopDomain): def reduction_dimensions(self): return [dim for dim in self.dims if dim.name not in self.output_indices()] - def group_dims(self): - dims = self.ordered_dims_by_tag_type(GROUP_IDX_TAG) - return tuple(dim.length for dim in dims) + def get_bounds(self, iname): + """Get an overapproximation of the loop bounds for the variable *iname*.""" - def local_dims(self): - dims = self.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG) - return tuple(dim.length for dim in dims) + iname_dim_type, iname_idx = self.space.get_var_dict()[iname] + assert iname_dim_type == dim_type.set - def group_size(self): - from pytools import product - return product(self.local_dims()) + # project out every variable except iname + projected_domain = (self.domain + # vars after iname + .project_out( + iname_dim_type, iname_idx+1, self.space.size(iname_dim_type)-iname_idx-1) + .project_out( + iname_dim_type, 0, iname_idx)) - def group_count(self): - from pytools import product - return product(self.group_dims()) + basic_sets = [] + projected_domain.foreach_basic_set(basic_sets.append) - def parse_sloppy_dim_to_dim_idx(self, dim): - if isinstance(dim, str): - try: - tag = parse_tag(dim) - except ValueError: - pass - else: - return self.tag_to_idx(tag) + # FIXME perhaps use some form of hull here if there's more than one + # basic set? + bset, = basic_sets - return self.name_to_idx(dim) + # Python-style, half-open bounds + upper_bounds = [] + lower_bounds = [] + bset = bset.remove_divs() + print bset - if isinstance(dim, LoopDimension): - return self.dims.index(dim) + bset_iname_dim_type, bset_iname_idx = bset.get_dim().get_var_dict()[iname] - if isinstance(dim, int): - return dim + def examine_constraint(cns): + coeffs = cns.get_coefficients_by_name() + print coeffs - def parse_sloppy_dim(self, dim): - return self.dims[self.parse_sloppy_dim_to_dim_idx(dim)] + iname_coeff = int(coeffs.get(iname, 0)) + if iname_coeff == 0: + return + + rhs = cns.get_constant() + from pymbolic import var + for var_name, coeff in coeffs.iteritems(): + if var_name == iname: + continue + rhs += int(coeff)*var(var_name) + + if iname_coeff < 0: + from pytools import div_ceil + upper_bounds.append(div_ceil(rhs, -iname_coeff)) + else: # iname_coeff > 0 + lower_bounds.append(rhs//iname_coeff) + + bset.foreach_constraint(examine_constraint) + + lb, = lower_bounds + ub, = upper_bounds + print iname, lb, ub + print "WRONG: Inclusive bounds!" + + return lb, ub + + def tag_type_bounds(self, tag_cls): + return [self.get_bounds(iname) + for iname in self.ordered_inames_by_tag_type(tag_cls)] + + def tag_type_lengths(self, tag_cls): + return [stop-start for start, stop in self.tag_type_bounds(tag_cls)] + + def tag_type_count(self, tag_cls): + from pytools import product + return product(self.tag_type_lengths(tag_cls)) + + def tag_or_iname_to_iname(self, s): + try: + tag = parse_tag(s) + except ValueError: + pass + else: + return self.tag_to_iname[tag] + + if s not in self.all_indices(): + raise RuntimeError("invalid index name '%s'" % s) + + return s def local_mem_use(self): return sum(pf.size() for pf in self.prefetch.itervalues()) @@ -466,87 +526,60 @@ class LoopKernel(LoopDomain): new_tags = set(tag for tag in [outer_tag, inner_tag] if tag is not None) - for d in self.dims: - if d.tag in new_tags: - raise RuntimeError("repeated tag: %s" % d.tag) + if self.iname_to_tag.get(name) is not None: + raise RuntimeError("cannot split tagged dimension '%s'" % name) + + repeated_tags = new_tags & set(self.iname_to_tag.values()) + if repeated_tags: + raise RuntimeError("repeated tag(s): %s" % repeated_tags) if outer_name is None: outer_name = name+"_outer" if inner_name is None: inner_name = name+"_inner" - nr_new_var = self.space.size(dim_type.set) - new_set = self.domain.add_dims - - # {{{ make dim with same parameters and an equality constraint - - new_space = isl.Dim.create_from_names( - params=self.space.get_var_dict(isl.dim_type.param).keys(), - set=[name, inner_name, outer_name]) - - #new_set = isl. - - - - - - dim = self.dims[idx] - if dim.end_cond is not None or dim.last_cond is not None: - raise NotImplementedError("don't yet know how to split " - "last_cond or end_cond loops") - - # }}} - - if dim.tag: - raise ValueError("cannot split already-tagged dimension") - - if new_tags and dim.name not in self.output_indices(): - raise NotImplementedError("cannot yet tag a non-output dimension") - - if is_even_split != False and dim.length % inner_length == 0: - is_even_split = True + new_iname_to_tag = self.iname_to_tag.copy() + if inner_tag is not None: + new_iname_to_tag[inner_name] = inner_tag + if outer_tag is not None: + new_iname_to_tag[outer_name] = outer_tag + + outer_var_nr = self.space.size(dim_type.set) + inner_var_nr = self.space.size(dim_type.set)+1 + new_domain = self.domain.add_dims(dim_type.set, 2) + new_domain.set_dim_name(dim_type.set, outer_var_nr, outer_name) + new_domain.set_dim_name(dim_type.set, inner_var_nr, inner_name) + + space = new_domain.get_dim() + inner_constraint_set = (isl.Set.universe(space) + # name = inner + length*outer + .add_constraint(isl.Constraint.eq_from_names( + space, 0, {name:1, inner_name: -1, outer_name:-inner_length})) + # 0 <= inner + .add_constraint(isl.Constraint.ineq_from_names( + space, 0, {inner_name: 1})) + # inner < length + .add_constraint(isl.Constraint.ineq_from_names( + space, inner_length-1, {inner_name: -1}))) + + name_dim_type, name_idx = space.get_var_dict()[name] + new_domain = (new_domain + .intersect(inner_constraint_set) + .project_out(name_dim_type, name_idx, 1)) from pymbolic import var - outer = var(outer_name) inner = var(inner_name) - + outer = var(outer_name) new_loop_index = inner + outer*inner_length - if is_even_split: - new_dims = [ - LoopDimension( - name=outer_name, - length=dim.length//inner_length, - tag=outer_tag), - LoopDimension( - name=inner_name, - length=inner_length, - tag=inner_tag), - ] - else: - from pytools import div_ceil - new_dims = [ - LoopDimension( - name=outer_name, - length=div_ceil(dim.length, inner_length), - last_cond=((outer+1)*inner_length, ">=", dim.length), - tag=outer_tag), - LoopDimension( - name=inner_name, - length=inner_length, - end_cond=(new_loop_index, ">=", dim.length), - tag=inner_tag, - end_cond_if_last_of=dim.end_cond_if_last_of | set([outer_name])), - ] - return (self - .substitute(dim.name, new_loop_index) - .copy(dims=self.dims[:idx] + new_dims + self.dims[(idx+1):]), - new_loop_index) + .substitute(name, new_loop_index) + .copy(domain=new_domain, iname_to_tag=new_iname_to_tag)) def get_invalid_reason(self): - gdims = self.group_dims() - ldims = self.local_dims() + gdims = self.tag_type_count(TAG_GROUP_IDX) + ldims = self.tag_type_count(TAG_WORK_ITEM_IDX) + 1/0 if (max(len(gdims), len(ldims)) > self.device.max_work_item_dimensions): return "too many work item dimensions" @@ -708,8 +741,8 @@ def generate_loop_schedules(kernel): prev_schedule = kernel.schedule if prev_schedule is None: prev_schedule = ( - kernel.dims_by_tag_type(GROUP_IDX_TAG) - + kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG)) + kernel.dims_by_tag_type(TAG_GROUP_IDX) + + kernel.dims_by_tag_type(TAG_WORK_ITEM_IDX)) already_scheduled = set(sch_item for sch_item in prev_schedule @@ -721,7 +754,7 @@ def generate_loop_schedules(kernel): had_usable_prefetch = False scheduled_work_item_dim_names = set( dim.name for dim in already_scheduled - if isinstance(dim.tag, WORK_ITEM_IDX_TAG)) + if isinstance(dim.tag, TAG_WORK_ITEM_IDX)) for pf in kernel.prefetch.itervalues(): # already scheduled? never mind then. @@ -950,7 +983,7 @@ def generate_prefetch_code(ccm, kernel, sched_index, last_of): # figure out dimension types from pytools import partition2 work_item_pf_dims, non_work_item_pf_dims = partition2( - (isinstance(dim.tag, WORK_ITEM_IDX_TAG), dim) + (isinstance(dim.tag, TAG_WORK_ITEM_IDX), dim) for dim in pf.dims) # Prefetch has a good amount of flexibility over what axes it @@ -966,7 +999,7 @@ def generate_prefetch_code(ccm, kernel, sched_index, last_of): # {{{ first, fix the user-specified fetch dims - knl_work_item_dims = kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG) + knl_work_item_dims = kernel.ordered_dims_by_tag_type(TAG_WORK_ITEM_IDX) for realization_dim_idx, loc_fetch_axis_list in \ getattr(pf, "loc_fetch_axes", {}).iteritems(): @@ -1052,7 +1085,7 @@ def generate_prefetch_code(ccm, kernel, sched_index, last_of): while start_index < pf_dim.length: pf_dim_expr = 0 for realiz_dim in realiz_dim_list: - assert isinstance(realiz_dim.tag, WORK_ITEM_IDX_TAG) + assert isinstance(realiz_dim.tag, TAG_WORK_ITEM_IDX) pf_dim_expr = (pf_dim_expr*realiz_dim.length + var("get_local_id(%d)" % realiz_dim.tag.axis)) @@ -1203,8 +1236,8 @@ def get_parallel_dim_bounds_checks(ccm, kernel, last_of, stmt): from cgen import If for dim in ( - kernel.dims_by_tag_type(GROUP_IDX_TAG) - + kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG)): + kernel.dims_by_tag_type(TAG_GROUP_IDX) + + kernel.dims_by_tag_type(TAG_WORK_ITEM_IDX)): if (dim.end_cond is not None and dim.end_cond_if_last_of <= last_of): stmt = If( @@ -1358,10 +1391,10 @@ def generate_code(kernel): mod.extend([Define(dim.name, "get_group_id(%d) /* 0..(%s) */" % (dim.tag.axis, ccm(dim.length-1, PREC_NONE))) - for dim in kernel.ordered_dims_by_tag_type(GROUP_IDX_TAG)] + for dim in kernel.ordered_dims_by_tag_type(TAG_GROUP_IDX)] + [Define(dim.name, "get_local_id(%d) /* 0..(%s) */" % (dim.tag.axis, ccm(dim.length-1, PREC_NONE))) - for dim in kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)] + for dim in kernel.ordered_dims_by_tag_type(TAG_WORK_ITEM_IDX)] + [Line()]) # }}} @@ -1386,7 +1419,7 @@ def generate_code(kernel): FunctionBody( CLRequiredWorkGroupSize( tuple(dim.length - for dim in kernel.ordered_dims_by_tag_type(WORK_ITEM_IDX_TAG)), + for dim in kernel.ordered_dims_by_tag_type(TAG_WORK_ITEM_IDX)), CLKernel(FunctionDeclaration( Value("void", kernel.name), args))), body)) @@ -1423,8 +1456,7 @@ def print_kernel_info(knl): # {{{ high-level modifiers def split_dimension(knl, *args, **kwargs): - knl, _ = knl.split_dimension(*args, **kwargs) - return knl + return knl.split_dimension(*args, **kwargs) def get_input_access_descriptors(kernel): """Return a dictionary mapping input vectors to @@ -1461,7 +1493,7 @@ def add_prefetch_dims(kernel, input_access_descr, dims, loc_fetch_axes={}): input_access_descr, = var_iads - dims = [kernel.parse_sloppy_dim(dim) for dim in dims] + dims = [kernel.tag_or_iname_to_iname(s) for s in dims] ivec, iexpr = input_access_descr new_prefetch = getattr(kernel, "prefetch", {}).copy()