diff --git a/loopy/__init__.py b/loopy/__init__.py index e7d8ce637967a674f73aa1ada6bae6b6f098cef4..4d296fe4b0a6898040b564a59aacf3101cc4375b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -23,20 +23,23 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph -from loopy.cse import realize_cse +from loopy.subst import extract_subst, apply_subst +from loopy.cse import precompute from loopy.preprocess import preprocess_kernel from loopy.schedule import generate_loop_schedules from loopy.codegen import generate_code from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq from loopy.check import check_kernels -__all__ = ["ScalarArg", "ArrayArg", "ImageArg", +__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", "get_dot_dependency_graph", "preprocess_kernel", "generate_loop_schedules", "generate_code", - "CompiledKernel", "drive_timing_run", "check_kernels", + "CompiledKernel", "drive_timing_run", "auto_test_vs_seq", "check_kernels", "make_kernel", "split_dimension", "join_dimensions", - "tag_dimensions", "realize_cse", "add_prefetch" + "tag_dimensions", + "extract_subst", "apply_subst", + "precompute", "add_prefetch" ] # }}} @@ -62,12 +65,7 @@ def make_kernel(*args, **kwargs): newly_created_vars = set() - from loopy.symbolic import ParametrizedSubstitutor - cse_sub = ParametrizedSubstitutor(knl.cses, wrap_cse=True) - subst_sub = ParametrizedSubstitutor(knl.substitutions, wrap_cse=False) - for insn in knl.instructions: - insn = insn.copy(expression=subst_sub(cse_sub(insn.expression))) # {{{ sanity checking @@ -190,8 +188,7 @@ def make_kernel(*args, **kwargs): domain=new_domain, temporary_variables=new_temp_vars, iname_to_tag=new_iname_to_tag, - iname_to_tag_requests=[], - cses={}) + iname_to_tag_requests=[]) # }}} @@ -276,6 +273,7 @@ def split_dimension(kernel, split_iname, inner_length, iname_slab_increments = kernel.iname_slab_increments.copy() iname_slab_increments[outer_iname] = slabs result = (kernel + .map_expressions(subst_mapper, exclude_instructions=True) .copy(domain=new_domain, iname_slab_increments=iname_slab_increments, instructions=new_insns, @@ -367,9 +365,9 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps)) for insn in kernel.instructions] - result = kernel.copy( - instructions=new_insns, - domain=new_domain) + result = (kernel + .map_expressions(subst_map, exclude_instructions=True) + .copy(instructions=new_insns, domain=new_domain)) return tag_dimensions(result, {new_iname: tag}) @@ -419,53 +417,47 @@ def tag_dimensions(kernel, iname_to_tag, force=False): # {{{ convenience: add_prefetch -def add_prefetch(kernel, var_name, fetch_dims=[], uni_template=None, - new_inames=None, default_tag="l.auto"): - used_cse_tags = set() - def map_cse(expr, rec): - used_cse_tags.add(expr.tag) - rec(expr.child) +def add_prefetch(kernel, var_name, sweep_dims, dim_args=None, + new_inames=None, default_tag="l.auto", rule_name=None): - def get_unique_cse_tag(): - from loopy.tools import generate_unique_possibilities - for cse_tag in generate_unique_possibilities(prefix="fetch_"+var_name): - if cse_tag not in used_cse_tags: - used_cse_tags.add(cse_tag) - return cse_tag + if rule_name is None: + rule_name = kernel.make_unique_subst_rule_name("%s_fetch" % var_name) - cse_tag = get_unique_cse_tag() + arg = kernel.arg_dict[var_name] - from loopy.symbolic import VariableFetchCSEMapper - vf_cse_mapper = VariableFetchCSEMapper(var_name, lambda: cse_tag) - kernel = kernel.copy(instructions=[ - insn.copy(expression=vf_cse_mapper(insn.expression)) - for insn in kernel.instructions]) + newly_created_vars = set() + parameters = [] + for i in range(len(arg.shape)): + based_on = "%s_i%d" % (var_name, i) + if dim_args is not None and i < len(dim_args): + based_on = dim_args[i] - if var_name in kernel.arg_dict: - dtype = kernel.arg_dict[var_name].dtype - else: - dtype = kernel.temporary_variables[var_name].dtype + par_name = kernel.make_unique_var_name(based_on=based_on, + extra_used_vars=newly_created_vars) + newly_created_vars.add(par_name) + parameters.append(par_name) - kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims, uni_template=uni_template, - new_inames=new_inames, default_tag=default_tag) + from pymbolic import var + uni_template = var(var_name) + if len(parameters) > 1: + uni_template = uni_template[tuple(var(par_name) for par_name in parameters)] + elif len(parameters) == 1: + uni_template = uni_template[var(parameters[0])] + + kernel = extract_subst(kernel, rule_name, uni_template, parameters) + + new_fetch_dims = [] + for fd in sweep_dims: + if isinstance(fd, int): + new_fetch_dims.append(parameters[fd]) + else: + new_fetch_dims.append(fd) - return kernel + return precompute(kernel, rule_name, arg.dtype, sweep_dims, new_arg_names=dim_args, + default_tag=default_tag) # }}} -def remove_cses(kernel): - from loopy.symbolic import CSECallbackMapper - - def map_cse(expr, rec): - return expr.child - - new_insns = [] - for insn in kernel.instructions: - new_insns.append( - insn.copy( - expression=CSECallbackMapper(map_cse)(insn.expression))) - - return kernel.copy(instructions=new_insns) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0beaf47dbaf0958ee1c3e663151a165925a33a79..fb94fbecaf819537d829249197bf1c9756914207 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -62,7 +62,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): if lower_incr: assert lower_incr > 0 - lower_slab = ("initial", isl.Set.universe(kernel.space) + lower_slab = ("initial", isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( @@ -78,7 +78,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): if upper_incr: assert upper_incr > 0 - upper_slab = ("final", isl.Set.universe(kernel.space) + upper_slab = ("final", isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( @@ -98,7 +98,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): slabs.append(lower_slab) slabs.append(( ("bulk", - (isl.Set.universe(kernel.space) + (isl.BasicSet.universe(kernel.space) .add_constraint(lower_bulk_bound) .add_constraint(upper_bulk_bound))))) if upper_slab: @@ -108,7 +108,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): else: return [("bulk", - (isl.Set.universe(kernel.space) + (isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig)))] diff --git a/loopy/cse.py b/loopy/cse.py index ee22a8b8fcbe79f3943783dbefffba51db04515f..748ef458b443be1e154a9d8751805337376b2cf1 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -12,50 +12,8 @@ from pymbolic import var -def check_cse_iname_deps(iname, duplicate_inames, tag, dependencies, cse_tag, uni_template): - from loopy.kernel import (LocalIndexTagBase, GroupIndexTag, IlpTag) - - if isinstance(tag, LocalIndexTagBase): - kind = "l" - elif isinstance(tag, GroupIndexTag): - kind = "g" - elif isinstance(tag, IlpTag): - kind = "i" - else: - kind = "o" - - if iname not in duplicate_inames and iname in dependencies: - if kind == "i": - raise RuntimeError( - "When realizing CSE with tag '%s', encountered iname " - "'%s' which is depended upon by the CSE and tagged " - "'%s', but not duplicated. The CSE would " - "inherit this iname, which would lead to a write race. " - "A likely solution of this problem is to also duplicate this " - "iname." - % (cse_tag, iname, tag)) - - if iname in duplicate_inames and kind == "g": - raise RuntimeError("duplicating the iname '%s' into " - "group index axes is not helpful, as they cannot " - "collaborate in computing a local/private variable" - %iname) - - if iname in dependencies: - return - - # the iname is *not* a dependency of the fetch expression - if iname in duplicate_inames: - raise RuntimeError("duplicating an iname ('%s') " - "that the CSE ('%s') does not depend on " - "does not make sense" % (iname, uni_template)) - - - - -class CSEDescriptor(Record): - __slots__ = ["insn", "cse", "independent_inames", - "unif_var_dict"] +class InvocationDescriptor(Record): + __slots__ = ["expr", "args", ] @@ -78,366 +36,233 @@ def to_parameters_or_project_out(param_inames, set_inames, set): -def process_cses(kernel, uni_template, - independent_inames, matching_vars, cse_descriptors): - from loopy.symbolic import UnidirectionalUnifier - - ind_inames_set = set(independent_inames) - - uni_iname_list = independent_inames + matching_vars - footprint = None - - uni_recs = [] - matching_var_values = {} - - for csed in cse_descriptors: - # {{{ find unifier - - unif = UnidirectionalUnifier( - lhs_mapping_candidates=ind_inames_set | set(matching_vars)) - unifiers = unif(uni_template, csed.cse.child) - if not unifiers: - raise RuntimeError("Unable to unify " - "CSEs '%s' and '%s' (with lhs candidates '%s')" % ( - uni_template, csed.cse.child, - ",".join(unif.lhs_mapping_candidates), - )) - - # }}} - - found_good_unifier = False - - for unifier in unifiers: - # {{{ construct, check mapping - - map_space = kernel.space - ln = len(uni_iname_list) - rn = kernel.space.dim(dim_type.out) - - map_space = map_space.add_dims(dim_type.in_, ln) - for i, iname in enumerate(uni_iname_list): - map_space = map_space.set_dim_name(dim_type.in_, i, iname) - - set_space = map_space.move_dims( - dim_type.out, rn, - dim_type.in_, 0, ln).range() - - var_map = None +def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names, + sweep_inames, invocation_descriptors): + global_footprint_map = None - rhs_deps = set() + processed_sweep_inames = set() - from loopy.symbolic import aff_from_expr - for lhs, rhs in unifier.equations: - cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, lhs - rhs)) + for invdesc in invocation_descriptors: - rhs_deps.update(get_dependencies(rhs)) - - cns_map = isl.BasicMap.from_constraint(cns) - if var_map is None: - var_map = cns_map - else: - var_map = var_map.intersect(cns_map) + for iname in sweep_inames: + if iname in arg_names: + arg_idx = arg_names.index(iname) + processed_sweep_inames.add( + get_dependencies(invdesc.args[arg_idx])) + else: + processed_sweep_inames.add(iname) - var_map = var_map.move_dims( - dim_type.in_, 0, - dim_type.out, rn, ln) + # {{{ construct, check mapping - restr_rhs_map = ( - isl.Map.from_basic_map(var_map) - .intersect_range(kernel.domain)) + map_space = kernel.space + ln = len(unique_new_arg_names) + rn = kernel.space.dim(dim_type.out) - # Sanity check: If the range of the map does not recover the - # domain of the expression, the unifier must have been no - # good. - if restr_rhs_map.range() != kernel.domain: - continue + map_space = map_space.add_dims(dim_type.in_, ln) + for i, iname in enumerate(unique_new_arg_names): + map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'") - restr_rhs_map = restr_rhs_map.project_out_except( - rhs_deps, [dim_type.out]) + set_space = map_space.move_dims( + dim_type.out, rn, + dim_type.in_, 0, ln).range() - # Sanity check: Injectivity here means that unique lead indices - # can be found for each + footprint_map = None - if not restr_rhs_map.is_injective(): - raise RuntimeError("In CSEs '%s' and '%s': " - "cannot find lead indices uniquely" - % (uni_template, csed.cse.child)) + from loopy.symbolic import aff_from_expr + for uarg_name, arg_val in zip(unique_new_arg_names, invdesc.args): + cns = isl.Constraint.equality_from_aff( + aff_from_expr(set_space, var(uarg_name+"'") - arg_val)) - footprint_contrib = restr_rhs_map.domain() - if footprint is None: - footprint = footprint_contrib + cns_map = isl.BasicMap.from_constraint(cns) + if footprint_map is None: + footprint_map = cns_map else: - footprint = footprint.union(footprint_contrib) - - found_good_unifier = True - - # }}} - - if not found_good_unifier: - raise RuntimeError("No valid unifier for '%s' and '%s'" - % (csed.cse.child, uni_template)) - - uni_recs.append(unifier) + footprint_map = footprint_map.intersect(cns_map) - # {{{ check that matching_vars have a unique_value + footprint_map = footprint_map.move_dims( + dim_type.in_, 0, + dim_type.out, rn, ln) - csed.unif_var_dict = dict((lhs.name, rhs) - for lhs, rhs in unifier.equations) - for mv_name in matching_vars: - if mv_name in matching_var_values: - if matching_var_values[mv_name] != csed.unif_var_dict[mv_name]: - raise RuntimeError("two different expressions encountered " - "for matching variable '%s' in unification template '%s':" - "'%s' and '%s'" % ( - mv_name, uni_template, - matching_var_values[mv_name], csed.unif_var_dict[mv_name])) - else: - matching_var_values[mv_name] = csed.unif_var_dict[mv_name] + if global_footprint_map is None: + global_footprint_map = footprint_map + else: + global_footprint_map = global_footprint_map.union(footprint_map) # }}} - assert (footprint - .project_out_except(independent_inames, [dim_type.set]) - .is_bounded()) + processed_sweep_inames = list(processed_sweep_inames) - return footprint, matching_var_values, + global_footprint_map = global_footprint_map.intersect_range(kernel.domain) + # move non-sweep-dimensions into parameter space + sweep_footprint_map = global_footprint_map.coalesce() + for iname in kernel.all_inames(): + if iname not in processed_sweep_inames: + sp = sweep_footprint_map.get_space() + dt, idx = sp.get_var_dict()[iname] + sweep_footprint_map = sweep_footprint_map.move_dims( + dim_type.param, sp.dim(dim_type.param), + dt, idx, 1) + # compute bounding boxes to each set of parameters + sfm_dom = sweep_footprint_map.domain().coalesce() + if not sfm_dom.is_bounded(): + raise RuntimeError("In precomputation of substitution '%s': " + "sweep did not result in a bounded footprint" + % subst_name) -def make_compute_insn(kernel, cse_tag, uni_template, - target_var_name, target_var_base_indices, - independent_inames, ind_iname_to_tag, insn): + from loopy.kernel import find_var_base_indices_and_shape_from_inames + base_indices, shape = find_var_base_indices_and_shape_from_inames( + sfm_dom, [uarg+"'" for uarg in unique_new_arg_names], + kernel.cache_manager) - # {{{ decide whether to force a dep + # compute augmented domain - from loopy.symbolic import IndexVariableFinder - dependencies = IndexVariableFinder( - include_reduction_inames=False)(uni_template) + # {{{ subtract off the base indices + # add the new, base-0 as new in dimensions - parent_inames = kernel.insn_inames(insn) | insn.reduction_inames() - #print dependencies, parent_inames - #assert dependencies <= parent_inames + sp = global_footprint_map.get_space() + tgt_idx = sp.dim(dim_type.out) - for iname in parent_inames: - if iname in independent_inames: - tag = ind_iname_to_tag[iname] - else: - tag = kernel.iname_to_tag.get(iname) + n_args = len(unique_new_arg_names) - check_cse_iname_deps( - iname, independent_inames, tag, dependencies, cse_tag, uni_template) + aug_domain = global_footprint_map.move_dims( + dim_type.out, tgt_idx, + dim_type.in_, 0, + n_args).range().coalesce() - # }}} + aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, n_args) + for i, name in enumerate(unique_new_arg_names): + aug_domain = aug_domain.set_dim_name(dim_type.set, tgt_idx+i, name) - assignee = var(target_var_name) + # index layout now: + # <....out.....> (tgt_idx) - if independent_inames: - assignee = assignee[tuple( - var(iname)-bi - for iname, bi in zip(independent_inames, target_var_base_indices) - )] + from loopy.symbolic import aff_from_expr + for uarg_name, bi in zip(unique_new_arg_names, base_indices): + cns = isl.Constraint.equality_from_aff( + aff_from_expr(aug_domain.get_space(), + var(uarg_name) - (var(uarg_name+"'") - bi))) - insn_prefix = cse_tag - if insn_prefix is None: - insn_prefix = "cse" - from loopy.kernel import Instruction - return Instruction( - id=kernel.make_unique_instruction_id(based_on=insn_prefix+"_compute"), - assignee=assignee, - expression=uni_template) + aug_domain = aug_domain.add_constraint(cns) + aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+n_args, n_args) + aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+n_args, n_args) + base_indices_2, shape_2 = find_var_base_indices_and_shape_from_inames( + aug_domain, unique_new_arg_names, + kernel.cache_manager) + assert base_indices_2 == [0] * n_args + assert shape_2 == shape -def realize_cse(kernel, cse_tag, dtype, independent_inames=[], - uni_template=None, ind_iname_to_tag={}, new_inames=None, default_tag="l.auto"): - """ - :arg independent_inames: which inames are supposed to be separate loops - in the CSE. Also determines index order of temporary array. - The variables in independent_inames refer to the unification - template. - :arg uni_template: An expression against which all targeted subexpressions - must unify + return aug_domain, base_indices, shape - If None, a unification template will be chosen from among the targeted - CSEs. That CSE is chosen to depend on all the variables in - *independent_inames*. It is an error if no such expression can be - found. - May contain '*' wildcards that will have to match exactly across all - unifications. - Process: - - Find all targeted CSEs. - - Find *uni_template* as described above. +def simplify_via_aff(space, expr): + from loopy.symbolic import aff_from_expr, aff_to_expr + return aff_to_expr(aff_from_expr(space, expr)) - - Turn all wildcards in *uni_template* into matching-relevant (but not - independent, in the sense of *independent_inames*) variables. - - Unify the CSEs with the unification template, detecting mappings - of template variables to variables used in the CSE. - - Find the (union) footprint of the CSEs in terms of the - *independent_inames*. - - Augment the kernel domain by that footprint and generate the fetch - instruction. +def precompute(kernel, subst_name, dtype, sweep_inames=[], + new_arg_names=None, arg_name_to_tag={}, default_tag="l.auto"): - - Replace the CSEs according to the mapping detected in unification. - """ + subst = kernel.substitutions[subst_name] + arg_names = subst.arguments + subst_expr = subst.expression - newly_created_var_names = set() + # {{{ gather up invocations - # {{{ replace any wildcards in uni_template with new variables + invocation_descriptors = [] + invocation_arg_deps = set() - if isinstance(uni_template, str): - from pymbolic import parse - uni_template = parse(uni_template) + def gather_substs(expr, name, args, rec): + arg_deps = get_dependencies(args) + if not arg_deps <= kernel.all_inames(): + raise RuntimeError("CSE arguments in '%s' do not consist " + "exclusively of inames" % expr) - def get_unique_var_name(): - if cse_tag is None: - based_on = "cse_wc" - else: - based_on = cse_tag+"_wc" + invocation_arg_deps.update(arg_deps) - result = kernel.make_unique_var_name( - based_on=based_on, extra_used_vars=newly_created_var_names) - newly_created_var_names.add(result) - return result + invocation_descriptors.append( + InvocationDescriptor(expr=expr, args=args)) + return expr - if uni_template is not None: - from loopy.symbolic import WildcardToUniqueVariableMapper - wc_map = WildcardToUniqueVariableMapper(get_unique_var_name) - uni_template = wc_map(uni_template) + from loopy.symbolic import SubstitutionCallbackMapper + scm = SubstitutionCallbackMapper([subst_name], gather_substs) + for insn in kernel.instructions: + scm(insn.expression) + for s in kernel.substitutions.itervalues(): + if s is not subst: + scm(s.expression) + + allowable_sweep_inames = invocation_arg_deps | set(arg_names) + if not set(sweep_inames) <= allowable_sweep_inames: + raise RuntimeError("independent iname(s) '%s' do not occur as arg names " + "of subsitution rule or in arguments of invocation" % (",".join( + set(sweep_inames)-allowable_sweep_inames))) # }}} # {{{ process ind_iname_to_tag argument - ind_iname_to_tag = ind_iname_to_tag.copy() + arg_name_to_tag = arg_name_to_tag.copy() from loopy.kernel import parse_tag default_tag = parse_tag(default_tag) - for iname in independent_inames: - ind_iname_to_tag.setdefault(iname, default_tag) + for iname in arg_names: + arg_name_to_tag.setdefault(iname, default_tag) - if not set(ind_iname_to_tag.iterkeys()) <= set(independent_inames): - raise RuntimeError("tags for non-new inames may not be passed") + if not set(arg_name_to_tag.iterkeys()) <= set(arg_names): + raise RuntimeError("tags for non-argument names may not be passed") # here, all information is consolidated into ind_iname_to_tag # }}} - # {{{ gather cse descriptors - - cse_descriptors = [] - - def gather_cses(cse, rec): - if cse.prefix != cse_tag: - rec(cse.child) - return - - cse_descriptors.append( - CSEDescriptor(insn=insn, cse=cse)) - # can't nest, don't recurse - - from loopy.symbolic import CSECallbackMapper - cse_cb_mapper = CSECallbackMapper(gather_cses) - - for insn in kernel.instructions: - cse_cb_mapper(insn.expression) - - # }}} - - # {{{ find/pick a unification template - - if not cse_descriptors: - raise RuntimeError("no CSEs tagged '%s' found" % cse_tag) - - if uni_template is None: - for csed in cse_descriptors: - if set(independent_inames) <= get_dependencies(csed.cse.child): - # pick the first cse that has the required inames as the unification template - uni_template = csed.cse.child - break - - if uni_template is None: - raise RuntimeError("could not find a suitable unification template that depends on " - "inames '%s'" % ",".join(independent_inames)) - - # }}} - - # {{{ make sure that independent inames and kernel inames do not overlap - - # (and substitute in uni_template if any variable name changes are necessary) - - if set(independent_inames) & kernel.all_inames(): - old_to_new = {} - - new_independent_inames = [] - new_ind_iname_to_tag = {} - for i, iname in enumerate(independent_inames): - if iname in kernel.all_inames(): - based_on = iname - if new_inames is not None and i < len(new_inames): - based_on = new_inames[i] - elif cse_tag is not None: - based_on = "%s_%s" % (iname, cse_tag) - - new_iname = kernel.make_unique_var_name( - based_on=based_on, extra_used_vars=newly_created_var_names) - old_to_new[iname] = var(new_iname) - newly_created_var_names.add(new_iname) - new_independent_inames.append(new_iname) - new_ind_iname_to_tag[new_iname] = ind_iname_to_tag[iname] - else: - new_independent_inames.append(iname) - new_ind_iname_to_tag[iname] = ind_iname_to_tag[iname] - - independent_inames = new_independent_inames - ind_iname_to_tag = new_ind_iname_to_tag - uni_template = ( - SubstitutionMapper(make_subst_func(old_to_new)) - (uni_template)) - - # }}} - - if not set(independent_inames) <= get_dependencies(uni_template): - raise RuntimeError("independent iname(s) '%s' do not occur in unification " - "template" % (",".join( - set(independent_inames)-get_dependencies(uni_template)))) + newly_created_var_names = set() - # {{{ deal with iname deps of uni_template that are not independent_inames + # {{{ make sure that new - # (We call these 'matching_vars', because they have to match exactly in - # every CSE. As above, they might need to be renamed to make them unique - # within the kernel.) + # (and substitute in subst_expressions if any variable name changes are necessary) - matching_vars = [] old_to_new = {} - for iname in (get_dependencies(uni_template) - - set(independent_inames) - - kernel.non_iname_variable_names()): - if iname in kernel.all_inames(): - # need to rename to be unique - new_iname = kernel.make_unique_var_name( - based_on=iname, extra_used_vars=newly_created_var_names) - old_to_new[iname] = var(new_iname) - newly_created_var_names.add(new_iname) - matching_vars.append(new_iname) + unique_new_arg_names = [] + new_arg_name_to_tag = {} + for i, name in enumerate(arg_names): + new_name = None + + if new_arg_names is not None and i < len(new_arg_names): + new_name = new_arg_names[i] + if new_name in kernel.all_variable_names(): + raise RuntimeError("new name '%s' already exists" % new_name) + + if name in kernel.all_variable_names(): + based_on = "%s_%s" % (name, subst_name) + new_name = kernel.make_unique_var_name( + based_on=based_on, extra_used_vars=newly_created_var_names) + + if new_name is not None: + old_to_new[name] = var(new_name) + newly_created_var_names.add(new_name) + unique_new_arg_names.append(new_name) + new_arg_name_to_tag[new_name] = arg_name_to_tag[name] else: - matching_vars.append(iname) + unique_new_arg_names.append(name) + new_arg_name_to_tag[name] = arg_name_to_tag[name] - if old_to_new: - uni_template = ( - SubstitutionMapper(make_subst_func(old_to_new)) - (uni_template)) + arg_name_to_tag = new_arg_name_to_tag + subst_expr = ( + SubstitutionMapper(make_subst_func(old_to_new)) + (subst_expr)) # }}} @@ -445,61 +270,27 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], # (If there are independent inames, this adds extra dimensions to the domain.) - footprint, matching_var_values = process_cses(kernel, uni_template, - independent_inames, matching_vars, - cse_descriptors) - - if isinstance(footprint, isl.Set): - footprint = footprint.coalesce() - footprint_bsets = footprint.get_basic_sets() - if len(footprint_bsets) > 1: - raise NotImplementedError("CSE '%s' yielded a non-convex footprint" - % cse_tag) + new_domain, target_var_base_indices, target_var_shape = \ + get_footprint(kernel, subst_name, arg_names, unique_new_arg_names, + sweep_inames, invocation_descriptors) - footprint, = footprint_bsets + new_domain = new_domain.coalesce() + if isinstance(new_domain, isl.Set): + dom_bsets = new_domain.get_basic_sets() + if len(dom_bsets) > 1: + raise NotImplementedError("Substitution '%s' yielded a non-convex footprint" + % subst_name) - ndim = kernel.space.dim(dim_type.set) - footprint = footprint.insert_dims(dim_type.set, 0, ndim) - for i in range(ndim): - footprint = footprint.set_dim_name(dim_type.set, i, - kernel.space.get_dim_name(dim_type.set, i)) - - from islpy import align_spaces - new_domain = align_spaces(kernel.domain, footprint).intersect(footprint) - - # set matching vars equal to their unified value, eliminate them - from loopy.symbolic import aff_from_expr - - assert set(matching_var_values) == set(matching_vars) - - for var_name, value in matching_var_values.iteritems(): - cns = isl.Constraint.equality_from_aff( - aff_from_expr(new_domain.get_space(), var(var_name) - value)) - new_domain = new_domain.add_constraint(cns) - - new_domain = (new_domain - .eliminate(dim_type.set, - new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars)) - .remove_dims(dim_type.set, - new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars))) - new_domain = new_domain.remove_redundancies() + new_domain, = dom_bsets # }}} # {{{ set up temp variable - var_base = cse_tag - if var_base is None: - var_base = "cse" - target_var_name = kernel.make_unique_var_name(var_base) - - from loopy.kernel import (TemporaryVariable, - find_var_base_indices_and_shape_from_inames) + target_var_name = kernel.make_unique_var_name(based_on=subst_name, + extra_used_vars=newly_created_var_names) - target_var_base_indices, target_var_shape = \ - find_var_base_indices_and_shape_from_inames( - new_domain, independent_inames, - kernel.cache_manager) + from loopy.kernel import TemporaryVariable new_temporary_variables = kernel.temporary_variables.copy() new_temporary_variables[target_var_name] = TemporaryVariable( @@ -511,56 +302,63 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], # }}} - mv_subst = SubstitutionMapper(make_subst_func( - dict((mv, matching_var_values[mv]) for mv in matching_vars))) + # {{{ set up compute insn - compute_insn = make_compute_insn( - kernel, cse_tag, mv_subst(uni_template), - target_var_name, target_var_base_indices, - independent_inames, ind_iname_to_tag, - # pick one insn at random for dep check - cse_descriptors[0].insn) + assignee = var(target_var_name) + + if unique_new_arg_names: + assignee = assignee[tuple(var(iname) for iname in unique_new_arg_names)] + + from loopy.kernel import Instruction + compute_insn = Instruction( + id=kernel.make_unique_instruction_id(based_on=subst_name), + assignee=assignee, + expression=subst_expr) - # {{{ substitute variable references into instructions + # }}} + + # {{{ substitute rule into instructions - def subst_cses(cse, rec): + def do_substs(expr, name, args, rec): found = False - for csed in cse_descriptors: - if cse is csed.cse: + for invdesc in invocation_descriptors: + if expr is invdesc.expr: found = True break if not found: - from pymbolic.primitives import CommonSubexpression - return CommonSubexpression( - rec(cse.child), cse.prefix) + return - indices = [csed.unif_var_dict[iname]-bi - for iname, bi in zip(independent_inames, target_var_base_indices)] + args = [simplify_via_aff(new_domain.get_space(), arg-bi) + for arg, bi in zip(args, target_var_base_indices)] new_outer_expr = var(target_var_name) - if indices: - new_outer_expr = new_outer_expr[tuple(indices)] + if args: + new_outer_expr = new_outer_expr[tuple(args)] return new_outer_expr # can't nest, don't recurse - cse_cb_mapper = CSECallbackMapper(subst_cses) - new_insns = [compute_insn] + sub_map = SubstitutionCallbackMapper([subst_name], do_substs) for insn in kernel.instructions: - new_expr = cse_cb_mapper(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) + new_insns.append(insn.copy(expression=sub_map(insn.expression))) # }}} new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag.update(ind_iname_to_tag) + new_iname_to_tag.update(arg_name_to_tag) + + new_substs = dict( + (s.name, s.copy(expression=sub_map(subst.expression))) + for s in kernel.substitutions.itervalues()) + del new_substs[subst_name] return kernel.copy( domain=new_domain, instructions=new_insns, + substitutions=new_substs, temporary_variables=new_temporary_variables, iname_to_tag=new_iname_to_tag) diff --git a/loopy/kernel.py b/loopy/kernel.py index dfdae050229367911eece85980090724bede98ef..e8c80d36a2c24d952a97011570cb3d6ee847f4ec 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -217,6 +217,26 @@ class TemporaryVariable(Record): # }}} +# {{{ subsitution rule + +class SubstitutionRule(Record): + """ + :ivar name: + :ivar arguments: + :ivar expression: + """ + + def __init__(self, name, arguments, expression): + Record.__init__(self, + name=name, arguments=arguments, expression=expression) + + def __str__(self): + return "%s(%s) := %s" % ( + self.name, ", ".join(self.arguments), self.expression) + + +# }}} + # {{{ instruction class Instruction(Record): @@ -451,6 +471,8 @@ class LoopKernel(Record): :ivar local_sizes: A dictionary from integers to integers, mapping workgroup axes to ther sizes, e.g. *{0: 16}* forces axis 0 to be length 16. + :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule` + objects :ivar cache_manager: @@ -458,8 +480,6 @@ class LoopKernel(Record): finished: :ivar iname_to_tag_requests: - :ivar cses: a mapping from CSE names to tuples (arg_names, expr). - :ivar substitutions: a mapping from CSE names to tuples (arg_names, expr). """ def __init__(self, device, domain, instructions, args=None, schedule=None, @@ -468,7 +488,7 @@ class LoopKernel(Record): iname_slab_increments={}, temporary_variables={}, local_sizes={}, - iname_to_tag={}, iname_to_tag_requests=None, cses={}, substitutions={}, + iname_to_tag={}, iname_to_tag_requests=None, substitutions={}, cache_manager=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. @@ -489,7 +509,7 @@ class LoopKernel(Record): INAME_ENTRY_RE = re.compile( r"^\s*(?P\w+)\s*(?:\:\s*(?P[\w.]+))?\s*$") - LABEL_DEP_RE = re.compile( + INSN_RE = re.compile( r"^\s*(?:(?P