diff --git a/loopy/__init__.py b/loopy/__init__.py index e7d8ce637967a674f73aa1ada6bae6b6f098cef4..4d296fe4b0a6898040b564a59aacf3101cc4375b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -23,20 +23,23 @@ class LoopyAdvisory(UserWarning): from loopy.kernel import ScalarArg, ArrayArg, ConstantArrayArg, ImageArg from loopy.kernel import AutoFitLocalIndexTag, get_dot_dependency_graph -from loopy.cse import realize_cse +from loopy.subst import extract_subst, apply_subst +from loopy.cse import precompute from loopy.preprocess import preprocess_kernel from loopy.schedule import generate_loop_schedules from loopy.codegen import generate_code from loopy.compiled import CompiledKernel, drive_timing_run, auto_test_vs_seq from loopy.check import check_kernels -__all__ = ["ScalarArg", "ArrayArg", "ImageArg", +__all__ = ["ScalarArg", "ArrayArg", "ConstantArrayArg", "ImageArg", "get_dot_dependency_graph", "preprocess_kernel", "generate_loop_schedules", "generate_code", - "CompiledKernel", "drive_timing_run", "check_kernels", + "CompiledKernel", "drive_timing_run", "auto_test_vs_seq", "check_kernels", "make_kernel", "split_dimension", "join_dimensions", - "tag_dimensions", "realize_cse", "add_prefetch" + "tag_dimensions", + "extract_subst", "apply_subst", + "precompute", "add_prefetch" ] # }}} @@ -62,12 +65,7 @@ def make_kernel(*args, **kwargs): newly_created_vars = set() - from loopy.symbolic import ParametrizedSubstitutor - cse_sub = ParametrizedSubstitutor(knl.cses, wrap_cse=True) - subst_sub = ParametrizedSubstitutor(knl.substitutions, wrap_cse=False) - for insn in knl.instructions: - insn = insn.copy(expression=subst_sub(cse_sub(insn.expression))) # {{{ sanity checking @@ -190,8 +188,7 @@ def make_kernel(*args, **kwargs): domain=new_domain, temporary_variables=new_temp_vars, iname_to_tag=new_iname_to_tag, - iname_to_tag_requests=[], - cses={}) + iname_to_tag_requests=[]) # }}} @@ -276,6 +273,7 @@ def split_dimension(kernel, split_iname, inner_length, iname_slab_increments = kernel.iname_slab_increments.copy() iname_slab_increments[outer_iname] = slabs result = (kernel + .map_expressions(subst_mapper, exclude_instructions=True) .copy(domain=new_domain, iname_slab_increments=iname_slab_increments, instructions=new_insns, @@ -367,9 +365,9 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()): forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps)) for insn in kernel.instructions] - result = kernel.copy( - instructions=new_insns, - domain=new_domain) + result = (kernel + .map_expressions(subst_map, exclude_instructions=True) + .copy(instructions=new_insns, domain=new_domain)) return tag_dimensions(result, {new_iname: tag}) @@ -419,53 +417,47 @@ def tag_dimensions(kernel, iname_to_tag, force=False): # {{{ convenience: add_prefetch -def add_prefetch(kernel, var_name, fetch_dims=[], uni_template=None, - new_inames=None, default_tag="l.auto"): - used_cse_tags = set() - def map_cse(expr, rec): - used_cse_tags.add(expr.tag) - rec(expr.child) +def add_prefetch(kernel, var_name, sweep_dims, dim_args=None, + new_inames=None, default_tag="l.auto", rule_name=None): - def get_unique_cse_tag(): - from loopy.tools import generate_unique_possibilities - for cse_tag in generate_unique_possibilities(prefix="fetch_"+var_name): - if cse_tag not in used_cse_tags: - used_cse_tags.add(cse_tag) - return cse_tag + if rule_name is None: + rule_name = kernel.make_unique_subst_rule_name("%s_fetch" % var_name) - cse_tag = get_unique_cse_tag() + arg = kernel.arg_dict[var_name] - from loopy.symbolic import VariableFetchCSEMapper - vf_cse_mapper = VariableFetchCSEMapper(var_name, lambda: cse_tag) - kernel = kernel.copy(instructions=[ - insn.copy(expression=vf_cse_mapper(insn.expression)) - for insn in kernel.instructions]) + newly_created_vars = set() + parameters = [] + for i in range(len(arg.shape)): + based_on = "%s_i%d" % (var_name, i) + if dim_args is not None and i < len(dim_args): + based_on = dim_args[i] - if var_name in kernel.arg_dict: - dtype = kernel.arg_dict[var_name].dtype - else: - dtype = kernel.temporary_variables[var_name].dtype + par_name = kernel.make_unique_var_name(based_on=based_on, + extra_used_vars=newly_created_vars) + newly_created_vars.add(par_name) + parameters.append(par_name) - kernel = realize_cse(kernel, cse_tag, dtype, fetch_dims, uni_template=uni_template, - new_inames=new_inames, default_tag=default_tag) + from pymbolic import var + uni_template = var(var_name) + if len(parameters) > 1: + uni_template = uni_template[tuple(var(par_name) for par_name in parameters)] + elif len(parameters) == 1: + uni_template = uni_template[var(parameters[0])] + + kernel = extract_subst(kernel, rule_name, uni_template, parameters) + + new_fetch_dims = [] + for fd in sweep_dims: + if isinstance(fd, int): + new_fetch_dims.append(parameters[fd]) + else: + new_fetch_dims.append(fd) - return kernel + return precompute(kernel, rule_name, arg.dtype, sweep_dims, new_arg_names=dim_args, + default_tag=default_tag) # }}} -def remove_cses(kernel): - from loopy.symbolic import CSECallbackMapper - - def map_cse(expr, rec): - return expr.child - - new_insns = [] - for insn in kernel.instructions: - new_insns.append( - insn.copy( - expression=CSECallbackMapper(map_cse)(insn.expression))) - - return kernel.copy(instructions=new_insns) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0beaf47dbaf0958ee1c3e663151a165925a33a79..fb94fbecaf819537d829249197bf1c9756914207 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -62,7 +62,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): if lower_incr: assert lower_incr > 0 - lower_slab = ("initial", isl.Set.universe(kernel.space) + lower_slab = ("initial", isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( @@ -78,7 +78,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): if upper_incr: assert upper_incr > 0 - upper_slab = ("final", isl.Set.universe(kernel.space) + upper_slab = ("final", isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( @@ -98,7 +98,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): slabs.append(lower_slab) slabs.append(( ("bulk", - (isl.Set.universe(kernel.space) + (isl.BasicSet.universe(kernel.space) .add_constraint(lower_bulk_bound) .add_constraint(upper_bulk_bound))))) if upper_slab: @@ -108,7 +108,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): else: return [("bulk", - (isl.Set.universe(kernel.space) + (isl.BasicSet.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig)))] diff --git a/loopy/cse.py b/loopy/cse.py index ee22a8b8fcbe79f3943783dbefffba51db04515f..748ef458b443be1e154a9d8751805337376b2cf1 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -12,50 +12,8 @@ from pymbolic import var -def check_cse_iname_deps(iname, duplicate_inames, tag, dependencies, cse_tag, uni_template): - from loopy.kernel import (LocalIndexTagBase, GroupIndexTag, IlpTag) - - if isinstance(tag, LocalIndexTagBase): - kind = "l" - elif isinstance(tag, GroupIndexTag): - kind = "g" - elif isinstance(tag, IlpTag): - kind = "i" - else: - kind = "o" - - if iname not in duplicate_inames and iname in dependencies: - if kind == "i": - raise RuntimeError( - "When realizing CSE with tag '%s', encountered iname " - "'%s' which is depended upon by the CSE and tagged " - "'%s', but not duplicated. The CSE would " - "inherit this iname, which would lead to a write race. " - "A likely solution of this problem is to also duplicate this " - "iname." - % (cse_tag, iname, tag)) - - if iname in duplicate_inames and kind == "g": - raise RuntimeError("duplicating the iname '%s' into " - "group index axes is not helpful, as they cannot " - "collaborate in computing a local/private variable" - %iname) - - if iname in dependencies: - return - - # the iname is *not* a dependency of the fetch expression - if iname in duplicate_inames: - raise RuntimeError("duplicating an iname ('%s') " - "that the CSE ('%s') does not depend on " - "does not make sense" % (iname, uni_template)) - - - - -class CSEDescriptor(Record): - __slots__ = ["insn", "cse", "independent_inames", - "unif_var_dict"] +class InvocationDescriptor(Record): + __slots__ = ["expr", "args", ] @@ -78,366 +36,233 @@ def to_parameters_or_project_out(param_inames, set_inames, set): -def process_cses(kernel, uni_template, - independent_inames, matching_vars, cse_descriptors): - from loopy.symbolic import UnidirectionalUnifier - - ind_inames_set = set(independent_inames) - - uni_iname_list = independent_inames + matching_vars - footprint = None - - uni_recs = [] - matching_var_values = {} - - for csed in cse_descriptors: - # {{{ find unifier - - unif = UnidirectionalUnifier( - lhs_mapping_candidates=ind_inames_set | set(matching_vars)) - unifiers = unif(uni_template, csed.cse.child) - if not unifiers: - raise RuntimeError("Unable to unify " - "CSEs '%s' and '%s' (with lhs candidates '%s')" % ( - uni_template, csed.cse.child, - ",".join(unif.lhs_mapping_candidates), - )) - - # }}} - - found_good_unifier = False - - for unifier in unifiers: - # {{{ construct, check mapping - - map_space = kernel.space - ln = len(uni_iname_list) - rn = kernel.space.dim(dim_type.out) - - map_space = map_space.add_dims(dim_type.in_, ln) - for i, iname in enumerate(uni_iname_list): - map_space = map_space.set_dim_name(dim_type.in_, i, iname) - - set_space = map_space.move_dims( - dim_type.out, rn, - dim_type.in_, 0, ln).range() - - var_map = None +def get_footprint(kernel, subst_name, arg_names, unique_new_arg_names, + sweep_inames, invocation_descriptors): + global_footprint_map = None - rhs_deps = set() + processed_sweep_inames = set() - from loopy.symbolic import aff_from_expr - for lhs, rhs in unifier.equations: - cns = isl.Constraint.equality_from_aff( - aff_from_expr(set_space, lhs - rhs)) + for invdesc in invocation_descriptors: - rhs_deps.update(get_dependencies(rhs)) - - cns_map = isl.BasicMap.from_constraint(cns) - if var_map is None: - var_map = cns_map - else: - var_map = var_map.intersect(cns_map) + for iname in sweep_inames: + if iname in arg_names: + arg_idx = arg_names.index(iname) + processed_sweep_inames.add( + get_dependencies(invdesc.args[arg_idx])) + else: + processed_sweep_inames.add(iname) - var_map = var_map.move_dims( - dim_type.in_, 0, - dim_type.out, rn, ln) + # {{{ construct, check mapping - restr_rhs_map = ( - isl.Map.from_basic_map(var_map) - .intersect_range(kernel.domain)) + map_space = kernel.space + ln = len(unique_new_arg_names) + rn = kernel.space.dim(dim_type.out) - # Sanity check: If the range of the map does not recover the - # domain of the expression, the unifier must have been no - # good. - if restr_rhs_map.range() != kernel.domain: - continue + map_space = map_space.add_dims(dim_type.in_, ln) + for i, iname in enumerate(unique_new_arg_names): + map_space = map_space.set_dim_name(dim_type.in_, i, iname+"'") - restr_rhs_map = restr_rhs_map.project_out_except( - rhs_deps, [dim_type.out]) + set_space = map_space.move_dims( + dim_type.out, rn, + dim_type.in_, 0, ln).range() - # Sanity check: Injectivity here means that unique lead indices - # can be found for each + footprint_map = None - if not restr_rhs_map.is_injective(): - raise RuntimeError("In CSEs '%s' and '%s': " - "cannot find lead indices uniquely" - % (uni_template, csed.cse.child)) + from loopy.symbolic import aff_from_expr + for uarg_name, arg_val in zip(unique_new_arg_names, invdesc.args): + cns = isl.Constraint.equality_from_aff( + aff_from_expr(set_space, var(uarg_name+"'") - arg_val)) - footprint_contrib = restr_rhs_map.domain() - if footprint is None: - footprint = footprint_contrib + cns_map = isl.BasicMap.from_constraint(cns) + if footprint_map is None: + footprint_map = cns_map else: - footprint = footprint.union(footprint_contrib) - - found_good_unifier = True - - # }}} - - if not found_good_unifier: - raise RuntimeError("No valid unifier for '%s' and '%s'" - % (csed.cse.child, uni_template)) - - uni_recs.append(unifier) + footprint_map = footprint_map.intersect(cns_map) - # {{{ check that matching_vars have a unique_value + footprint_map = footprint_map.move_dims( + dim_type.in_, 0, + dim_type.out, rn, ln) - csed.unif_var_dict = dict((lhs.name, rhs) - for lhs, rhs in unifier.equations) - for mv_name in matching_vars: - if mv_name in matching_var_values: - if matching_var_values[mv_name] != csed.unif_var_dict[mv_name]: - raise RuntimeError("two different expressions encountered " - "for matching variable '%s' in unification template '%s':" - "'%s' and '%s'" % ( - mv_name, uni_template, - matching_var_values[mv_name], csed.unif_var_dict[mv_name])) - else: - matching_var_values[mv_name] = csed.unif_var_dict[mv_name] + if global_footprint_map is None: + global_footprint_map = footprint_map + else: + global_footprint_map = global_footprint_map.union(footprint_map) # }}} - assert (footprint - .project_out_except(independent_inames, [dim_type.set]) - .is_bounded()) + processed_sweep_inames = list(processed_sweep_inames) - return footprint, matching_var_values, + global_footprint_map = global_footprint_map.intersect_range(kernel.domain) + # move non-sweep-dimensions into parameter space + sweep_footprint_map = global_footprint_map.coalesce() + for iname in kernel.all_inames(): + if iname not in processed_sweep_inames: + sp = sweep_footprint_map.get_space() + dt, idx = sp.get_var_dict()[iname] + sweep_footprint_map = sweep_footprint_map.move_dims( + dim_type.param, sp.dim(dim_type.param), + dt, idx, 1) + # compute bounding boxes to each set of parameters + sfm_dom = sweep_footprint_map.domain().coalesce() + if not sfm_dom.is_bounded(): + raise RuntimeError("In precomputation of substitution '%s': " + "sweep did not result in a bounded footprint" + % subst_name) -def make_compute_insn(kernel, cse_tag, uni_template, - target_var_name, target_var_base_indices, - independent_inames, ind_iname_to_tag, insn): + from loopy.kernel import find_var_base_indices_and_shape_from_inames + base_indices, shape = find_var_base_indices_and_shape_from_inames( + sfm_dom, [uarg+"'" for uarg in unique_new_arg_names], + kernel.cache_manager) - # {{{ decide whether to force a dep + # compute augmented domain - from loopy.symbolic import IndexVariableFinder - dependencies = IndexVariableFinder( - include_reduction_inames=False)(uni_template) + # {{{ subtract off the base indices + # add the new, base-0 as new in dimensions - parent_inames = kernel.insn_inames(insn) | insn.reduction_inames() - #print dependencies, parent_inames - #assert dependencies <= parent_inames + sp = global_footprint_map.get_space() + tgt_idx = sp.dim(dim_type.out) - for iname in parent_inames: - if iname in independent_inames: - tag = ind_iname_to_tag[iname] - else: - tag = kernel.iname_to_tag.get(iname) + n_args = len(unique_new_arg_names) - check_cse_iname_deps( - iname, independent_inames, tag, dependencies, cse_tag, uni_template) + aug_domain = global_footprint_map.move_dims( + dim_type.out, tgt_idx, + dim_type.in_, 0, + n_args).range().coalesce() - # }}} + aug_domain = aug_domain.insert_dims(dim_type.set, tgt_idx, n_args) + for i, name in enumerate(unique_new_arg_names): + aug_domain = aug_domain.set_dim_name(dim_type.set, tgt_idx+i, name) - assignee = var(target_var_name) + # index layout now: + # <....out.....> (tgt_idx) <base-0 args> <args> - if independent_inames: - assignee = assignee[tuple( - var(iname)-bi - for iname, bi in zip(independent_inames, target_var_base_indices) - )] + from loopy.symbolic import aff_from_expr + for uarg_name, bi in zip(unique_new_arg_names, base_indices): + cns = isl.Constraint.equality_from_aff( + aff_from_expr(aug_domain.get_space(), + var(uarg_name) - (var(uarg_name+"'") - bi))) - insn_prefix = cse_tag - if insn_prefix is None: - insn_prefix = "cse" - from loopy.kernel import Instruction - return Instruction( - id=kernel.make_unique_instruction_id(based_on=insn_prefix+"_compute"), - assignee=assignee, - expression=uni_template) + aug_domain = aug_domain.add_constraint(cns) + aug_domain = aug_domain.eliminate(dim_type.set, tgt_idx+n_args, n_args) + aug_domain = aug_domain.remove_dims(dim_type.set, tgt_idx+n_args, n_args) + base_indices_2, shape_2 = find_var_base_indices_and_shape_from_inames( + aug_domain, unique_new_arg_names, + kernel.cache_manager) + assert base_indices_2 == [0] * n_args + assert shape_2 == shape -def realize_cse(kernel, cse_tag, dtype, independent_inames=[], - uni_template=None, ind_iname_to_tag={}, new_inames=None, default_tag="l.auto"): - """ - :arg independent_inames: which inames are supposed to be separate loops - in the CSE. Also determines index order of temporary array. - The variables in independent_inames refer to the unification - template. - :arg uni_template: An expression against which all targeted subexpressions - must unify + return aug_domain, base_indices, shape - If None, a unification template will be chosen from among the targeted - CSEs. That CSE is chosen to depend on all the variables in - *independent_inames*. It is an error if no such expression can be - found. - May contain '*' wildcards that will have to match exactly across all - unifications. - Process: - - Find all targeted CSEs. - - Find *uni_template* as described above. +def simplify_via_aff(space, expr): + from loopy.symbolic import aff_from_expr, aff_to_expr + return aff_to_expr(aff_from_expr(space, expr)) - - Turn all wildcards in *uni_template* into matching-relevant (but not - independent, in the sense of *independent_inames*) variables. - - Unify the CSEs with the unification template, detecting mappings - of template variables to variables used in the CSE. - - Find the (union) footprint of the CSEs in terms of the - *independent_inames*. - - Augment the kernel domain by that footprint and generate the fetch - instruction. +def precompute(kernel, subst_name, dtype, sweep_inames=[], + new_arg_names=None, arg_name_to_tag={}, default_tag="l.auto"): - - Replace the CSEs according to the mapping detected in unification. - """ + subst = kernel.substitutions[subst_name] + arg_names = subst.arguments + subst_expr = subst.expression - newly_created_var_names = set() + # {{{ gather up invocations - # {{{ replace any wildcards in uni_template with new variables + invocation_descriptors = [] + invocation_arg_deps = set() - if isinstance(uni_template, str): - from pymbolic import parse - uni_template = parse(uni_template) + def gather_substs(expr, name, args, rec): + arg_deps = get_dependencies(args) + if not arg_deps <= kernel.all_inames(): + raise RuntimeError("CSE arguments in '%s' do not consist " + "exclusively of inames" % expr) - def get_unique_var_name(): - if cse_tag is None: - based_on = "cse_wc" - else: - based_on = cse_tag+"_wc" + invocation_arg_deps.update(arg_deps) - result = kernel.make_unique_var_name( - based_on=based_on, extra_used_vars=newly_created_var_names) - newly_created_var_names.add(result) - return result + invocation_descriptors.append( + InvocationDescriptor(expr=expr, args=args)) + return expr - if uni_template is not None: - from loopy.symbolic import WildcardToUniqueVariableMapper - wc_map = WildcardToUniqueVariableMapper(get_unique_var_name) - uni_template = wc_map(uni_template) + from loopy.symbolic import SubstitutionCallbackMapper + scm = SubstitutionCallbackMapper([subst_name], gather_substs) + for insn in kernel.instructions: + scm(insn.expression) + for s in kernel.substitutions.itervalues(): + if s is not subst: + scm(s.expression) + + allowable_sweep_inames = invocation_arg_deps | set(arg_names) + if not set(sweep_inames) <= allowable_sweep_inames: + raise RuntimeError("independent iname(s) '%s' do not occur as arg names " + "of subsitution rule or in arguments of invocation" % (",".join( + set(sweep_inames)-allowable_sweep_inames))) # }}} # {{{ process ind_iname_to_tag argument - ind_iname_to_tag = ind_iname_to_tag.copy() + arg_name_to_tag = arg_name_to_tag.copy() from loopy.kernel import parse_tag default_tag = parse_tag(default_tag) - for iname in independent_inames: - ind_iname_to_tag.setdefault(iname, default_tag) + for iname in arg_names: + arg_name_to_tag.setdefault(iname, default_tag) - if not set(ind_iname_to_tag.iterkeys()) <= set(independent_inames): - raise RuntimeError("tags for non-new inames may not be passed") + if not set(arg_name_to_tag.iterkeys()) <= set(arg_names): + raise RuntimeError("tags for non-argument names may not be passed") # here, all information is consolidated into ind_iname_to_tag # }}} - # {{{ gather cse descriptors - - cse_descriptors = [] - - def gather_cses(cse, rec): - if cse.prefix != cse_tag: - rec(cse.child) - return - - cse_descriptors.append( - CSEDescriptor(insn=insn, cse=cse)) - # can't nest, don't recurse - - from loopy.symbolic import CSECallbackMapper - cse_cb_mapper = CSECallbackMapper(gather_cses) - - for insn in kernel.instructions: - cse_cb_mapper(insn.expression) - - # }}} - - # {{{ find/pick a unification template - - if not cse_descriptors: - raise RuntimeError("no CSEs tagged '%s' found" % cse_tag) - - if uni_template is None: - for csed in cse_descriptors: - if set(independent_inames) <= get_dependencies(csed.cse.child): - # pick the first cse that has the required inames as the unification template - uni_template = csed.cse.child - break - - if uni_template is None: - raise RuntimeError("could not find a suitable unification template that depends on " - "inames '%s'" % ",".join(independent_inames)) - - # }}} - - # {{{ make sure that independent inames and kernel inames do not overlap - - # (and substitute in uni_template if any variable name changes are necessary) - - if set(independent_inames) & kernel.all_inames(): - old_to_new = {} - - new_independent_inames = [] - new_ind_iname_to_tag = {} - for i, iname in enumerate(independent_inames): - if iname in kernel.all_inames(): - based_on = iname - if new_inames is not None and i < len(new_inames): - based_on = new_inames[i] - elif cse_tag is not None: - based_on = "%s_%s" % (iname, cse_tag) - - new_iname = kernel.make_unique_var_name( - based_on=based_on, extra_used_vars=newly_created_var_names) - old_to_new[iname] = var(new_iname) - newly_created_var_names.add(new_iname) - new_independent_inames.append(new_iname) - new_ind_iname_to_tag[new_iname] = ind_iname_to_tag[iname] - else: - new_independent_inames.append(iname) - new_ind_iname_to_tag[iname] = ind_iname_to_tag[iname] - - independent_inames = new_independent_inames - ind_iname_to_tag = new_ind_iname_to_tag - uni_template = ( - SubstitutionMapper(make_subst_func(old_to_new)) - (uni_template)) - - # }}} - - if not set(independent_inames) <= get_dependencies(uni_template): - raise RuntimeError("independent iname(s) '%s' do not occur in unification " - "template" % (",".join( - set(independent_inames)-get_dependencies(uni_template)))) + newly_created_var_names = set() - # {{{ deal with iname deps of uni_template that are not independent_inames + # {{{ make sure that new - # (We call these 'matching_vars', because they have to match exactly in - # every CSE. As above, they might need to be renamed to make them unique - # within the kernel.) + # (and substitute in subst_expressions if any variable name changes are necessary) - matching_vars = [] old_to_new = {} - for iname in (get_dependencies(uni_template) - - set(independent_inames) - - kernel.non_iname_variable_names()): - if iname in kernel.all_inames(): - # need to rename to be unique - new_iname = kernel.make_unique_var_name( - based_on=iname, extra_used_vars=newly_created_var_names) - old_to_new[iname] = var(new_iname) - newly_created_var_names.add(new_iname) - matching_vars.append(new_iname) + unique_new_arg_names = [] + new_arg_name_to_tag = {} + for i, name in enumerate(arg_names): + new_name = None + + if new_arg_names is not None and i < len(new_arg_names): + new_name = new_arg_names[i] + if new_name in kernel.all_variable_names(): + raise RuntimeError("new name '%s' already exists" % new_name) + + if name in kernel.all_variable_names(): + based_on = "%s_%s" % (name, subst_name) + new_name = kernel.make_unique_var_name( + based_on=based_on, extra_used_vars=newly_created_var_names) + + if new_name is not None: + old_to_new[name] = var(new_name) + newly_created_var_names.add(new_name) + unique_new_arg_names.append(new_name) + new_arg_name_to_tag[new_name] = arg_name_to_tag[name] else: - matching_vars.append(iname) + unique_new_arg_names.append(name) + new_arg_name_to_tag[name] = arg_name_to_tag[name] - if old_to_new: - uni_template = ( - SubstitutionMapper(make_subst_func(old_to_new)) - (uni_template)) + arg_name_to_tag = new_arg_name_to_tag + subst_expr = ( + SubstitutionMapper(make_subst_func(old_to_new)) + (subst_expr)) # }}} @@ -445,61 +270,27 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], # (If there are independent inames, this adds extra dimensions to the domain.) - footprint, matching_var_values = process_cses(kernel, uni_template, - independent_inames, matching_vars, - cse_descriptors) - - if isinstance(footprint, isl.Set): - footprint = footprint.coalesce() - footprint_bsets = footprint.get_basic_sets() - if len(footprint_bsets) > 1: - raise NotImplementedError("CSE '%s' yielded a non-convex footprint" - % cse_tag) + new_domain, target_var_base_indices, target_var_shape = \ + get_footprint(kernel, subst_name, arg_names, unique_new_arg_names, + sweep_inames, invocation_descriptors) - footprint, = footprint_bsets + new_domain = new_domain.coalesce() + if isinstance(new_domain, isl.Set): + dom_bsets = new_domain.get_basic_sets() + if len(dom_bsets) > 1: + raise NotImplementedError("Substitution '%s' yielded a non-convex footprint" + % subst_name) - ndim = kernel.space.dim(dim_type.set) - footprint = footprint.insert_dims(dim_type.set, 0, ndim) - for i in range(ndim): - footprint = footprint.set_dim_name(dim_type.set, i, - kernel.space.get_dim_name(dim_type.set, i)) - - from islpy import align_spaces - new_domain = align_spaces(kernel.domain, footprint).intersect(footprint) - - # set matching vars equal to their unified value, eliminate them - from loopy.symbolic import aff_from_expr - - assert set(matching_var_values) == set(matching_vars) - - for var_name, value in matching_var_values.iteritems(): - cns = isl.Constraint.equality_from_aff( - aff_from_expr(new_domain.get_space(), var(var_name) - value)) - new_domain = new_domain.add_constraint(cns) - - new_domain = (new_domain - .eliminate(dim_type.set, - new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars)) - .remove_dims(dim_type.set, - new_domain.dim(dim_type.set)-len(matching_vars), len(matching_vars))) - new_domain = new_domain.remove_redundancies() + new_domain, = dom_bsets # }}} # {{{ set up temp variable - var_base = cse_tag - if var_base is None: - var_base = "cse" - target_var_name = kernel.make_unique_var_name(var_base) - - from loopy.kernel import (TemporaryVariable, - find_var_base_indices_and_shape_from_inames) + target_var_name = kernel.make_unique_var_name(based_on=subst_name, + extra_used_vars=newly_created_var_names) - target_var_base_indices, target_var_shape = \ - find_var_base_indices_and_shape_from_inames( - new_domain, independent_inames, - kernel.cache_manager) + from loopy.kernel import TemporaryVariable new_temporary_variables = kernel.temporary_variables.copy() new_temporary_variables[target_var_name] = TemporaryVariable( @@ -511,56 +302,63 @@ def realize_cse(kernel, cse_tag, dtype, independent_inames=[], # }}} - mv_subst = SubstitutionMapper(make_subst_func( - dict((mv, matching_var_values[mv]) for mv in matching_vars))) + # {{{ set up compute insn - compute_insn = make_compute_insn( - kernel, cse_tag, mv_subst(uni_template), - target_var_name, target_var_base_indices, - independent_inames, ind_iname_to_tag, - # pick one insn at random for dep check - cse_descriptors[0].insn) + assignee = var(target_var_name) + + if unique_new_arg_names: + assignee = assignee[tuple(var(iname) for iname in unique_new_arg_names)] + + from loopy.kernel import Instruction + compute_insn = Instruction( + id=kernel.make_unique_instruction_id(based_on=subst_name), + assignee=assignee, + expression=subst_expr) - # {{{ substitute variable references into instructions + # }}} + + # {{{ substitute rule into instructions - def subst_cses(cse, rec): + def do_substs(expr, name, args, rec): found = False - for csed in cse_descriptors: - if cse is csed.cse: + for invdesc in invocation_descriptors: + if expr is invdesc.expr: found = True break if not found: - from pymbolic.primitives import CommonSubexpression - return CommonSubexpression( - rec(cse.child), cse.prefix) + return - indices = [csed.unif_var_dict[iname]-bi - for iname, bi in zip(independent_inames, target_var_base_indices)] + args = [simplify_via_aff(new_domain.get_space(), arg-bi) + for arg, bi in zip(args, target_var_base_indices)] new_outer_expr = var(target_var_name) - if indices: - new_outer_expr = new_outer_expr[tuple(indices)] + if args: + new_outer_expr = new_outer_expr[tuple(args)] return new_outer_expr # can't nest, don't recurse - cse_cb_mapper = CSECallbackMapper(subst_cses) - new_insns = [compute_insn] + sub_map = SubstitutionCallbackMapper([subst_name], do_substs) for insn in kernel.instructions: - new_expr = cse_cb_mapper(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) + new_insns.append(insn.copy(expression=sub_map(insn.expression))) # }}} new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag.update(ind_iname_to_tag) + new_iname_to_tag.update(arg_name_to_tag) + + new_substs = dict( + (s.name, s.copy(expression=sub_map(subst.expression))) + for s in kernel.substitutions.itervalues()) + del new_substs[subst_name] return kernel.copy( domain=new_domain, instructions=new_insns, + substitutions=new_substs, temporary_variables=new_temporary_variables, iname_to_tag=new_iname_to_tag) diff --git a/loopy/kernel.py b/loopy/kernel.py index dfdae050229367911eece85980090724bede98ef..e8c80d36a2c24d952a97011570cb3d6ee847f4ec 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -217,6 +217,26 @@ class TemporaryVariable(Record): # }}} +# {{{ subsitution rule + +class SubstitutionRule(Record): + """ + :ivar name: + :ivar arguments: + :ivar expression: + """ + + def __init__(self, name, arguments, expression): + Record.__init__(self, + name=name, arguments=arguments, expression=expression) + + def __str__(self): + return "%s(%s) := %s" % ( + self.name, ", ".join(self.arguments), self.expression) + + +# }}} + # {{{ instruction class Instruction(Record): @@ -451,6 +471,8 @@ class LoopKernel(Record): :ivar local_sizes: A dictionary from integers to integers, mapping workgroup axes to ther sizes, e.g. *{0: 16}* forces axis 0 to be length 16. + :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule` + objects :ivar cache_manager: @@ -458,8 +480,6 @@ class LoopKernel(Record): finished: :ivar iname_to_tag_requests: - :ivar cses: a mapping from CSE names to tuples (arg_names, expr). - :ivar substitutions: a mapping from CSE names to tuples (arg_names, expr). """ def __init__(self, device, domain, instructions, args=None, schedule=None, @@ -468,7 +488,7 @@ class LoopKernel(Record): iname_slab_increments={}, temporary_variables={}, local_sizes={}, - iname_to_tag={}, iname_to_tag_requests=None, cses={}, substitutions={}, + iname_to_tag={}, iname_to_tag_requests=None, substitutions={}, cache_manager=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. @@ -489,7 +509,7 @@ class LoopKernel(Record): INAME_ENTRY_RE = re.compile( r"^\s*(?P<iname>\w+)\s*(?:\:\s*(?P<tag>[\w.]+))?\s*$") - LABEL_DEP_RE = re.compile( + INSN_RE = re.compile( r"^\s*(?:(?P<label>\w+):)?" "\s*(?:\[" "(?P<iname_deps_and_tags>[\s\w,:.]*)" @@ -499,6 +519,9 @@ class LoopKernel(Record): "\s*(?P<lhs>.+)\s*=\s*(?P<rhs>.+?)" "\s*?(?:\:\s*(?P<insn_deps>[\s\w,]+))?$" ) + SUBST_RE = re.compile( + r"^\s*(?P<lhs>.+)\s*:=\s*(?P<rhs>.+?)\s*$" + ) def parse_iname_and_tag_list(s): dup_entries = [ @@ -540,21 +563,28 @@ class LoopKernel(Record): "instance or a parseable string. got '%s' instead." % type(insn)) - label_dep_match = LABEL_DEP_RE.match(insn) - if label_dep_match is None: + insn_match = INSN_RE.match(insn) + subst_match = SUBST_RE.match(insn) + if insn_match is not None and subst_match is not None: raise RuntimeError("insn parse error") - groups = label_dep_match.groupdict() - if groups["label"] is not None: - label = groups["label"] + if insn_match is not None: + groups = insn_match.groupdict() + elif subst_match is not None: + groups = subst_match.groupdict() else: - label = "insn" + raise RuntimeError("insn parse error") lhs = parse(groups["lhs"]) from loopy.symbolic import FunctionToPrimitiveMapper rhs = FunctionToPrimitiveMapper()(parse(groups["rhs"])) - if label.lower() not in ["cse", "subst"]: + if insn_match is not None: + if groups["label"] is not None: + label = groups["label"] + else: + label = "insn" + if groups["insn_deps"] is not None: insn_deps = set(dep.strip() for dep in groups["insn_deps"].split(",")) else: @@ -587,43 +617,35 @@ class LoopKernel(Record): assignee=lhs, expression=rhs, temp_var_type=temp_var_type, duplicate_inames_and_tags=duplicate_inames_and_tags)) - else: - if groups["iname_deps_and_tags"] is not None: - raise RuntimeError("CSEs/substitutions cannot declare iname dependencies") - if groups["insn_deps"] is not None: - raise RuntimeError("CSEs/substitutions cannot declare instruction dependencies") - if groups["temp_var_type"] is not None: - raise RuntimeError("CSEs/substitutions cannot declare temporary storage") - + elif subst_match is not None: from pymbolic.primitives import Variable, Call if isinstance(lhs, Variable): - cse_name = lhs.name + subst_name = lhs.name arg_names = [] elif isinstance(lhs, Call): if not isinstance(lhs.function, Variable): - raise RuntimeError("Invalid CSE left-hand side") - cse_name = lhs.function.name + raise RuntimeError("Invalid substitution rule left-hand side") + subst_name = lhs.function.name arg_names = [] for arg in lhs.parameters: if not isinstance(arg, Variable): - raise RuntimeError("Invalid CSE left-hand side") + raise RuntimeError("Invalid substitution rule left-hand side") arg_names.append(arg.name) else: - raise RuntimeError("CSEs cannot declare temporary storage") + raise RuntimeError("Invalid substitution rule left-hand side") - if label.lower() == "cse": - cses[cse_name] = (arg_names, rhs) - else: - substitutions[cse_name] = (arg_names, rhs) + substitutions[subst_name] = SubstitutionRule( + name=subst_name, + arguments=arg_names, + expression=rhs) # }}} insns = [] - cses = cses.copy() - substituions = substitutions.copy() + substitutions = substitutions.copy() for insn in instructions: # must construct list one-by-one to facilitate unique id generation @@ -656,7 +678,7 @@ class LoopKernel(Record): local_sizes=local_sizes, iname_to_tag=iname_to_tag, iname_to_tag_requests=iname_to_tag_requests, - cses=cses, substitutions=substitutions, + substitutions=substitutions, cache_manager=cache_manager) def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): @@ -670,6 +692,12 @@ class LoopKernel(Record): if id_str not in used_ids: return id_str + def make_unique_subst_rule_name(self, based_on="subst"): + from loopy.tools import generate_unique_possibilities + for id_str in generate_unique_possibilities(based_on): + if id_str not in self.substitutions: + return id_str + @memoize_method def all_inames(self): from islpy import dim_type @@ -826,12 +854,16 @@ class LoopKernel(Record): insn.get_assignee_var_name() for insn in self.instructions) - def make_unique_var_name(self, based_on="var", extra_used_vars=set()): - used_vars = ( + @memoize_method + def all_variable_names(self): + return ( set(self.temporary_variables.iterkeys()) + | set(self.substitutions.iterkeys()) | set(arg.name for arg in self.args) - | set(self.iname_to_dim.keys()) - | extra_used_vars) + | set(self.iname_to_dim.keys())) + + def make_unique_var_name(self, based_on="var", extra_used_vars=set()): + used_vars = self.all_variable_names() | extra_used_vars from loopy.tools import generate_unique_possibilities for var_name in generate_unique_possibilities(based_on): @@ -1025,17 +1057,44 @@ class LoopKernel(Record): return result + def map_expressions(self, func, exclude_instructions=False): + if exclude_instructions: + new_insns = self.instructions + else: + new_insns = [insn.copy(expression=func(insn.expression)) + for insn in self.instructions] + + return self.copy( + instructions=new_insns, + substitutions=dict( + (subst.name, subst.copy(expression=func(subst.expression))) + for subst in self.substitutions.itervalues())) + def __str__(self): lines = [] + sep = 75*"-" + lines.append(sep) + lines.append("INAME-TO-TAG MAP:") for iname in sorted(self.all_inames()): lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname))) - lines.append("") + + lines.append(sep) + lines.append("DOMAIN:") lines.append(str(self.domain)) - lines.append("") + + if self.substitutions: + lines.append(sep) + lines.append("SUBSTIUTION RULES:") + for rule in self.substitutions.itervalues(): + lines.append(str(rule)) + + lines.append(sep) + lines.append("INSTRUCTIONS:") for insn in self.instructions: lines.append(str(insn)) lines.append(" [%s]" % ",".join(sorted(self.insn_inames(insn)))) + lines.append(sep) return "\n".join(lines) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3c84d835c832b3ce67fdcec01bd1d068429ada64..8419b9f217f2d159be69060aa76787104745cd00 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -642,6 +642,9 @@ def adjust_local_temp_var_storage(kernel): def preprocess_kernel(kernel): + from loopy.subst import apply_subst + kernel = apply_subst(kernel) + kernel = mark_local_temporaries(kernel) kernel = duplicate_reduction_inames(kernel) kernel = realize_reduction(kernel) diff --git a/loopy/schedule.py b/loopy/schedule.py index 0b1fb08fbe8d82a11f3f03d703e8a76eb03206b3..c1a1fdea0bd84db05bb08168c78c73f28809d405 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -604,18 +604,6 @@ def insert_barriers(kernel, schedule, level=0): # {{{ main scheduling entrypoint def generate_loop_schedules(kernel, loop_priority=[], debug=None): - # {{{ check that all CSEs have been realized - - from loopy.symbolic import CSECallbackMapper - - def map_cse(expr, rec): - raise RuntimeError("all CSEs must be realized before scheduling") - - for insn in kernel.instructions: - CSECallbackMapper(map_cse)(insn.expression) - - # }}} - from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) diff --git a/loopy/subst.py b/loopy/subst.py new file mode 100644 index 0000000000000000000000000000000000000000..0b56a9a3acbce2578f459d84d85e2de530bdd396 --- /dev/null +++ b/loopy/subst.py @@ -0,0 +1,203 @@ +from __future__ import division + +from loopy.symbolic import get_dependencies, SubstitutionMapper +from pymbolic.mapper.substitutor import make_subst_func + +from pytools import Record +from pymbolic import var + + + + + +class ExprDescriptor(Record): + __slots__ = ["insn", "expr", "unif_var_dict"] + + + + + +def extract_subst(kernel, subst_name, template, parameters): + """ + :arg template: An expression against which all targeted subexpressions + must unify + + If None, a unification template will be chosen from among the targeted + CSEs. That CSE is chosen to depend on all the variables in + *parameters*. It is an error if no such expression can be + found. + + May contain '*' wildcards that will have to match exactly across all + unifications. + """ + + newly_created_var_names = set() + + if isinstance(template, str): + from pymbolic import parse + template = parse(template) + + # {{{ replace any wildcards in template with new variables + + def get_unique_var_name(): + based_on = subst_name+"_wc" + + result = kernel.make_unique_var_name( + based_on=based_on, extra_used_vars=newly_created_var_names) + newly_created_var_names.add(result) + return result + + from loopy.symbolic import WildcardToUniqueVariableMapper + wc_map = WildcardToUniqueVariableMapper(get_unique_var_name) + template = wc_map(template) + + # }}} + + # {{{ deal with iname deps of template that are not independent_inames + + # (We call these 'matching_vars', because they have to match exactly in + # every CSE. As above, they might need to be renamed to make them unique + # within the kernel.) + + matching_vars = [] + old_to_new = {} + + for iname in (get_dependencies(template) + - set(parameters) + - kernel.non_iname_variable_names()): + if iname in kernel.all_inames(): + # need to rename to be unique + new_iname = kernel.make_unique_var_name( + based_on=iname, extra_used_vars=newly_created_var_names) + old_to_new[iname] = var(new_iname) + newly_created_var_names.add(new_iname) + matching_vars.append(new_iname) + else: + matching_vars.append(iname) + + if old_to_new: + template = ( + SubstitutionMapper(make_subst_func(old_to_new)) + (template)) + + # }}} + + # {{{ gather up expressions + + expr_descriptors = [] + + from loopy.symbolic import UnidirectionalUnifier + unif = UnidirectionalUnifier( + lhs_mapping_candidates=set(parameters) | set(matching_vars)) + + def gather_exprs(expr, mapper): + print expr + urecs = unif(template, expr) + + if urecs: + if len(urecs) > 1: + raise RuntimeError("ambiguous unification of '%s' with template '%s'" + % (expr, template)) + + urec, = urecs + + expr_descriptors.append( + ExprDescriptor( + insn=insn, + expr=expr, + unif_var_dict = dict((lhs.name, rhs) + for lhs, rhs in urec.equations))) + else: + mapper.fallback_mapper(expr) + # can't nest, don't recurse + + from loopy.symbolic import ( + CallbackMapper, WalkMapper, IdentityMapper) + dfmapper = CallbackMapper(gather_exprs, WalkMapper()) + + for insn in kernel.instructions: + dfmapper(insn.expression) + + for sr in kernel.substitutions.itervalues(): + dfmapper(sr.expression) + + # }}} + + if not expr_descriptors: + raise RuntimeError("no expressions matching '%s'" % template) + + # {{{ substitute rule into instructions + + def replace_exprs(expr, mapper): + found = False + for exprd in expr_descriptors: + if expr is exprd.expr: + found = True + break + + if not found: + return mapper.fallback_mapper(expr) + + args = [exprd.unif_var_dict[arg_name] + for arg_name in parameters] + + result = var(subst_name) + if args: + result = result(*args) + + return result + # can't nest, don't recurse + + cbmapper = CallbackMapper(replace_exprs, IdentityMapper()) + + new_insns = [] + + for insn in kernel.instructions: + new_expr = cbmapper(insn.expression) + new_insns.append(insn.copy(expression=new_expr)) + + from loopy.kernel import SubstitutionRule + new_substs = { + subst_name: SubstitutionRule( + name=subst_name, + arguments=parameters, + expression=template, + )} + + for subst in kernel.substitutions.itervalues(): + new_substs[subst.name] = subst.copy( + expression=cbmapper(insn.expression)) + + # }}} + + return kernel.copy( + instructions=new_insns, + substitutions=new_substs) + + + + +def apply_subst(kernel, subst_name=None): + if subst_name is None: + rules = kernel.substitutions + else: + rule = kernel.substitutions[subst_name] + rules = {rule.name: rule} + + from loopy.symbolic import ParametrizedSubstitutor + submap = ParametrizedSubstitutor(rules) + + if subst_name: + new_substs = kernel.substitutions.copy() + del new_substs[subst_name] + else: + new_substs = {} + + return (kernel + .copy(substitutions=new_substs) + .map_expressions(submap)) + + + + +# vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 63a564eaf43f01648ae4d21ab30ee54cc2cb2a68..d980de74a254ad8f3f3f05ad2569bbc61603c879 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -8,7 +8,10 @@ from pymbolic.primitives import AlgebraicLeaf from pymbolic.mapper import ( CombineMapper as CombineMapperBase, IdentityMapper as IdentityMapperBase, - RecursiveMapper) + RecursiveMapper, + WalkMapper as WalkMapperBase, + CallbackMapper as CallbackMapperBase, + ) from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE from pymbolic.mapper.substitutor import \ @@ -80,6 +83,13 @@ class IdentityMapperMixin(object): class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass +class WalkMapper(WalkMapperBase): + def map_reduction(self, expr): + self.rec(expr.expression) + +class CallbackMapper(CallbackMapperBase, IdentityMapper): + map_reduction = CallbackMapperBase.map_constant + class CombineMapper(CombineMapperBase): def map_reduction(self, expr): return self.rec(expr.expr) @@ -510,20 +520,6 @@ def constraint_to_expr(cns, except_name=None): # }}} -# {{{ CSE callback mapper - -class CSECallbackMapper(IdentityMapper): - def __init__(self, callback): - self.callback = callback - - def map_common_subexpression(self, expr): - result = self.callback(expr, self.rec) - if result is None: - return IdentityMapper.map_common_subexpression(self, expr) - return result - -# }}} - # {{{ Reduction callback mapper class ReductionCallbackMapper(IdentityMapper): @@ -579,82 +575,71 @@ class IndexVariableFinder(CombineMapper): # }}} -# {{{ variable-fetch CSE mapper - -class VariableFetchCSEMapper(IdentityMapper): - """Turns fetches of a given variable names into CSEs.""" - def __init__(self, var_name, cse_tag_getter): - self.var_name = var_name - self.cse_tag_getter = cse_tag_getter - - def map_variable(self, expr): - from pymbolic.primitives import CommonSubexpression - if expr.name == self.var_name: - return CommonSubexpression(expr, self.cse_tag_getter()) - else: - return IdentityMapper.map_variable(self, expr) - - def map_subscript(self, expr): - from pymbolic.primitives import CommonSubexpression, Variable, Subscript - if (isinstance(expr.aggregate, Variable) - and expr.aggregate.name == self.var_name): - return CommonSubexpression( - Subscript(expr.aggregate, self.rec(expr.index)), self.cse_tag_getter()) - else: - return IdentityMapper.map_subscript(self, expr) - -# }}} - # {{{ parametrized substitutor class ParametrizedSubstitutor(IdentityMapper): - def __init__(self, cses, wrap_cse): - """ - :arg cses: a mapping from CSE names to tuples (arg_names, expr). - :arg wrap_cse: flag: wrap substituted expressions in CSEs - """ - self.cses = cses - self.wrap_cse = wrap_cse + def __init__(self, rules): + self.rules = rules def map_variable(self, expr): - if expr.name not in self.cses: + if expr.name not in self.rules: return IdentityMapper.map_variable(self, expr) - arg_names, cse_expr = self.cse[expr.name] - if len(arg_names) != 0: + rule = self.rules[expr.name] + if rule.arguments: raise RuntimeError("CSE '%s' must be invoked with %d arguments" - % (expr.name, len(arg_names))) + % (expr.name, len(rule.arguments))) - cse_expr = self.rec(cse_expr) - - if self.wrap_cse: - from pymbolic.primitives import CommonSubexpression - return CommonSubexpression(cse_expr, expr.name) - else: - return cse_expr + return self.rec(rule.expression) def map_call(self, expr): - from pymbolic.primitives import Variable, CommonSubexpression + from pymbolic.primitives import Variable if (not isinstance(expr.function, Variable) - or expr.function.name not in self.cses): + or expr.function.name not in self.rules): return IdentityMapper.map_variable(self, expr) cse_name = expr.function.name - arg_names, cse_expr = self.cses[cse_name] - if len(arg_names) != len(expr.parameters): - raise RuntimeError("CSE '%s' invoked with %d arguments (needs %d)" - % (cse_name, len(arg_names), len(expr.parameters))) + rule = self.rules[cse_name] + if len(rule.arguments) != len(expr.parameters): + raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)" + % (cse_name, len(expr.parameters), len(rule.arguments), )) from pymbolic.mapper.substitutor import make_subst_func subst_map = SubstitutionMapper(make_subst_func( - dict(zip(arg_names, expr.parameters)))) + dict(zip(rule.arguments, expr.parameters)))) + + return self.rec(subst_map(rule.expression)) + +# }}} + +# {{{ substitution callback mapper + +class SubstitutionCallbackMapper(IdentityMapper): + def __init__(self, names, func): + self.names = names + self.func = func + + def map_variable(self, expr): + if expr.name not in self.names: + return IdentityMapper.map_variable(self, expr) - cse_expr = self.rec(subst_map(cse_expr)) + result = self.func(expr, expr.name, (), self.rec) + if result is None: + return IdentityMapper.map_variable(self, expr) + else: + return result + + def map_call(self, expr): + from pymbolic.primitives import Variable + if (not isinstance(expr.function, Variable) + or expr.function.name not in self.names): + return IdentityMapper.map_variable(self, expr) - if self.wrap_cse: - return CommonSubexpression(cse_expr, cse_name) + result = self.func(expr, expr.function.name, expr.parameters, self.rec) + if result is None: + return IdentityMapper.map_call(self, expr) else: - return cse_expr + return result # }}} diff --git a/test/test_fem_assembly.py b/test/test_fem_assembly.py index 4a580725e3bceae5158d058a21ea3ee0e32d5335..7e57d1508a68f8983c4e2982ec02e5062517a0cb 100644 --- a/test/test_fem_assembly.py +++ b/test/test_fem_assembly.py @@ -34,7 +34,7 @@ def test_laplacian_stiffness(ctx_factory): "[Nc] -> {[K,i,j,q]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d}" % dict(Nb=Nb, Nq=Nq), [ - "SUBST: dPsi(a,dxi) = jacInv[K,q,0,dxi] * DPsi[a,q,0] " + "CSE: dPsi(a,dxi) = jacInv[K,q,0,dxi] * DPsi[a,q,0] " "+ jacInv[K,q,1,dxi] * DPsi[a,q,1]", "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * (" "dPsi(0,0)*dPsi(1,0) + dPsi(0,1)*dPsi(1,1)))" @@ -52,20 +52,43 @@ def test_laplacian_stiffness(ctx_factory): seq_knl = knl - knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) - knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") - knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) - knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], - uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]") - - kernel_gen = lp.generate_loop_schedules(knl, - loop_priority=["K", "i", "j"]) - kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) - - lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, - op_count=0, op_label="GFlops", - parameters={"Nc": Nc}, print_seq_code=True, - timing_rounds=30) + def variant_1(knl): + # no ILP across elements + knl= lp.remove_cse_by_tag(knl, "dPsi") + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], + uni_template="jacInv[Kii +16*Ko,q,x,y]") + return knl + + def variant_2(knl): + # with ILP across elements + knl= lp.remove_cse_by_tag(knl, "dPsi") + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.split_dimension(knl, "K_inner", 4, inner_tag="ilp") + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], + uni_template="jacInv[Kii + 4*Kio +16*Ko,q,x,y]") + return knl + + def variant_1(knl): + # no ILP across elements, precompute dPsiTransf + knl= lp.realize_cse(knl, "dPsi", ["a", "dxi", ""]) + knl = lp.split_dimension(knl, "K", 16, outer_tag="g.0", slabs=(0,1)) + knl = lp.tag_dimensions(knl, {"i": "l.0", "j": "l.1"}) + knl = lp.add_prefetch(knl, 'jacInv', ["Kii", "Kio", "q", "x", "y"], + uni_template="jacInv[Kii +16*Ko,q,x,y]") + return knl + + for variant in [variant_1, variant_2]: + kernel_gen = lp.generate_loop_schedules(variant(knl), + loop_priority=["K", "i", "j"]) + kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) + + lp.auto_test_vs_seq(seq_knl, ctx, kernel_gen, + op_count=0, op_label="GFlops", + parameters={"Nc": Nc}, print_seq_code=True, + timing_rounds=30) diff --git a/test/test_linalg.py b/test/test_linalg.py index f5d9c2f00e42a78faee6396bde0101d7bb1c1c35..0c09032b803e4765a055f7cc775b08b1e748a3ea 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -348,9 +348,9 @@ def test_rank_one(ctx_factory): def variant_4(knl): knl = lp.split_dimension(knl, "i", 256, - outer_tag="g.0")#, slabs=(0, 1)) + outer_tag="g.0", slabs=(0, 1)) knl = lp.split_dimension(knl, "j", 256, - outer_tag="g.1")#, slabs=(0, 1)) + outer_tag="g.1", slabs=(0, 1)) knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None) knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None) diff --git a/test/test_loopy.py b/test/test_loopy.py index cc387b017c579054670e2bf61dc2c10a24a85b79..6bc77e6ad9353207dd3f4cd9da71d3df3936dbae 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -73,7 +73,8 @@ def test_multi_cse(ctx_factory): local_sizes={0: 16}) knl = lp.split_dimension(knl, "i", 16, inner_tag="l.0") - knl = lp.realize_cse(knl, None, np.float32, ["i_inner"]) + knl = lp.add_prefetch(knl, "a", []) + #knl = lp.realize_cse(knl, None, np.float32, ["i_inner"]) kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen) @@ -124,23 +125,29 @@ def test_stencil(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i,j]: 0<= i,j < 32}", [ - "[i] <float32> z[i,j] = -2*cse(a[i,j])" - " + cse(a[i,j-1])" - " + cse(a[i,j+1])" - " + cse(a[i-1,j])" - " + cse(a[i+1,j])" + "[i] <float32> z[i,j] = -2*a[i,j]" + " + a[i,j-1]" + " + a[i,j+1]" + " + a[i-1,j]" + " + a[i+1,j]" ], [ lp.ArrayArg("a", np.float32, shape=(32,32,)) ]) - def variant_3(knl): + + def variant_1(knl): + knl = lp.add_prefetch(knl, "a", [0, 1]) + return knl + + def variant_2(knl): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.realize_cse(knl, None, np.float32, ["i_inner", "j_inner"]) + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"]) return knl - for variant in [variant_3]: + #for variant in [variant_1, variant_2]: + for variant in [variant_2]: kernel_gen = lp.generate_loop_schedules(variant(knl), loop_priority=["i_outer", "i_inner_0", "j_0"]) kernel_gen = lp.check_kernels(kernel_gen)