diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index d293e3ebe998a632bd547f94a67e675ff0592bfb..55331c93f3736eedd016a922b80673ed69d69201 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -31,7 +31,7 @@ Dealing with Substitution Rules Caching, Precomputation and Prefetching --------------------------------------- -.. autofunction:: precompute +.. automodule:: loopy.transform.precompute .. autofunction:: add_prefetch diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 5240042337163f0aefcbc7fdb8f3151ac280053f..67e3a522e4dfaf1e0cea85bed2d170bb0641fd59 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -166,19 +166,29 @@ def generate_code_for_sched_index(codegen_state, sched_index): else: return barrier_ast else: - # host code - if sched_item.kind in ["global", "local"]: - # host code is assumed globally and locally synchronous + if sched_item.kind not in ("global", "local"): + raise LoopyError("do not know how to emit code for barrier kind '%s'" + "in host code" % sched_item.kind) + + # host code is assumed locally synchronous but possibly needing + # global synchronization + + if sched_item.kind == "global" and ( + codegen_state.kernel.target.has_host_side_global_barriers()): + barrier_ast = codegen_state.ast_builder.emit_barrier( + sched_item.kind, sched_item.comment) + return CodeGenerationResult.new( + codegen_state, + sched_item.originating_insn_id, + barrier_ast, + codegen_state.implemented_domain) + else: return CodeGenerationResult( host_program=None, device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) - else: - raise LoopyError("do not know how to emit code for barrier kind '%s'" - "in host code" % sched_item.kind) - # }}} elif isinstance(sched_item, RunInstruction): diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index abf4d799fbdb14f86fa29dde26e6654130fc66de..ba4e30d09b76f2729f68446426108dbc116eb44f 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -710,7 +710,12 @@ def generate_loop_schedules_internal( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], within_subkernel=True, - may_schedule_global_barriers=False, + may_schedule_global_barriers=( + not + sched_state + .kernel + .target + .split_kernel_at_global_barriers()), enclosing_subkernel_inames=sched_state.active_inames), allow_boost=rec_allow_boost, debug=debug): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5800a0236e8ae5f81a63942c31a74822bc2fab96..b6dc086376ce38bb2d0108f3a855fef2077b5c5a 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -96,6 +96,13 @@ class TargetBase(object): """ raise NotImplementedError() + def has_host_side_global_barriers(self): + """ + :returns: a :class:`bool` indicating whether the host target + requires a synchronizing global barrier + """ + raise NotImplementedError() + def get_host_ast_builder(self): """ :returns: a class implementing :class:`ASTBuilderBase` for the host code diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e54ac0f693c4704c13b8c435e4bc7acaac1b1a47..7fe2a288f5973c41db2304d74d56aa33ad41a519 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -267,6 +267,9 @@ class CTarget(TargetBase): def split_kernel_at_global_barriers(self): return False + def has_host_side_global_barriers(self): + return False + def get_host_ast_builder(self): return DummyHostASTBuilder(self) diff --git 
a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8f924d3aee3b9f2982006fdb7b558cccac6785e3..5488ecfc3b74a6c88e0aec89b77d8c8fdada9419 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -850,6 +850,19 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_array_literal(self, expr, enclosing_prec): return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE) + # new_array and delete_array implement new[] and delete[] in ISPC + + def map_new_array(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + "uniform new uniform %s[%s]" % ( + expr.type, self.rec(expr.size, PREC_NONE)), + enclosing_prec, PREC_UNARY) + + def map_delete_array(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + "delete[] %s" % expr.array, + enclosing_prec, PREC_UNARY) + # }}} # vim: fdm=marker diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 35dade90494906b61aad9eb66e7271f2c5d1e180..2b5068b46fc416627065ebc755b76918a2617f2f 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -26,7 +26,7 @@ THE SOFTWARE. import numpy as np # noqa -from loopy.target.c import CTarget, CASTBuilder +from loopy.target.c import CTarget, CASTBuilder, CExpression from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.diagnostic import LoopyError from loopy.symbolic import Literal @@ -38,6 +38,72 @@ from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method +class NewArray(p.Expression): + """An expression node for allocating a new array.""" + + # Only used in host-side code generation. + + init_arg_names = ("type", "size") + + def __init__(self, type, size): + self.type = type + self.size = size + + def __getinitargs__(self): + return (self.type, self.size) + + mapper_method = "map_new_array" + + +class DeleteArray(p.Expression): + """An expression node for deleting an array.""" + + # Only used in host-side code generation. + + init_arg_names = ("array",) + + def __init__(self, array): + self.array = array + + def __getinitargs__(self): + return (self.array,) + + mapper_method = "map_delete_array" + + +def temporary_should_have_extra_local_axis(kernel, temporary_name): + # This checks if a private temporary variable should be implemented as + # having an extra axis or not. If there are no local-parallel tagged writers + # on it, then we are safe implementing it without the extra axis. + # + # Note: this is currently needed as a workaround, because otherwise the + # compiler will reject some code as mistyped. The problem is that a + # temporary that is accessed with the programIndex variable will have a + # varying type. However, the compiler always disallows assignments from + # varying type to uniform type, even though in our case they may be + # legitimate and race-free. In order to make sure that compilable code is + # always generated, it might be worth trying to cast the result of + # assignments from varying to uniform when a private temporary is involved. 
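    # Illustrative sketch only (not emitted verbatim by this patch): with the
    # workaround described above, a hypothetical private temporary `acc` that
    # has a local-parallel (l.0-tagged) writer picks up a leading axis of
    # length programCount and is accessed per lane, roughly as
    #
    #     float acc[programCount];
    #     acc[programIndex] = ...;    /* each lane owns one slot */
    #
    # while a private temporary with no local-parallel writers stays a plain
    # scalar and needs no programIndex indexing.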
+ writers = kernel.writer_map()[temporary_name] + from loopy.kernel.data import LocalIndexTagBase + + def insn_has_local_tag(id): + for iname in kernel.id_to_insn[id].within_inames: + if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase): + return True + return False + + if any(insn_has_local_tag(insn) for insn in writers): + _, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + return lsize + + return False + + # {{{ expression mapper class ExprToISPCExprMapper(ExpressionToCExpressionMapper): @@ -86,8 +152,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # below in decl generation) - gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() - if lsize: + if temporary_should_have_extra_local_axis(self.kernel, tv.name): return expr[var("programIndex")] else: return expr @@ -172,6 +237,12 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" + def split_kernel_at_global_barriers(self): + return True + + def has_host_side_global_barriers(self): + return True + def pre_codegen_check(self, kernel): gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() if len(lsize) > 1: @@ -182,7 +253,7 @@ class ISPCTarget(CTarget): "by ISPC" % ls_i) def get_host_ast_builder(self): - return ISPCASTBuilder(self) + return ISPCHostASTBuilder(self) def get_device_ast_builder(self): return ISPCASTBuilder(self) @@ -201,13 +272,14 @@ class ISPCTarget(CTarget): class ISPCASTBuilder(CASTBuilder): - def _arg_names_and_decls(self, codegen_state): + def _arg_names_and_decls(self, codegen_state, extra_args): implemented_data_info = codegen_state.implemented_data_info - arg_names = [iai.name for iai in implemented_data_info] + all_args = implemented_data_info + extra_args + arg_names = [iai.name for iai in all_args] arg_decls = [ self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in implemented_data_info] + for idi in all_args] # {{{ occa compatibility hackery @@ -237,7 +309,7 @@ class ISPCASTBuilder(CASTBuilder): from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + arg_names, arg_decls = self._arg_names_and_decls(codegen_state, []) if codegen_state.is_generating_device_code: result = ISPCTask( @@ -267,7 +339,7 @@ class ISPCASTBuilder(CASTBuilder): "assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE))) - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + arg_names, arg_decls = self._arg_names_and_decls(codegen_state, extra_args) from cgen.ispc import ISPCLaunch result.append( @@ -309,11 +381,13 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape if temp_var.scope == temp_var_scope.PRIVATE: - # FIXME: This is a pretty coarse way of deciding what - # private temporaries get duplicated. Refine? (See also - # above in expr to code mapper) - _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() - shape = lsize + shape + if temporary_should_have_extra_local_axis( + codegen_state.kernel, temp_var.name): + # FIXME: This is a pretty coarse way of deciding what + # private temporaries get duplicated. Refine? 
(See also + # above in expr to code mapper) + _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() + shape = lsize + shape if shape: from cgen import ArrayOf @@ -496,6 +570,90 @@ class ISPCASTBuilder(CASTBuilder): # }}} +# {{{ host ast builder + +class ISPCHostASTBuilder(ISPCASTBuilder): + + # FIXME: Rather than having to override get_temporary_decls / + # get_function_definition, perhaps it makes more sense to have a + # get_host_prologue / get_host_epilogue function pair + + def get_function_definition( + self, codegen_state, codegen_result, schedule_index, function_decl, + function_body): + from loopy.kernel.data import temp_var_scope + import six + + ecm = self.get_c_expression_to_code_mapper() + + global_temporaries = sorted( + (tv for tv in + six.itervalues(codegen_state.kernel.temporary_variables) + if tv.scope == temp_var_scope.GLOBAL), + key=lambda tv: tv.name) + + # Add deallocation code. + + from cgen import ExpressionStatement, Line + # FIXME: Not sure if it's okay to modify the function_body argument. + function_body.extend([Line()] + + [ExpressionStatement( + CExpression(ecm, DeleteArray(tv.name))) + for tv in global_temporaries]) + + return ISPCASTBuilder.get_function_definition( + self, + codegen_state, + codegen_result, + schedule_index, + function_decl, + function_body) + + def get_temporary_decls(self, codegen_state, schedule_state): + from loopy.kernel.data import temp_var_scope + import six + + global_temporaries = sorted( + (tv for tv in + six.itervalues(codegen_state.kernel.temporary_variables) + if tv.scope == temp_var_scope.GLOBAL), + key=lambda tv: tv.name) + + ecm = self.get_c_expression_to_code_mapper() + + from cgen import Assign, Line, dtype_to_ctype + from six.moves import reduce + from operator import mul + + assignments = [] + + for tv in global_temporaries: + decl_info = tv.decl_info( + self.target, index_dtype=codegen_state.kernel.index_dtype) + + # FIXME: I don't know if this inner loop is necessary. + for idi in decl_info: + from loopy.target.c import POD # uses the correct complex type + from cgen.ispc import ISPCUniformPointer, ISPCUniform + + decl = ISPCUniform( + ISPCUniformPointer(POD(self, idi.dtype, idi.name))) + + assignments.append(decl) + assignments.append( + Assign(idi.name, + CExpression(ecm, + NewArray( + # FIXME: Not sure how to get this properly + dtype_to_ctype(idi.dtype), + reduce(mul, idi.shape, 1) + )))) + assignments.append(Line()) + + return assignments + +# }}} + # TODO: Generate launch code # TODO: Vector types (element access: done) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 6077332c4fc4322ac7ffb02ade4a0e24c7066245..fb1891507ff490f8dd82362b4f5e05649dad45e6 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -25,6 +25,9 @@ THE SOFTWARE. import six from six.moves import range, zip + +from pytools import Record + import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, @@ -38,6 +41,13 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +__doc__ = """ + +.. autoclass:: PrecomputeTransformInfo + +.. autofunction:: precompute +""" + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -254,13 +264,33 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): # }}} +class PrecomputeTransformInfo(Record): + """ + .. attribute:: kernel + + The transformed kernel + + .. 
attribute:: compute_insn_id + + The instruction ID that performs the precomputation. + + .. attribute:: compute_dep_id + + The instruction ID that computations should depend on to + make use of the finished state of the precomputation. + May be the same as :attr:`compute_insn_id` or a subsequent + barrier. + """ + + def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, default_tag="l.auto", dtype=None, fetch_bounding_box=False, temporary_scope=None, temporary_is_local=None, - compute_insn_id=None): + compute_insn_id=None, + return_info_structure=False): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -335,6 +365,10 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, `` with the direct sweep axes being the slower-varying indices. + :returns: If *return_info_structure* is True, return an instance + of :class:`PrecomputeTransformInfo`. Otherwise, return the + transformed kernel. + Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. """ @@ -1001,6 +1035,13 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, from loopy.kernel.tools import assign_automatic_axes kernel = assign_automatic_axes(kernel) - return kernel + if return_info_structure: + return PrecomputeTransformInfo( + kernel=kernel, + compute_insn_id=compute_insn_id, + compute_dep_id=compute_dep_id) + + else: + return kernel # vim: foldmethod=marker diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..7b52d8fff3da7d55198555a17c75d368ad61d894 --- /dev/null +++ b/loopy/transform/reduction.py @@ -0,0 +1,787 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +import islpy as isl + +from loopy.kernel.data import iname_tag_to_temp_var_scope + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: make_two_level_reduction +.. 
autofunction:: make_two_level_scan +""" + + +# {{{ two-level reduction + +def make_two_level_reduction( + kernel, insn_id, inner_length, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_tag=None, + inner_tag=None): + """ + Two level reduction, mediated through a "nonlocal" array. + + This turns a reduction of the form:: + + [...] result = reduce(i, f(i)) + + into:: + + i -> inner + inner_length * outer + + [..., nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + [...] result = reduce(outer, nonlocal[outer]) + """ + + # {{{ sanity checks + + reduction = kernel.id_to_insn[insn_id].expression + reduction_iname, = reduction.inames + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + + format_kwargs = {"insn": insn_id, "iname": reduction_iname} + + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + inner_iname = var_name_gen( + "{iname}_inner".format(**format_kwargs)) + outer_iname = var_name_gen( + "{iname}_outer".format(**format_kwargs)) + nonlocal_iname = var_name_gen( + "{iname}_nonlocal".format(**format_kwargs)) + + inner_subst = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + # }}} + + # First we split this iname. This results in (roughly) + # + # [...] result = reduce([outer, inner], f(outer, inner)) + # + # FIXME: within + + kernel = lp.split_iname(kernel, reduction_iname, inner_length, + outer_iname=outer_iname, inner_iname=inner_iname) + + # Next, we split the reduction inward and then extract a substitution + # rule for the reduction. This results in + # + # subst(outer) := reduce(inner, f(outer, inner)) + # [...] result = reduce([outer], subst(outer)) + # + # FIXME: within, insn_match... + + kernel = lp.split_reduction_inward(kernel, inner_iname) + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule(kernel, outer_iname, + subst_rule_name=inner_subst) + + # Next, we precompute the inner iname into its own storage. + + # [...,nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + # [...] result = reduce([outer], nonlocal[outer]) + + kernel = lp.precompute(kernel, inner_subst, + sweep_inames=[outer_iname], + precompute_inames=[nonlocal_iname], + temporary_name=nonlocal_storage_name, + temporary_scope=nonlocal_storage_scope) + + return kernel + +# }}} + + +# {{{ helpers for two-level scan + +def _update_instructions(kernel, id_to_new_insn, copy=True): + # FIXME: Even if this improves efficiency, this probably should not be + # doing in-place updates, to avoid obscure caching bugs + + if not isinstance(id_to_new_insn, dict): + id_to_new_insn = dict((insn.id, insn) for insn in id_to_new_insn) + + new_instructions = ( + list(insn for insn in kernel.instructions + if insn.id not in id_to_new_insn) + + list(id_to_new_insn.values())) + + if copy: + kernel = kernel.copy() + + kernel.instructions = new_instructions + return kernel + + +def _make_slab_set(iname, size): + # FIXME: There is a very similar identically named function in + # preprocess. Refactor. 
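    # For reference, on a hypothetical iname and integer size,
    #
    #     _make_slab_set("j", 16)   ->   { [j] : 0 <= j < 16 }
    #
    # i.e. a one-dimensional slab usable as an extra loop domain; passing an
    # isl.PwAff/isl.Aff size keeps the upper bound parametric instead.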
+ + if not isinstance(size, (isl.PwAff, isl.Aff)): + from loopy.symbolic import pwaff_from_expr + size = pwaff_from_expr( + isl.Space.params_alloc(isl.DEFAULT_CONTEXT, 0), size) + + base_space = size.get_domain_space() + + space = (base_space + .add_dims(isl.dim_type.set, 1) + .set_dim_name(isl.dim_type.set, base_space.dim(isl.dim_type.set), iname)) + + v = isl.affs_from_space(space) + + size = isl.align_spaces(size, v[0]) + + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + + return bs + + +def _add_subdomain_to_kernel(kernel, subdomain): + domains = list(kernel.domains) + # Filter out value parameters. + dep_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + + indices = kernel.get_leaf_domain_indices(dep_inames) + + if len(indices) == 0: + domains.append(subdomain) + elif len(indices) == 1: + idx, = indices + domains.insert(idx + 1, subdomain) + else: + print(indices) + raise ValueError("more than 1 leaf index") + + return kernel.copy(domains=domains) + + +def _add_scan_subdomain( + kernel, scan_iname, sweep_iname): + """ + Add the following domain to the kernel:: + + [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } + """ + sp = ( + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) + .set_dim_name(isl.dim_type.param, 0, sweep_iname) + .set_dim_name(isl.dim_type.set, 0, scan_iname)) + + affs = isl.affs_from_space(sp) + + subd, = ( + affs[scan_iname].le_set(affs[sweep_iname]) + & + affs[scan_iname].ge_set(affs[0])).get_basic_sets() + + return _add_subdomain_to_kernel(kernel, subd) + + +def _expand_subst_within_expression(kernel, expr): + from loopy.symbolic import ( + RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + submap = RuleAwareSubstitutionRuleExpander( + rule_mapping_context, + kernel.substitutions, + within=lambda *args: True + ) + return submap(expr, kernel, insn=None) + + +def _add_global_barrier(kernel, source, sink, barrier_id): + from loopy.kernel.instruction import BarrierInstruction + + sources = (source,) if isinstance(source, str) else source + sinks = (sink,) if isinstance(sink, str) else sink + + within_inames = kernel.id_to_insn[sources[0]].within_inames + from itertools import chain + for iname in chain(sources[1:], sinks): + within_inames &= kernel.id_to_insn[iname].within_inames + + barrier_insn = BarrierInstruction( + id=barrier_id, + depends_on=frozenset(sources), + within_inames=within_inames, + kind="global") + + sink_insns = (kernel.id_to_insn[sink] for sink in sinks) + updated_sinks = ( + sink.copy(depends_on=sink.depends_on | frozenset([barrier_id])) + for sink in sink_insns) + + kernel = _update_instructions( + kernel, chain([barrier_insn], updated_sinks), copy=True) + + return kernel + + +def _get_scan_level(sweep_iname): + SWEEP_RE = r".*__l(\d+)(?:_outer)?" # noqa + + import re + match_result = re.match(SWEEP_RE, sweep_iname) + + if match_result is None: + return 0 + + return int(match_result.group(1)) + + +def _get_base_iname(iname): + BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" 
# noqa + + import re + match_result = re.match(BASE_INAME_RE, iname) + + if match_result is None: + return iname + + return match_result.group(1) + +# }}} + + +# {{{ two-level scan + +def make_two_level_scan( + kernel, insn_id, + scan_iname, + sweep_iname, + inner_length, + local_storage_name=None, + local_storage_scope=None, + local_storage_axes=None, + nonlocal_storage_name=None, + nonlocal_scan_storage_name=None, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_local_tag=None, + inner_local_tag=None, + inner_tag=None, + outer_tag=None, + inner_iname=None, + outer_iname=None): + """Two level scan, mediated through a "local" and "nonlocal" array. + + This turns a scan of the form:: + + [...,i] result = reduce(j, f(j)) + + into:: + + [...,l',l''] + [...,nlinit] nonlocal[0] = 0 + [...,nlinit] nonlocal[nlinit+1] = local[nlinit,-1] + [...,nl] + [...,i',i''] result = nonlocal[i'] + local[i',i''] + + :arg local_storage_axes: A tuple of inames. For each iname, a corresponding + axis will be added to the temporary array that does the local part of + the scan (the "local" array). May be *None*, in which case it is + automatically inferred from the tags of the inames. + """ + + # TODO: Test that this works even when doing split scans in a loop + + # {{{ sanity checks/input processing + + # FIXME: More sanity checks... + + insn = kernel.id_to_insn[insn_id] + scan = insn.expression + assert scan.inames[0] == scan_iname + assert len(scan.inames) == 1 + del insn + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + insn_id_gen = kernel.get_instruction_id_generator() + + level = _get_scan_level(sweep_iname) + base_scan_iname = _get_base_iname(scan_iname) + base_sweep_iname = _get_base_iname(sweep_iname) + base_insn_id = _get_base_iname(insn_id) + + format_kwargs = { + "insn": base_insn_id, + "iname": base_scan_iname, + "sweep": base_sweep_iname, + "level": level, + "next_level": level + 1} + + if inner_iname is None: + inner_iname = var_name_gen( + "{sweep}__l{level}".format(**format_kwargs)) + else: + var_name_gen.add_name(inner_iname) + + if outer_iname is None: + outer_iname = var_name_gen( + "{sweep}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_iname(outer_iname) + + """ + nonlocal_init_head_outer_iname = var_name_gen( + "{sweep}__l{level}_nlhead_outer".format(**format_kwargs)) + + nonlocal_init_head_inner_iname = var_name_gen( + "{sweep}__l{level}_nlhead_inner".format(**format_kwargs)) + """ + + nonlocal_init_tail_outer_iname = var_name_gen( + "{sweep}__l{level}_nltail_outer".format(**format_kwargs)) + + # FIXME: This iname is not really needed. We should see about getting + # rid of it. That would also make the write race warning business below + # unnecessary. 
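    # The level-numbering convention assumed by _get_scan_level and
    # _get_base_iname above, shown on hypothetical inames:
    #
    #     _get_scan_level("i") == 0            _get_base_iname("i") == "i"
    #     _get_scan_level("i__l1") == 1        _get_base_iname("i__l1") == "i"
    #     _get_scan_level("i__l2_outer") == 2  _get_base_iname("i__l2_outer") == "i"
    #
    # so repeated applications of make_two_level_scan keep producing
    # consistently numbered inames (i, i__l1, i__l2, ...).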
+ nonlocal_init_tail_inner_iname = var_name_gen( + "{sweep}__l{level}_nltail_inner".format(**format_kwargs)) + + nonlocal_iname = var_name_gen( + "{sweep}__l{level}_nonloc".format(**format_kwargs)) + + inner_local_iname = var_name_gen( + "{sweep}__l{next_level}".format(**format_kwargs)) + + inner_scan_iname = var_name_gen( + "{iname}__l{next_level}".format(**format_kwargs)) + + outer_local_iname = var_name_gen( + "{sweep}__l{next_level}_outer".format(**format_kwargs)) + + outer_scan_iname = var_name_gen( + "{iname}__l{level}".format(**format_kwargs)) + + subst_name = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + local_subst_name = var_name_gen( + "{insn}_local_subst".format(**format_kwargs)) + + if local_storage_name is None: + local_storage_name = var_name_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + else: + var_name_gen.add_name(local_storage_name) + + if nonlocal_storage_name is None: + nonlocal_storage_name = var_name_gen( + "{insn}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_name(nonlocal_storage_name) + + if nonlocal_scan_storage_name is None: + nonlocal_scan_storage_name = var_name_gen( + "{insn}__l{level}_outer_scan".format(**format_kwargs)) + else: + var_name_gen.add_name(nonlocal_scan_storage_name) + + local_scan_insn_id = insn_id_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + + nonlocal_scan_insn_id = insn_id_gen( + "{insn}__l{level}".format(**format_kwargs)) + + format_kwargs.update({"nonlocal": nonlocal_storage_name}) + + nonlocal_init_head_insn_id = insn_id_gen( + "{nonlocal}_init_head".format(**format_kwargs)) + + nonlocal_init_tail_insn_id = insn_id_gen( + "{nonlocal}_init_tail".format(**format_kwargs)) + + # }}} + + # {{{ parameter processing + + # It seems that local_storage_axes should be determined automatically. + auto_local_storage_axes = [ + iname + for iname, tag in [ + (outer_iname, outer_tag), + (inner_iname, inner_tag)] + + # ">" is "more global" + # In a way, global inames are automatically part of an access to a + # more local array. + if iname_tag_to_temp_var_scope(tag) <= local_storage_scope] + + if local_storage_axes is None: + local_storage_axes = auto_local_storage_axes + else: + if list(local_storage_axes) != auto_local_storage_axes: + raise ValueError("expected local_storage_axes (%s) did not match " + "provided local_storage_axes (%s)" + % (auto_local_storage_axes, local_storage_axes)) + + # }}} + + # {{{ utils + + def pick_out_relevant_axes(full_indices, strip_scalar=False): + assert len(full_indices) == 2 + iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + + result = [] + for iname in local_storage_axes: + result.append(iname_to_index[iname]) + + assert len(result) > 0 + + return (tuple(result) + if not (strip_scalar and len(result) == 1) + else result[0]) + + # }}} + + # {{{ prepare for two level scan + + # Turn the scan into a substitution rule, replace the original scan with a + # nop and delete the scan iname. + # + # (The presence of the scan iname seems to be making precompute very confused.) 
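    # The pick_out_relevant_axes helper above, on hypothetical index tuples:
    # with local_storage_axes == (outer_iname, inner_iname) it returns the
    # pair unchanged,
    #
    #     pick_out_relevant_axes((o, i))                    -> (o, i)
    #
    # while with local_storage_axes == (inner_iname,) (outer iname "more
    # global" than the local storage) it keeps only the local component,
    #
    #     pick_out_relevant_axes((o, i))                    -> (i,)
    #     pick_out_relevant_axes((o, i), strip_scalar=True) -> i
    #
    # optionally unwrapping a length-1 result to a bare scalar index.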
+ + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule( + kernel, scan_iname, subst_rule_name=subst_name) + + kernel = _update_instructions( + kernel, + {insn_id: kernel.id_to_insn[insn_id].copy(expression=0)}) + + """ + from loopy.kernel.instruction import NoOpInstruction + {insn_id: NoOpInstruction( + id=insn_id, + depends_on=insn.depends_on, + groups=insn.groups, + conflicts_with_groups=insn.groups, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final, + within_inames=insn.within_inames, + priority=insn.priority, + boostable=insn.boostable, + boostable_into=insn.boostable_into, + predicates=insn.predicates, + tags=insn.tags)}, + copy=False) + """ + + kernel = lp.remove_unused_inames(kernel, inames=(scan_iname,)) + + # Make sure we got rid of everything + assert scan_iname not in kernel.all_inames() + + # }}} + + # {{{ implement local scan + + from pymbolic import var + + # FIXME: This can probably be done using split_reduction_inward() + # and will end up looking like less of a mess that way. + + local_scan_expr = _expand_subst_within_expression(kernel, + var(subst_name)(var(outer_iname) * inner_length + + var(inner_scan_iname))) + + kernel = lp.split_iname(kernel, sweep_iname, inner_length, + inner_iname=inner_iname, outer_iname=outer_iname, + inner_tag=inner_tag, outer_tag=outer_tag) + + from loopy.kernel.data import SubstitutionRule + from loopy.symbolic import Reduction + + local_subst = SubstitutionRule( + name=local_subst_name, + arguments=(outer_iname, inner_iname), + expression=Reduction( + scan.operation, (inner_scan_iname,), local_scan_expr)) + + substitutions = kernel.substitutions.copy() + substitutions[local_subst_name] = local_subst + + kernel = kernel.copy(substitutions=substitutions) + + outer_local_iname = outer_iname + + all_precompute_inames = (outer_local_iname, inner_local_iname) + + precompute_inames = pick_out_relevant_axes(all_precompute_inames) + sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + + storage_axis_to_tag = { + outer_iname: outer_local_tag, + inner_iname: inner_local_tag, + outer_local_iname: outer_local_tag, + inner_local_iname: inner_local_tag} + + precompute_outer_inames = ( + frozenset(all_precompute_inames) - frozenset(precompute_inames)) + within_inames = ( + kernel.id_to_insn[insn_id].within_inames + - frozenset([outer_iname, inner_iname])) + + from pymbolic import var + + local_precompute_xform_info = lp.precompute(kernel, + [var(local_subst_name)( + var(outer_iname), var(inner_iname))], + sweep_inames=sweep_inames, + precompute_inames=precompute_inames, + storage_axes=local_storage_axes, + storage_axis_to_tag=storage_axis_to_tag, + precompute_outer_inames=precompute_outer_inames | within_inames, + temporary_name=local_storage_name, + temporary_scope=local_storage_scope, + compute_insn_id=local_scan_insn_id, + return_info_structure=True) + + kernel = local_precompute_xform_info.kernel + local_scan_dep_id = local_precompute_xform_info.compute_dep_id + + # FIXME: Should make it so that compute_insn just gets created with these + # deps in place. 
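    # For reference, the shape of the info-structure variant of precompute
    # used above, on a hypothetical standalone call (names are placeholders):
    #
    #     xform = lp.precompute(knl, "some_subst", sweep_inames=["k"],
    #             temporary_name="tmp", compute_insn_id="tmp_compute",
    #             return_info_structure=True)
    #     knl = xform.kernel
    #     # downstream instructions depend on xform.compute_dep_id, which is
    #     # either the compute instruction itself or a barrier that follows it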
+ compute_insn_with_deps = kernel.id_to_insn[ + local_precompute_xform_info.compute_insn_id] + compute_insn_with_deps = compute_insn_with_deps.copy( + depends_on=compute_insn_with_deps.depends_on + | kernel.id_to_insn[insn_id].depends_on) + + kernel = _update_instructions(kernel, (compute_insn_with_deps,)) + + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + + # }}} + + from loopy.kernel.data import ConcurrentTag + if not isinstance(kernel.iname_to_tag[outer_iname], ConcurrentTag): + # FIXME + raise NotImplementedError("outer iname must currently be concurrent because " + "it occurs in the local scan and the final addition and one of " + "those would need to be copied/renamed if it is non-concurrent. " + "This split is currently unimplemented.") + + # {{{ implement local to nonlocal information transfer + + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + + local_storage_local_axis_len = ( + kernel.temporary_variables[local_storage_name].shape[-1]) + + nonlocal_storage_len_pw_aff = static_max_of_pw_aff( + kernel.get_iname_bounds(outer_iname).size, + constants_only=False) + + # FIXME: this shouldn't have to have an extra element. + + # FIXME: (Related) This information transfer should perhaps be done with a + # ternary, but the bounds checker is currently too dumb to recognize that + # that's OK. + + nonlocal_storage_len = pw_aff_to_expr(1 + nonlocal_storage_len_pw_aff) + + nonlocal_tail_inner_subd = _make_slab_set(nonlocal_init_tail_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_inner_subd) + nonlocal_tail_outer_subd = _make_slab_set( + nonlocal_init_tail_outer_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_outer_subd) + + """ + nonlocal_head_inner_subd = _make_slab_set(nonlocal_init_head_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_inner_subd) + nonlocal_head_outer_subd = _make_slab_set(nonlocal_init_head_outer_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) + """ + + kernel = lp.tag_inames(kernel, { + #nonlocal_init_head_outer_iname: outer_local_tag, + #nonlocal_init_head_inner_iname: inner_local_tag, + nonlocal_init_tail_outer_iname: outer_local_tag, + nonlocal_init_tail_inner_iname: inner_local_tag}) + + for nls_name in [nonlocal_storage_name, nonlocal_scan_storage_name]: + if nls_name not in kernel.temporary_variables: + + from loopy.kernel.data import TemporaryVariable + new_temporary_variables = kernel.temporary_variables.copy() + + new_temporary_variables[nls_name] = ( + TemporaryVariable( + nls_name, + shape=(nonlocal_storage_len,), + scope=nonlocal_storage_scope, + base_indices=lp.auto, + dtype=lp.auto)) + + kernel = kernel.copy(temporary_variables=new_temporary_variables) + + from loopy.kernel.instruction import make_assignment + + nonlocal_init_head = make_assignment( + id=nonlocal_init_head_insn_id, + assignees=(var(nonlocal_storage_name)[0],), + + # FIXME: should be neutral element... 
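        # (0 is only correct for a plus/sum scan; in general this should be
        # the neutral element of scan.operation)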
+ expression=0, + + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + no_sync_with=frozenset([(nonlocal_init_tail_insn_id, "any")]), + predicates=(var(nonlocal_init_tail_inner_iname).eq(0), + var(nonlocal_init_tail_outer_iname).eq(0)), + depends_on=frozenset([local_scan_dep_id])) + + nonlocal_init_tail = make_assignment( + id=nonlocal_init_tail_insn_id, + assignees=( + var(nonlocal_storage_name)[ + var(nonlocal_init_tail_outer_iname) + 1],), + expression=var(local_storage_name)[ + pick_out_relevant_axes( + (var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1), + strip_scalar=True)], + no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + depends_on=frozenset([local_scan_dep_id])) + + kernel = _update_instructions( + kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + + # The write race warnings are spurious - the inner iname is length + # 1, so there's really no write race at all here. + kernel = kernel.copy( + silenced_warnings=kernel.silenced_warnings + + ["write_race(%s)" % nonlocal_init_tail_insn_id] + + ["write_race(%s)" % nonlocal_init_head_insn_id]) + + # }}} + + insn = kernel.id_to_insn[insn_id] + + # {{{ implement nonlocal scan + + subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, subd) + + if nonlocal_tag is not None: + kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) + + kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + + nonlocal_scan = make_assignment( + id=nonlocal_scan_insn_id, + assignees=(var(nonlocal_scan_storage_name)[var(nonlocal_iname)],), + expression=Reduction( + scan.operation, + (outer_scan_iname,), + var(nonlocal_storage_name)[var(outer_scan_iname)]), + within_inames=within_inames | frozenset([nonlocal_iname]), + depends_on=( + frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) + + kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_init_barrier".format(**format_kwargs)) + kernel = _add_global_barrier(kernel, + source=(nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id), + sink=nonlocal_scan_insn_id, + barrier_id=barrier_id) + + # }}} + + # {{{ replace scan with local + nonlocal + + updated_depends_on = (insn.depends_on + | frozenset([nonlocal_scan_insn_id, local_scan_dep_id])) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_scan_barrier".format(**format_kwargs)) + kernel = (_add_global_barrier(kernel, + source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) + updated_depends_on |= frozenset([barrier_id]) + + nonlocal_part = var(nonlocal_scan_storage_name)[var(outer_iname)] + + local_part = var(local_storage_name)[ + pick_out_relevant_axes( + (var(outer_iname), var(inner_iname)), strip_scalar=True)] + + updated_insn = insn.copy( + no_sync_with=( + insn.no_sync_with + | frozenset([(nonlocal_scan_insn_id, "any")])), + depends_on=updated_depends_on, + # XXX: scan binary op + expression=nonlocal_part + local_part) + + kernel = _update_instructions(kernel, (updated_insn,), copy=False) + + # }}} + + return kernel + +# }}} + + +# vim: foldmethod=marker diff --git a/test/test_reduction.py 
b/test/test_reduction.py index 555b8c0cccd3a5ca32eb438c6cca44a1b0434a73..904cfa1bd084f98eec037f34bc9edffc19d8cddb 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -246,6 +246,34 @@ def test_global_parallel_reduction(ctx_factory, size): print_ref_code=True) +def test_global_parallel_reduction_2(): + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + # Using z[0] instead of z works around a bug in ancient PyOpenCL. + z[0] = sum(i, i/13) {id=reduce} + """) + + gsize = 128 + from loopy.transform.reduction import make_two_level_reduction + knl = make_two_level_reduction(knl, + "reduce", + inner_length=gsize * 20, + nonlocal_tag="g.0", + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + outer_tag=None, + inner_tag=None) + + print(knl) + + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + + knl = lp.realize_reduction(knl) + + print(knl) + + @pytest.mark.parametrize("size", [1000]) def test_global_mc_parallel_reduction(ctx_factory, size): ctx = ctx_factory() diff --git a/test/test_scan.py b/test/test_scan.py index 08754819c9a156403aba689cb3e9c238144e7905..82add0e3ff3f9ccbc744f8f4e7374b44a5e8058c 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -43,6 +43,7 @@ except ImportError: else: faulthandler.enable() +from pytools import memoize from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests @@ -362,6 +363,8 @@ def test_argmax(ctx_factory, i_tag): assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all() +# {{{ segmented scan + def check_segmented_scan_output(arr, segment_boundaries_indices, out): class SegmentGrouper(object): @@ -421,6 +424,162 @@ def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): check_segmented_scan_output(arr, segment_boundaries_indices, out) +# }}} + + +# {{{ two and three level scan getters + +@memoize +def _get_two_level_scan_kernel(g_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag="l.0", + outer_local_tag="g.0") + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + + +@memoize +def _get_three_level_scan_kernel(g_size, p_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag=None, + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag=None, + 
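            # the inner sweep/local inames are deliberately left untagged in
            # this first-level scan; they are mapped to l.0 by the tag_inames
            # call further down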
outer_local_tag="g.0") + + knl = make_two_level_scan( + knl, "insn__l1", inner_length=p_size, + scan_iname="j__l1", + sweep_iname="i__l1", + inner_tag="for", + outer_tag="l.0", + nonlocal_tag="l.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.LOCAL, + inner_local_tag="for", + outer_local_tag="l.0") + + knl = lp.tag_inames(knl, dict(i__l0="l.0", + i__l0_nltail_inner="l.0")) + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + knl = lp.alias_temporaries(knl, + ("insn__l1", "insn__l2"), + synchronize_for_exclusive_use=False) + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + +# }}} + + +# {{{ two and three level scan + +# TODO: Test everything from the matrix +# (l.0, seq) x (l.0, seq) +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +def test_two_level_scan(ctx_getter, input_len, g_size): + knl = _get_two_level_scan_kernel(g_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +@pytest.mark.parametrize("p_size", (4,)) +def test_three_level_scan(ctx_getter, g_size, p_size, input_len): + knl = _get_three_level_scan_kernel(g_size, p_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + +# }}} + if __name__ == "__main__": if len(sys.argv) > 1: