diff --git a/doc/misc.rst b/doc/misc.rst index 2f8fac090455eea1be87c3b4eae7bfd72eba24ee..89279d9744d2f1cf4f080618d0d2b2e0f078a723 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -5,7 +5,9 @@ Installation This command should install :mod:`loopy`:: - pip install loopy + pip install loo.py + +(Note the extra "."!) You may need to run this with :command:`sudo`. If you don't already have `pip `_, diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index ff27d84f55ea1b8b4222f18ba271723f450b1a42..33d40385b529f72e54da65238304e87bdb2cddab 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -270,9 +270,10 @@ Expressions Loopy's expressions are a slight superset of the expressions supported by :mod:`pymbolic`. -* `if` -* `reductions` +* ``if`` +* ``reductions`` * duplication of reduction inames + * ``reduce`` vs ``simul_reduce`` * complex-valued arithmetic * tagging of array access and substitution rule use ("$") * ``indexof``, ``indexof_vec`` diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4421fd80f35cffdeebf37dbb93f668fc7bf48aa4..4275474d9ce74e04aa3c7ae69356f0672db2128c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -96,6 +96,7 @@ always see loopy's view of a kernel by printing it. .. doctest:: + >>> knl = lp.set_options(knl, allow_terminal_colors=False) >>> print(knl) --------------------------------------------------------------------------- KERNEL: loopy_kernel @@ -245,8 +246,6 @@ call :func:`loopy.generate_code`: .. 
doctest:: >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32)) - >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0]) - >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print(code) #define lid(N) ((int) get_local_id(N)) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 199a4dd038b1b79e3364c9a0c053300c5407bff0..a402896c85baa49bc23bb5d770607cf73641c273 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -59,29 +59,6 @@ def main(): with open("tasksys.cpp", "r") as ts_file: tasksys_source = ts_file.read() - if 0: - from loopy.target.ispc import ISPCTarget - stream_knl = lp.make_kernel( - "{[i]: 0<=i0") - stream_knl = lp.split_iname(stream_knl, - "i", 2**18, outer_tag="g.0", slabs=(0, 1)) - stream_knl = lp.split_iname(stream_knl, "i_inner", 8, inner_tag="l.0") - stream_knl = lp.preprocess_kernel(stream_knl) - stream_knl = lp.get_one_scheduled_kernel(stream_knl) - stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z") - ispc_code, arg_info = lp.generate_code(stream_knl) - def make_knl(name, insn, vars): knl = lp.make_kernel( "{[i]: 0<=i loop_list_width: lines.append("[%s]" % loop_list) lines.append("%s%s <- %s # %s" % ( - (loop_list_width+2)*" ", lhs, - rhs, ", ".join(options))) + (loop_list_width+2)*" ", Fore.BLUE+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ", ".join(options))) else: lines.append("[%s]%s%s <- %s # %s" % ( loop_list, " "*(loop_list_width-len(loop_list)), - lhs, rhs, ",".join(options))) + Fore.BLUE + lhs + Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ",".join(options))) lines.extend(trailing) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 46b301b47c45430078f554a178448687eb490e94..cb72f283e2151fd8d097f19eb4e4876cc78ccd43 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -770,7 +770,8 @@ def assign_automatic_axes(kernel, axis=0, 
local_size=None): If *axis* is None, find a suitable axis automatically. """ try: - desired_length = kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + desired_length = kernel.get_constant_iname_length(iname) except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. @@ -882,7 +883,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): def get_iname_length(iname): try: - return kernel.get_constant_iname_length(iname) + with isl.SuppressedWarnings(kernel.isl_context): + return kernel.get_constant_iname_length(iname) except isl.Error: return -1 # assign longest auto axis inames first diff --git a/loopy/options.py b/loopy/options.py index 049d21affd284d94f27416d018d38bdbd5b4bddc..9b27f111e2fb93b09090b35278a3753f9f1a3f82 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -27,6 +27,11 @@ from pytools import Record import re +class _ColoramaStub(object): + def __getattribute__(self, name): + return "" + + class Options(Record): """ Unless otherwise specified, these options are Boolean-valued @@ -105,6 +110,11 @@ class Options(Record): Options to pass to the OpenCL compiler when building the kernel. A list of strings. + + .. attribute:: allow_terminal_colors + + A :class:`bool`. 
Whether to allow colors in terminal output + """ def __init__( @@ -124,6 +134,7 @@ class Options(Record): write_wrapper=False, highlight_wrapper=False, write_cl=False, highlight_cl=False, edit_cl=False, cl_build_options=[], + allow_terminal_colors=True ): Record.__init__( self, @@ -137,6 +148,7 @@ class Options(Record): write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper, write_cl=write_cl, highlight_cl=highlight_cl, edit_cl=edit_cl, cl_build_options=cl_build_options, + allow_terminal_colors=allow_terminal_colors, ) def update(self, other): @@ -150,6 +162,30 @@ class Options(Record): for field_name in sorted(self.__class__.fields): key_builder.rec(key_hash, getattr(self, field_name)) + @property + def _fore(self): + if self.allow_terminal_colors: + import colorama + return colorama.Fore + else: + return _ColoramaStub() + + @property + def _back(self): + if self.allow_terminal_colors: + import colorama + return colorama.Back + else: + return _ColoramaStub() + + @property + def _style(self): + if self.allow_terminal_colors: + import colorama + return colorama.Style + else: + return _ColoramaStub() + KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$") diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fe88118e018d5829e9ff2104b70940a39cb95ade..4c75cfd250807c6959c1e5167465d34b029d762e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -66,6 +66,45 @@ def prepare_for_caching(kernel): # }}} +# {{{ check reduction iname uniqueness + +def check_reduction_iname_uniqueness(kernel): + iname_to_reduction_count = {} + iname_to_nonsimultaneous_reduction_count = {} + + def map_reduction(expr, rec): + rec(expr.expr) + for iname in expr.inames: + iname_to_reduction_count[iname] = ( + iname_to_reduction_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + iname_to_nonsimultaneous_reduction_count[iname] = ( + iname_to_nonsimultaneous_reduction_count.get(iname, 0) + 1) + + return expr + + from loopy.symbolic import ReductionCallbackMapper + 
cb_mapper = ReductionCallbackMapper(map_reduction) + + for insn in kernel.instructions: + insn.with_transformed_expressions(cb_mapper) + + for iname, count in six.iteritems(iname_to_reduction_count): + nonsimul_count = iname_to_nonsimultaneous_reduction_count.get(iname, 0) + + if nonsimul_count and count > 1: + raise LoopyError("iname '%s' used in more than one reduction. " + "(%d of them, to be precise.) " + "Since this usage can easily cause loop scheduling " + "problems, this is prohibited by default. " + "Use loopy.make_reduction_inames_unique() to fix this. " + "If you are sure that this is OK, write the reduction " + "as 'simul_reduce(...)' instead of 'reduce(...)'" + % (iname, count)) + +# }}} + + # {{{ infer types def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): @@ -677,11 +716,13 @@ def preprocess_kernel(kernel, device=None): kernel = expand_subst(kernel) # Ordering restriction: - # Type inference doesn't handle substitutions. Get them out of the - # way. + # Type inference and reduction iname uniqueness don't handle substitutions. + # Get them out of the way. 
kernel = infer_unknown_types(kernel, expect_completion=False) + check_reduction_iname_uniqueness(kernel) + kernel = add_default_dependencies(kernel) # Ordering restrictions: diff --git a/loopy/schedule.py b/loopy/schedule.py index 17c1ab3ce0bad0170f8f7b03e243862de33af21d..de71ffaed0d457f96ee6ddbdeeafac58c0959aa9 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -317,11 +317,40 @@ def group_insn_counts(kernel): return result + +def gen_dependencies_except(kernel, insn_id, except_insn_ids): + insn = kernel.id_to_insn[insn_id] + for dep_id in insn.depends_on: + + if dep_id in except_insn_ids: + continue + + yield dep_id + + for sub_dep_id in gen_dependencies_except(kernel, dep_id, except_insn_ids): + yield sub_dep_id + # }}} # {{{ debug help +def format_insn_id(kernel, insn_id): + Fore = kernel.options._fore + Style = kernel.options._style + return Fore.GREEN + insn_id + Style.RESET_ALL + + +def format_insn(kernel, insn_id): + insn = kernel.id_to_insn[insn_id] + Fore = kernel.options._fore + Style = kernel.options._style + return "[%s] %s%s%s <- %s%s%s" % ( + format_insn_id(kernel, insn_id), + Fore.BLUE, str(insn.assignee), Style.RESET_ALL, + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + + def dump_schedule(kernel, schedule): lines = [] indent = "" @@ -337,8 +366,7 @@ def dump_schedule(kernel, schedule): elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, Assignment): - insn_str = "[%s] %s <- %s" % ( - insn.id, str(insn.assignee), str(insn.expression)) + insn_str = format_insn(kernel, sched_item.insn_id) else: insn_str = sched_item.insn_id lines.append(indent + insn_str) @@ -482,6 +510,8 @@ def generate_loop_schedules_internal( # to give loops containing high-priority instructions a chance. 
kernel = sched_state.kernel + Fore = kernel.options._fore + Style = kernel.options._style if allow_boost is None: rec_allow_boost = None @@ -510,7 +540,7 @@ def generate_loop_schedules_internal( print(dump_schedule(sched_state.kernel, sched_state.schedule)) #print("boost allowed:", allow_boost) print(75*"=") - print("LOOP NEST MAP:") + print("LOOP NEST MAP (inner: outer):") for iname, val in six.iteritems(sched_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") @@ -549,7 +579,7 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: print("instruction '%s' is missing insn depedencies '%s'" % ( - insn.id, ",".join( + format_insn(kernel, insn.id), ",".join( insn.depends_on - sched_state.scheduled_insn_ids))) continue @@ -570,10 +600,10 @@ def generate_loop_schedules_internal( if debug_mode: if want-have: print("instruction '%s' is missing inames '%s'" - % (insn.id, ",".join(want-have))) + % (format_insn(kernel, insn.id), ",".join(want-have))) if have-want: print("instruction '%s' won't work under inames '%s'" - % (insn.id, ",".join(have-want))) + % (format_insn(kernel, insn.id), ",".join(have-want))) # {{{ determine group-based readiness @@ -595,7 +625,7 @@ def generate_loop_schedules_internal( # }}} if is_ready and debug_mode: - print("ready to schedule '%s'" % insn.id) + print("ready to schedule '%s'" % format_insn(kernel, insn.id)) if is_ready and not debug_mode: iid_set = frozenset([insn.id]) @@ -660,7 +690,38 @@ def generate_loop_schedules_internal( if last_entered_loop in kernel.insn_inames(insn): if debug_mode: print("cannot leave '%s' because '%s' still depends on it" - % (last_entered_loop, insn.id)) + % (last_entered_loop, format_insn(kernel, insn.id))) + + # check if there's a dependency of insn that needs to be + # outside of last_entered_loop. 
+ for subdep_id in gen_dependencies_except(kernel, insn_id, + sched_state.scheduled_insn_ids): + subdep = kernel.id_to_insn[subdep_id] + want = (kernel.insn_inames(subdep_id) + - sched_state.parallel_inames) + if ( + last_entered_loop not in want and + last_entered_loop not in subdep.boostable_into): + print( + "%(warn)swarning:%(reset_all)s '%(iname)s', " + "which the schedule is " + "currently stuck inside of, seems mis-nested. " + "'%(subdep)s' must occur " "before '%(dep)s', " + "but '%(subdep)s' must be outside " + "'%(iname)s', whereas '%(dep)s' must be back " + "in it.%(reset_all)s\n" + " %(subdep_i)s\n" + " %(dep_i)s" + % { + "warn": Fore.RED + Style.BRIGHT, + "reset_all": Style.RESET_ALL, + "iname": last_entered_loop, + "subdep": format_insn_id(kernel, subdep_id), + "dep": format_insn_id(kernel, insn_id), + "subdep_i": format_insn(kernel, subdep_id), + "dep_i": format_insn(kernel, insn_id), + }) + can_leave = False break diff --git a/loopy/statistics.py b/loopy/statistics.py index f0a01463a9045ad93694bf26cf95f6cd3393d1fc..ab0743f5b3ba54f214512d7be1ea4db8caffd2c8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -49,6 +49,7 @@ __doc__ = """ """ + # {{{ ToCountMap class ToCountMap: diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7adab80c68c38f900976eb1adcd90226f40a7d9b..b887c703420d092d7f3c0fc9c729dd1d1f942a76 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -79,7 +79,20 @@ class IdentityMapperMixin(object): return expr def map_reduction(self, expr, *args): - return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args)) + mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames] + + new_inames = [] + for iname, new_sym_iname in zip(expr.inames, mapped_inames): + if not isinstance(new_sym_iname, Variable): + from loopy.diagnostic import LoopyError + raise LoopyError("%s did not map iname '%s' to a variable" + % (type(self).__name__, iname)) + + new_inames.append(new_sym_iname.name) + + return 
Reduction( + expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): # leaf, doesn't change @@ -146,7 +159,8 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): - return "reduce(%s, [%s], %s)" % ( + return "%sreduce(%s, [%s], %s)" % ( + "simul_" if expr.allow_simultaneous else "", expr.operation, ", ".join(expr.inames), expr.expr) def map_tagged_variable(self, expr, prec): @@ -346,11 +360,16 @@ class Reduction(AlgebraicLeaf): The expression (as a :class:`pymbolic.primitives.Expression`) on which reduction is performed. + + .. attribute:: allow_simultaneous + + A :class:`bool`. If not *True*, an iname is allowed to be used + in precisely one reduction, to avoid mis-nesting errors. """ - init_arg_names = ("operation", "inames", "expr") + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") - def __init__(self, operation, inames, expr): + def __init__(self, operation, inames, expr, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -378,9 +397,10 @@ class Reduction(AlgebraicLeaf): self.operation = operation self.inames = inames self.expr = expr + self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr) + return (self.operation, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): return hash((self.__class__, self.operation, self.inames, @@ -779,7 +799,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. 
""" - def _parse_reduction(self, operation, inames, red_expr): + def _parse_reduction(self, operation, inames, red_expr, + allow_simultaneous=False): if isinstance(inames, Variable): inames = (inames,) @@ -795,7 +816,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr) + return Reduction(operation, tuple(processed_inames), red_expr, + allow_simultaneous=allow_simultaneous) def map_call(self, expr): from loopy.library.reduction import parse_reduction_op @@ -820,7 +842,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: raise TypeError("cse takes two arguments") - elif name == "reduce": + elif name in ["reduce", "simul_reduce"]: if len(expr.parameters) == 3: operation, inames, red_expr = expr.parameters @@ -829,7 +851,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): "must be a symbol") operation = parse_reduction_op(operation.name) - return self._parse_reduction(operation, inames, self.rec(red_expr)) + return self._parse_reduction(operation, inames, self.rec(red_expr), + allow_simultaneous=(name == "simul_reduce")) else: raise TypeError("invalid 'reduce' calling sequence") diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 64332637910340d68cb035d64ad6f4f643c0b5c9..7b1deb7951392e2e0c46360f8fd979ebf5aedb37 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -353,6 +353,26 @@ def remove_unused_arguments(knl): for insn in exp_knl.instructions: refd_vars.update(insn.dependency_names()) + from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag + from loopy.symbolic import get_dependencies + from itertools import chain + + def tolerant_get_deps(expr): + if expr is None or expr is lp.auto: + return set() + return get_dependencies(expr) + + for ary in chain(knl.args, six.itervalues(knl.temporary_variables)): + if isinstance(ary, ArrayBase): + refd_vars.update( + tolerant_get_deps(ary.shape) + | 
tolerant_get_deps(ary.offset)) + + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, FixedStrideArrayDimTag): + refd_vars.update( + tolerant_get_deps(dim_tag.stride)) + for arg in knl.args: if arg.name in refd_vars: new_args.append(arg) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index bf435d3fe08d022790bf31a4d583d4923f0bfeff..e44f8abe227d451e8e940708530f6c20566685e8 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb): from pymbolic.imperative.transform import \ fuse_instruction_streams_with_unique_ids - new_instructions, _ = fuse_instruction_streams_with_unique_ids( - knla.instructions, knlb.instructions) + new_instructions, old_b_id_to_new_b_id = \ + fuse_instruction_streams_with_unique_ids( + knla.instructions, knlb.instructions) # {{{ fuse assumptions @@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb): "target", knla.target, knlb.target), - options=knla.options) + options=knla.options), old_b_id_to_new_b_id # }}} -def fuse_kernels(kernels, suffixes=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None): :arg suffixes: If given, must be a list of strings of a length matching that of *kernels*. This will be used to disambiguate the names of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. The components of the kernels are fused as follows: @@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None): * The resulting kernel will contain all instructions from each entry of *kernels*. Clashing instruction IDs will be renamed to ensure uniqueness. + + .. 
versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 """ kernels = list(kernels) + if data_flow is None: + data_flow = [] + if suffixes: suffixes = list(suffixes) if len(suffixes) != len(kernels): @@ -356,9 +369,46 @@ def fuse_kernels(kernels, suffixes=None): # }}} - result = kernels.pop(0) - while kernels: - result = _fuse_two_kernels(result, kernels.pop(0)) + kernel_insn_ids = [] + result = None + + for knlb in kernels: + if result is None: + result = knlb + kernel_insn_ids.append([ + insn.id for insn in knlb.instructions]) + else: + result, old_b_id_to_new_b_id = _fuse_two_kernels( + knla=result, + knlb=knlb) + + kernel_insn_ids.append([ + old_b_id_to_new_b_id[insn.id] + for insn in knlb.instructions]) + + # {{{ realize data_flow dependencies + + id_to_insn = result.id_to_insn.copy() + + for var_name, from_kernel, to_kernel in data_flow: + from_writer_ids = frozenset( + insn_id + for insn_id in kernel_insn_ids[from_kernel] + if var_name in id_to_insn[insn_id].assignee_var_names()) + + for insn_id in kernel_insn_ids[to_kernel]: + insn = id_to_insn[insn_id] + if var_name in insn.read_dependency_names(): + insn = insn.copy(depends_on=insn.depends_on | from_writer_ids) + + id_to_insn[insn_id] = insn + + result = result.copy(instructions=[ + id_to_insn[insn_id] + for insn_ids in kernel_insn_ids + for insn_id in insn_ids]) + + # }}} return result diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c98ed26b41d69189edc8253449b226524a365367..b42b338a6856a0603b76d44604c7ea3da2f065a5 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -66,6 +66,8 @@ __doc__ = """ .. autofunction:: find_unused_axis_tag +.. 
autofunction:: make_reduction_inames_unique + """ @@ -119,7 +121,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -444,7 +447,8 @@ class _InameJoiner(RuleAwareSubstitutionMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameJoiner, self).map_reduction(expr, expn_state) @@ -676,7 +680,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, new_inames, - self.rec(expr.expr, expn_state)) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous) else: return super(_InameDuplicator, self).map_reduction(expr, expn_state) @@ -1074,11 +1079,14 @@ class _ReductionSplitter(RuleAwareIdentityMapper): if self.direction == "in": return Reduction(expr.operation, tuple(leftover_inames), Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state))) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous), + expr.allow_simultaneous) elif self.direction == "out": return Reduction(expr.operation, tuple(self.inames), Reduction(expr.operation, tuple(leftover_inames), - self.rec(expr.expr, expn_state))) + self.rec(expr.expr, expn_state), + expr.allow_simultaneous), expr.allow_simultaneous) else: assert False else: @@ -1399,4 +1407,108 @@ def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count): # }}} + +# {{{ make_reduction_inames_unique + +class _ReductionInameUniquifier(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, inames, within): + super(_ReductionInameUniquifier, self).__init__(rule_mapping_context) + + self.inames = inames + self.old_to_new = 
[] + self.within = within + + self.iname_to_red_count = {} + self.iname_to_nonsimultaneous_red_count = {} + + def map_reduction(self, expr, expn_state): + within = self.within( + expn_state.kernel, + expn_state.instruction, + expn_state.stack) + + for iname in expr.inames: + self.iname_to_red_count[iname] = ( + self.iname_to_red_count.get(iname, 0) + 1) + if not expr.allow_simultaneous: + self.iname_to_nonsimultaneous_red_count[iname] = ( + self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1) + + if within and not expr.allow_simultaneous: + subst_dict = {} + + from pymbolic import var + + new_inames = [] + for iname in expr.inames: + if ( + not (self.inames is None or iname in self.inames) + or + self.iname_to_red_count[iname] <= 1): + new_inames.append(iname) + continue + + new_iname = self.rule_mapping_context.make_unique_var_name(iname) + subst_dict[iname] = var(new_iname) + self.old_to_new.append((iname, new_iname)) + new_inames.append(new_iname) + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import Reduction + return Reduction(expr.operation, tuple(new_inames), + self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + expr.expr), + expn_state), + expr.allow_simultaneous) + else: + return super(_ReductionInameUniquifier, self).map_reduction( + expr, expn_state) + + +def make_reduction_inames_unique(kernel, inames=None, within=None): + """ + :arg inames: if not *None*, only apply to these inames + :arg within: a stack match as understood by + :func:`loopy.context_matching.parse_stack_match`. + + .. 
versionadded:: 2016.2 + """ + + name_gen = kernel.get_var_name_generator() + + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + # {{{ change kernel + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, name_gen) + r_uniq = _ReductionInameUniquifier(rule_mapping_context, + inames, within=within) + + kernel = rule_mapping_context.finish_kernel( + r_uniq.map_kernel(kernel)) + + # }}} + + # {{{ duplicate the inames + + for old_iname, new_iname in r_uniq.old_to_new: + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel, frozenset([old_iname])) + + from loopy.isl_helpers import duplicate_axes + kernel = kernel.copy( + domains=domch.get_domains_with( + duplicate_axes(domch.domain, [old_iname], [new_iname]))) + + # }}} + + return kernel + +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 9ce1f9c5420e8dbbc0d3a94b6c465e443c5f109d..e599c902227faf8d1292ece2307d097bc8fd7c19 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -349,7 +349,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, usage_to_definition = {} - for insn in kernel.instructions: + for insn in dep_kernel.instructions: if lhs_name not in insn.read_dependency_names(): continue diff --git a/loopy/version.py b/loopy/version.py index 9ad8ac19bebff7a712e91900815057155205ae57..adc069663503b200bcdd1638c05ae0ffae5f14df 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v18-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index 4fbb4e142b0680e28e88282409655c5a644a6f4e..5ed095315234339709309a1c55ec88c7fdab6bfa 100644 --- a/setup.py +++ b/setup.py @@ -40,8 +40,9 @@ setup(name="loo.py", "pytools>=2016.1", "pymbolic>=2016.2", "cgen>=2016.1", - 
"islpy>=2016.1", + "islpy>=2016.1.2", "six>=1.8.0", + "colorama", ], extras_require={ diff --git a/test/test_dg.py b/test/test_dg.py index 63a961423d2f750a4c9a25fdcb5fb56a479d8a35..fafef86c35211183ebdaeb75acf2b664a36586a0 100644 --- a/test/test_dg.py +++ b/test/test_dg.py @@ -52,10 +52,10 @@ def test_dg_volume(ctx_factory): "{[n,m,k]: 0<= n,m < Np and 0<= k < K}", ], """ - <> du_drst = sum(m, DrDsDt[n,m]*u[k,m]) - <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m]) - <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m]) - <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m]) + <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m]) + <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m]) + <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m]) + <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m]) # volume flux rhsu[k,n] = dot(drst_dx[k],dp_drst) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1fed3289aac1c184b2267e3425aed2d8023f9a03..606eec7667ca3d76a215c3b487e8c93bc371c36e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -944,6 +944,27 @@ def test_double_sum(ctx_factory): n = 20 + knl = lp.make_kernel( + "{[i,j]: 0<=i,j {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e