diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 8674434084077ba1f791c46123a083346715209e..9138d9a41d7b33db956fd8aba55c0b3b788db064 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -5,8 +5,8 @@ Reference: Loopy's Model of a Kernel .. _domain-tree: -Loop Domain Tree ----------------- +Loop Domain Forest +------------------ .. {{{ @@ -29,10 +29,29 @@ Note that *n* in the example is not an iname. It is a :ref:`domain-parameters` that is passed to the kernel by the user. To accommodate some data-dependent control flow, there is not actually -a single loop domain, but rather a *tree of loop domains*, -allowing more deeply nested domains to depend on inames +a single loop domain, but rather a *forest of loop domains* (a collection +of trees) allowing more deeply nested domains to depend on inames introduced by domains closer to the root. +Here is an example:: + + { [l] : 0 <= l <= 2 } + { [i] : start <= i < end } + { [j] : start <= j < end } + +The i and j domains are "children" of the l domain (visible from indentation). +This is also how :mod:`loopy` prints the domain forest, to make the parent/child +relationship visible. In the example, the parameters start/end might be read +inside of the 'l' loop. + +The idea is that domains form a forest (a collection of trees), and a +"sub-forest" is extracted that covers all the inames for each +instruction. Each individual sub-tree is then checked for branching, +which is ill-formed. It is declared ill-formed because intersecting, in +the above case, the l, i, and j domains could result in restrictions from the +i domain affecting the j domain by way of how i affects l--which would +be counterintuitive to say the least.) + .. 
_inames: Inames diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2..92ec799f7045cf63dc75d1386d8a51fd7d42954c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution. By default, :mod:`loopy` inserts local barriers between two instructions when it detects that a dependency involving local memory may occur across work items. To -see this in action, take a look at the section on :ref:`local_temporaries`. +see this in action, take a look at the section on :ref:`local_temporaries`. In contrast, :mod:`loopy` will *not* insert global barriers automatically. Global barriers require manual intervention along with some special @@ -1308,7 +1308,7 @@ tagged, as in the following example:: assumptions="n>0") .. [#global-barrier-note] In particular, this is *not* the same as a call to - ``barrier(CLK_GLOBAL_MEM_FENCE)``. + ``barrier(CLK_GLOBAL_MEM_FENCE)``. .. }}} @@ -1533,12 +1533,12 @@ information provided. 
Now we will count the operations: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } <BLANKLINE> :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1596,9 +1596,9 @@ together into keys containing only the specified fields: >>> op_map_dtype = op_map.group_by('dtype') >>> print(lp.stringify_stats_mapping(op_map_dtype)) - Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 } - Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 } - Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } <BLANKLINE> >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) ... 
].eval_with_dict(param_dict) @@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } <BLANKLINE> :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** @@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 
0 and l > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } <BLANKLINE> >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 } - MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 } + MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } <BLANKLINE> >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... 
outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... } <BLANKLINE> With this parallelization, consecutive threads will access consecutive array @@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... 
} + MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... } <BLANKLINE> With this parallelization, consecutive threads will access *nonconsecutive* diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 82ff2e60dee345fe16771d09cb39d2d56e9f493d..7c5de5a1b1d7042498a12204959a59021ac5e0d8 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -26,5 +26,5 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") evt, (out,) = knl(queue, a=a) # ENDEXAMPLE -cknl = lp.CompiledKernel(ctx, knl) -print(cknl.get_highlighted_code({"a": np.float32})) +knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)}) +print(lp.generate_code_v2(knl).device_code()) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 0860caa5be8345b40531bdec59746248769c2039..a91eb51a0a90e987ec05a8ebf71a6759e047f5d3 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -519,9 +519,13 @@ def auto_test_vs_ref( args = None from loopy.kernel import kernel_state + from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ kernel_state.PREPROCESSED, kernel_state.SCHEDULED]: + if isinstance(test_knl.target, PyOpenCLTarget): + test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) + test_knl = lp.preprocess_kernel(test_knl) if not test_knl.schedule: diff --git a/loopy/check.py b/loopy/check.py index d1ba1ab1aec0dd72acc4c268417e9b0e6653e55f..741195ae6ac87d01de3a4ac620ce510fd62ff470 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -354,7 +354,7 @@ def check_has_schedulable_iname_nesting(kernel): def pre_schedule_checks(kernel): try: - logger.info("%s: pre-schedule check: start" % kernel.name) + logger.debug("%s: pre-schedule check: start" % 
kernel.name) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) @@ -367,7 +367,7 @@ def pre_schedule_checks(kernel): check_write_destinations(kernel) check_has_schedulable_iname_nesting(kernel) - logger.info("%s: pre-schedule check: done" % kernel.name) + logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: raise except: @@ -505,22 +505,23 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.schedule.tools import InstructionQuery from loopy.kernel.data import temp_var_scope + from loopy.kernel.tools import get_subkernels - insn_query = InstructionQuery(kernel) - - for subkernel in insn_query.subkernels(): + for subkernel in get_subkernels(kernel): defined_base_storage = set() - for temporary in insn_query.temporaries_written_in_subkernel(subkernel): + from loopy.schedule.tools import ( + temporaries_written_in_subkernel, temporaries_read_in_subkernel) + + for temporary in temporaries_written_in_subkernel(kernel, subkernel): tval = kernel.temporary_variables[temporary] if tval.base_storage is not None: defined_base_storage.add(tval.base_storage) for temporary in ( - insn_query.temporaries_read_in_subkernel(subkernel) - - insn_query.temporaries_written_in_subkernel(subkernel)): + temporaries_read_in_subkernel(kernel, subkernel) - + temporaries_written_in_subkernel(kernel, subkernel)): tval = kernel.temporary_variables[temporary] if tval.initializer is not None: @@ -530,16 +531,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): if tval.base_storage is not None: if tval.base_storage not in defined_base_storage: from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used " - "in subkernel '%s' and neither it nor its aliases have a " - "definition" % 
(temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets " + "used in subkernel '%s' and neither it nor its " + "aliases have a definition" % (temporary, subkernel)) continue if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used in " - "subkernel '%s' without a definition (maybe you forgot to call " - "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets used " + "in subkernel '%s' without a definition (maybe you forgot " + "to call loopy.save_and_reload_temporaries?)" + % (temporary, subkernel)) # }}} @@ -616,7 +618,7 @@ def check_that_shapes_and_strides_are_arguments(kernel): def pre_codegen_checks(kernel): try: - logger.info("pre-codegen check %s: start" % kernel.name) + logger.debug("pre-codegen check %s: start" % kernel.name) check_for_unused_hw_axes_in_insns(kernel) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) @@ -625,7 +627,7 @@ def pre_codegen_checks(kernel): kernel.target.pre_codegen_check(kernel) check_that_shapes_and_strides_are_arguments(kernel) - logger.info("pre-codegen check %s: done" % kernel.name) + logger.debug("pre-codegen check %s: done" % kernel.name) except: print(75*"=") print("failing kernel during pre-schedule check:") diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 7cc381f11d1239cba5656a9dc7a04cddaa14a368..61f4b3a9b8c38dfc25ebc81243812aa963423f8a 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -63,13 +63,20 @@ def get_usable_inames_for_conditional(kernel, sched_index): result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) - # Find our containing subkernel, grab inames for all insns from there. 
- - subkernel_index = sched_index - from loopy.schedule import CallKernel - - while not isinstance(kernel.schedule[subkernel_index], CallKernel): - subkernel_index -= 1 + # Find our containing subkernel. Grab inames for all insns from there. + within_subkernel = False + + for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]): + from loopy.schedule import CallKernel, ReturnFromKernel + if isinstance(sched_item, CallKernel): + within_subkernel = True + subkernel_index = sched_item_index + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + + if not within_subkernel: + # Outside all subkernels - use only inames available to host. + return frozenset(result) insn_ids_for_subkernel = get_insn_ids_for_block_at( kernel.schedule, subkernel_index) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 3ef7c8f6ad6c8af09dd01bf9e1341179d2be0be7..e590502fb5813af0a820d45228de8e11c35a46c8 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -200,7 +200,7 @@ def generate_assignment_instruction_code(codegen_state, insn): "(%s).y" % lhs_code]) if printf_args: - printf_args_str = ", " + ", ".join(printf_args) + printf_args_str = ", " + ", ".join(str(v) for v in printf_args) else: printf_args_str = "" diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError): class UnscheduledInstructionError(LoopyError): pass + +class ReductionIsNotTriangularError(LoopyError): + pass + # }}} diff --git a/loopy/execution.py b/loopy/execution.py new file mode 100644 index 0000000000000000000000000000000000000000..dac5b2ff80767ae00c126aba31c2851cfe3769ef --- /dev/null +++ b/loopy/execution.py @@ -0,0 +1,234 @@ +from __future__ import division, with_statement, absolute_import + +__copyright__ = 
"Copyright (C) 2012-16 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six +import numpy as np +from pytools import ImmutableRecord, memoize_method +from loopy.diagnostic import LoopyError + +import logging +logger = logging.getLogger(__name__) + +from pytools.persistent_dict import PersistentDict +from loopy.tools import LoopyKeyBuilder +from loopy.version import DATA_MODEL_VERSION + + +# {{{ object array argument packing + +class _PackingInfo(ImmutableRecord): + """ + .. attribute:: name + .. attribute:: sep_shape + + .. attribute:: subscripts_and_names + + A list of type ``[(index, unpacked_name), ...]``. 
+ """ + + +class SeparateArrayPackingController(object): + """For argument arrays with axes tagged to be implemented as separate + arrays, this class provides preprocessing of the incoming arguments so that + all sub-arrays may be passed in one object array (under the original, + un-split argument name) and are unpacked into separate arrays before being + passed to the kernel. + + It also repacks outgoing arrays of this type back into an object array. + """ + + def __init__(self, kernel): + # map from arg name + self.packing_info = {} + + from loopy.kernel.array import ArrayBase + for arg in kernel.args: + if not isinstance(arg, ArrayBase): + continue + + if arg.shape is None or arg.dim_tags is None: + continue + + subscripts_and_names = arg.subscripts_and_names() + + if subscripts_and_names is None: + continue + + self.packing_info[arg.name] = _PackingInfo( + name=arg.name, + sep_shape=arg.sep_shape(), + subscripts_and_names=subscripts_and_names, + is_written=arg.name in kernel.get_written_variables()) + + def unpack(self, kernel_kwargs): + if not self.packing_info: + return kernel_kwargs + + kernel_kwargs = kernel_kwargs.copy() + + for packing_info in six.itervalues(self.packing_info): + arg_name = packing_info.name + if packing_info.name in kernel_kwargs: + arg = kernel_kwargs[arg_name] + for index, unpacked_name in packing_info.subscripts_and_names: + assert unpacked_name not in kernel_kwargs + kernel_kwargs[unpacked_name] = arg[index] + del kernel_kwargs[arg_name] + + return kernel_kwargs + + def pack(self, outputs): + if not self.packing_info: + return outputs + + for packing_info in six.itervalues(self.packing_info): + if not packing_info.is_written: + continue + + result = outputs[packing_info.name] = \ + np.zeros(packing_info.sep_shape, dtype=np.object) + + for index, unpacked_name in packing_info.subscripts_and_names: + result[index] = outputs.pop(unpacked_name) + + return outputs + +# }}} + + +# {{{ KernelExecutorBase + +typed_and_scheduled_cache = 
PersistentDict( + "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + +class KernelExecutorBase(object): + """An object connecting a kernel to a :class:`pyopencl.Context` + for execution. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, kernel): + """ + :arg kernel: a loopy.LoopKernel + """ + + self.kernel = kernel + + self.packing_controller = SeparateArrayPackingController(kernel) + + self.output_names = tuple(arg.name for arg in self.kernel.args + if arg.name in self.kernel.get_written_variables()) + + self.has_runtime_typed_args = any( + arg.dtype is None + for arg in kernel.args) + + def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + from loopy.kernel.tools import add_dtypes + + kernel = self.kernel + + if arg_to_dtype_set: + var_to_dtype = {} + for var, dtype in arg_to_dtype_set: + try: + dest_name = kernel.impl_arg_to_arg[var].name + except KeyError: + dest_name = var + + try: + var_to_dtype[dest_name] = dtype + except KeyError: + raise LoopyError("cannot set type for '%s': " + "no known variable/argument with that name" + % var) + + kernel = add_dtypes(kernel, var_to_dtype) + + from loopy.type_inference import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + if kernel.schedule is None: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + + return kernel + + @memoize_method + def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + from loopy import CACHING_ENABLED + + cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set) + if CACHING_ENABLED: + try: + return typed_and_scheduled_cache[cache_key] + except KeyError: + pass + + logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + + kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + + if 
CACHING_ENABLED: + typed_and_scheduled_cache[cache_key] = kernel + + return kernel + + def arg_to_dtype_set(self, kwargs): + if not self.has_runtime_typed_args: + return None + + from loopy.types import NumpyType + target = self.kernel.target + + impl_arg_to_arg = self.kernel.impl_arg_to_arg + arg_to_dtype = {} + for arg_name, val in six.iteritems(kwargs): + arg = impl_arg_to_arg.get(arg_name, None) + + if arg is None: + # offsets, strides and such + continue + + if arg.dtype is None and val is not None: + try: + dtype = val.dtype + except AttributeError: + pass + else: + arg_to_dtype[arg_name] = NumpyType(dtype, target) + + return frozenset(six.iteritems(arg_to_dtype)) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 0ebe90fbca0d31c05eaee64321e2b73709292331..5f0884fd44ed5064f3f195d103b164f2163d1d19 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -594,37 +594,67 @@ def get_simple_strides(bset, key_by="name"): """ result = {} - lspace = bset.get_local_space() - for idiv in range(lspace.dim(dim_type.div)): - div = lspace.get_div(idiv) + comp_div_set_pieces = convexify(bset.compute_divs()).get_basic_sets() + assert len(comp_div_set_pieces) == 1 + bset, = comp_div_set_pieces + + def _get_indices_and_coeffs(obj, dts): + result = [] + for dt in dts: + for dim_idx in range(obj.dim(dt)): + coeff_val = obj.get_coefficient_val(dt, dim_idx) + if not coeff_val.is_zero(): + result.append((dt, dim_idx, coeff_val)) - # check for sub-divs - supported = True - for dim_idx in range(div.dim(dim_type.div)): - coeff_val = div.get_coefficient_val(dim_type.div, dim_idx) - if not coeff_val.is_zero(): - # sub-divs not supported - supported = False - break + return result + + for cns in bset.get_constraints(): + if not cns.is_equality(): + continue + aff = cns.get_aff() - if not supported: + # recognizes constraints of the form + # -i0 + 2*floor((i0)/2) == 0 + + if aff.dim(dim_type.div) != 1: + continue + + idiv = 0 + div = 
aff.get_div(idiv) + + # check for sub-divs + if _get_indices_and_coeffs(div, [dim_type.div]): + # found one -> not supported continue denom = div.get_denominator_val().to_python() - inames_and_coeffs = [] - for dt in [dim_type.param, dim_type.in_]: - for dim_idx in range(div.dim(dt)): - coeff_val = div.get_coefficient_val(dt, dim_idx) * denom - if not coeff_val.is_zero(): - inames_and_coeffs.append((dt, dim_idx, coeff_val)) + # if the coefficient in front of the div is not the same as the denominator + if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one(): + # not supported + continue + + inames_and_coeffs = _get_indices_and_coeffs( + div, [dim_type.param, dim_type.in_]) if len(inames_and_coeffs) != 1: continue (dt, dim_idx, coeff), = inames_and_coeffs - if coeff != 1: + if not (coeff * denom).is_one(): + # not supported + continue + + inames_and_coeffs = _get_indices_and_coeffs( + aff, [dim_type.param, dim_type.in_]) + + if len(inames_and_coeffs) != 1: + continue + + (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs + if (not outer_coeff.neg().is_one() + or (outer_dt, outer_dim_idx) != (dt, dim_idx)): # not supported continue diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 324f7da1a21de0115ea060ff7ef55e52ab0913d4..622f5e49be1e40b4156113d92907fe8b1d9fb859 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -44,33 +44,49 @@ from loopy.diagnostic import CannotBranchDomainTree, LoopyError # {{{ unique var names -def _is_var_name_conflicting_with_longer(name_a, name_b): - # Array dimensions implemented as separate arrays generate - # names by appending '_s<NUMBER>'. Make sure that no - # conflicts can arise from these names. +class _UniqueVarNameGenerator(UniqueNameGenerator): - # Only deal with the case of b longer than a. 
- if not name_b.startswith(name_a): - return False + def __init__(self, existing_names=set(), forced_prefix=""): + super(_UniqueVarNameGenerator, self).__init__(existing_names, forced_prefix) + array_prefix_pattern = re.compile("(.*)_s[0-9]+$") - return re.match("^%s_s[0-9]+" % re.escape(name_b), name_a) is not None + array_prefixes = set() + for name in existing_names: + match = array_prefix_pattern.match(name) + if match is None: + continue + array_prefixes.add(match.group(1)) -def _is_var_name_conflicting(name_a, name_b): - if name_a == name_b: - return True + self.conflicting_array_prefixes = array_prefixes + self.array_prefix_pattern = array_prefix_pattern - return ( - _is_var_name_conflicting_with_longer(name_a, name_b) - or _is_var_name_conflicting_with_longer(name_b, name_a)) + def _name_added(self, name): + match = self.array_prefix_pattern.match(name) + if match is None: + return + self.conflicting_array_prefixes.add(match.group(1)) -class _UniqueVarNameGenerator(UniqueNameGenerator): def is_name_conflicting(self, name): - from pytools import any - return any( - _is_var_name_conflicting(name, other_name) - for other_name in self.existing_names) + if name in self.existing_names: + return True + + # Array dimensions implemented as separate arrays generate + # names by appending '_s<NUMBER>'. Make sure that no + # conflicts can arise from these names. 
+ + # Case 1: a_s0 is already a name; we are trying to insert a + # Case 2: a is already a name; we are trying to insert a_s0 + + if name in self.conflicting_array_prefixes: + return True + + match = self.array_prefix_pattern.match(name) + if match is None: + return False + + return match.group(1) in self.existing_names # }}} @@ -599,8 +615,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # nothin' new continue - domain_parents = [home_domain_index] + ppd[home_domain_index] - current_root = domain_parents[-1] + domain_path_to_root = [home_domain_index] + ppd[home_domain_index] + current_root = domain_path_to_root[-1] previous_leaf = root_to_leaf.get(current_root) if previous_leaf is not None: @@ -610,8 +626,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # it can introduce artificial restrictions on variables # further up the tree. - prev_parents = set(ppd[previous_leaf]) - if not prev_parents <= set(domain_parents): + prev_path_to_root = set([previous_leaf] + ppd[previous_leaf]) + if not prev_path_to_root <= set(domain_path_to_root): raise CannotBranchDomainTree("iname set '%s' requires " "branch in domain tree (when adding '%s')" % (", ".join(inames), iname)) @@ -620,7 +636,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): pass root_to_leaf[current_root] = home_domain_index - domain_indices.update(domain_parents) + domain_indices.update(domain_path_to_root) return list(root_to_leaf.values()) @@ -1095,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return embedding - def stringify(self, what=None, with_dependencies=False): + def stringify(self, what=None, with_dependencies=False, use_separators=True, + show_labels=True): all_what = set([ "name", "arguments", @@ -1134,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): kernel = self - sep = 75*"-" + if use_separators: + sep = [75*"-"] + else: + sep = [] def natorder(key): # Return natural ordering for strings, as opposed to dictionary order. 
@@ -1151,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling): return sorted(seq, key=lambda y: natorder(key(y))) if "name" in what: - lines.append(sep) + lines.extend(sep) lines.append("KERNEL: " + kernel.name) if "arguments" in what: - lines.append(sep) - lines.append("ARGUMENTS:") + lines.extend(sep) + if show_labels: + lines.append("ARGUMENTS:") for arg_name in natsorted(kernel.arg_dict): lines.append(str(kernel.arg_dict[arg_name])) if "domains" in what: - lines.append(sep) - lines.append("DOMAINS:") + lines.extend(sep) + if show_labels: + lines.append("DOMAINS:") for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()): lines.append(len(parents)*" " + str(dom)) if "tags" in what: - lines.append(sep) - lines.append("INAME IMPLEMENTATION TAGS:") + lines.extend(sep) + if show_labels: + lines.append("INAME IMPLEMENTATION TAGS:") for iname in natsorted(kernel.all_inames()): line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname)) lines.append(line) if "variables" in what and kernel.temporary_variables: - lines.append(sep) - lines.append("TEMPORARIES:") + lines.extend(sep) + if show_labels: + lines.append("TEMPORARIES:") for tv in natsorted(six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): lines.append(str(tv)) if "rules" in what and kernel.substitutions: - lines.append(sep) - lines.append("SUBSTIUTION RULES:") + lines.extend(sep) + if show_labels: + lines.append("SUBSTIUTION RULES:") for rule_name in natsorted(six.iterkeys(kernel.substitutions)): lines.append(str(kernel.substitutions[rule_name])) if "instructions" in what: - lines.append(sep) - lines.append("INSTRUCTIONS:") + lines.extend(sep) + if show_labels: + lines.append("INSTRUCTIONS:") loop_list_width = 35 # {{{ topological sort @@ -1303,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on))) if "Dependencies" in what and dep_lines: - lines.append(sep) - lines.append("DEPENDENCIES: " - 
"(use loopy.show_dependency_graph to visualize)") + lines.extend(sep) + if show_labels: + lines.append("DEPENDENCIES: " + "(use loopy.show_dependency_graph to visualize)") lines.extend(dep_lines) if "schedule" in what and kernel.schedule is not None: - lines.append(sep) - lines.append("SCHEDULE:") + lines.extend(sep) + if show_labels: + lines.append("SCHEDULE:") from loopy.schedule import dump_schedule lines.append(dump_schedule(kernel, kernel.schedule)) - lines.append(sep) + lines.extend(sep) return "\n".join(lines) @@ -1384,17 +1412,35 @@ class LoopKernel(ImmutableRecordWithoutPickling): result.pop("cache_manager", None) - return result + # make sure that kernels are pickled with a cached hash key in place + from loopy.tools import LoopyKeyBuilder + LoopyKeyBuilder()(self) + + return (result, self._pytools_persistent_hash_digest) def __setstate__(self, state): + attribs, p_hash_digest = state + new_fields = set() - for k, v in six.iteritems(state): + for k, v in six.iteritems(attribs): setattr(self, k, v) new_fields.add(k) self.register_fields(new_fields) + if 0: + # {{{ check that 'reconstituted' object has same hash + + from loopy.tools import LoopyKeyBuilder + LoopyKeyBuilder()(self) + + assert p_hash_digest == self._pytools_persistent_hash_digest + + # }}} + else: + self._pytools_persistent_hash_digest = p_hash_digest + from loopy.kernel.tools import SetOperationCacheManager self.cache_manager = SetOperationCacheManager() self._kernel_executor_cache = {} diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0d22dbb88ed99c7c92480d1d39b924cc2198cc3f..08268ca9f27623a6d17a195d3c04acb55e5ec68a 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase): z[i] = z[i+1-1] + a {atomic} - :mod:`loopy` may to evaluate the right-hand side *multiple times* + :mod:`loopy` may choose to evaluate the right-hand side *multiple times* as part of a single assignment. 
It is up to the user to ensure that this retains correct semantics. diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 17226b63addb9e2e30d556730aa326d2ed59128c..ced1aaaa13ed8275c1e3a376d1c24895287b3239 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -48,20 +48,22 @@ def prepare_for_caching(kernel): import loopy as lp new_args = [] + tgt = kernel.target + for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto: - dtype = dtype.with_target(kernel.target) + if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + arg = arg.copy(dtype=dtype.with_target(kernel.target)) - new_args.append(arg.copy(dtype=dtype)) + new_args.append(arg) new_temporary_variables = {} for name, temp in six.iteritems(kernel.temporary_variables): dtype = temp.dtype - if dtype is not None and dtype is not lp.auto: - dtype = dtype.with_target(kernel.target) + if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + temp = temp.copy(dtype=dtype.with_target(tgt)) - new_temporary_variables[name] = temp.copy(dtype=dtype) + new_temporary_variables[name] = temp kernel = kernel.copy( args=new_args, @@ -274,7 +276,425 @@ def find_temporary_scope(kernel): # {{{ rewrite reduction to imperative form -# {{{ reduction utils +# {{{ utils (not stateful) + +from collections import namedtuple + + +_InameClassification = namedtuple("_InameClassifiction", + "sequential, local_parallel, nonlocal_parallel") + + +def _classify_reduction_inames(kernel, inames): + sequential = [] + local_par = [] + nonlocal_par = [] + + from loopy.kernel.data import ( + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) + + for iname in inames: + iname_tag = kernel.iname_to_tag.get(iname) + + if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. 
+ sequential.append(iname) + + elif isinstance(iname_tag, LocalIndexTagBase): + local_par.append(iname) + + elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + nonlocal_par.append(iname) + + else: + sequential.append(iname) + + return _InameClassification( + tuple(sequential), tuple(local_par), tuple(nonlocal_par)) + + +def _add_params_to_domain(domain, param_names): + dim_type = isl.dim_type + nparams_orig = domain.dim(dim_type.param) + domain = domain.add_dims(dim_type.param, len(param_names)) + + for param_idx, param_name in enumerate(param_names): + domain = domain.set_dim_name( + dim_type.param, param_idx + nparams_orig, param_name) + + return domain + + +def _move_set_to_param_dims_except(domain, except_dims): + dim_type = isl.dim_type + + iname_idx = 0 + for iname in domain.get_var_names(dim_type.set): + if iname not in except_dims: + domain = domain.move_dims( + dim_type.param, 0, + dim_type.set, iname_idx, 1) + iname_idx -= 1 + iname_idx += 1 + + return domain + + +def _domain_depends_on_given_set_dims(domain, set_dim_names): + set_dim_names = frozenset(set_dim_names) + + return any( + set_dim_names & set(constr.get_coefficients_by_name()) + for constr in domain.get_constraints()) + + +def _check_reduction_is_triangular(kernel, expr, scan_param): + """Check whether the reduction within `expr` with scan parameters described by + the structure `scan_param` is triangular. 
This attempts to verify that the + domain for the scan and sweep inames is as follows: + + [params] -> { + [other inames..., scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (scan_min_value + <= scan_iname + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + and + (irrelevant constraints) + } + """ + + orig_domain = kernel.get_inames_domain( + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) + + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname + affs = isl.affs_from_space(orig_domain.space) + + sweep_lower_bound = isl.align_spaces( + scan_param.sweep_lower_bound, + affs[0], + across_dim_types=True) + + sweep_upper_bound = isl.align_spaces( + scan_param.sweep_upper_bound, + affs[0], + across_dim_types=True) + + scan_lower_bound = isl.align_spaces( + scan_param.scan_lower_bound, + affs[0], + across_dim_types=True) + + from itertools import product + + for (sweep_lb_domain, sweep_lb_aff), \ + (sweep_ub_domain, sweep_ub_aff), \ + (scan_lb_domain, scan_lb_aff) in \ + product(sweep_lower_bound.get_pieces(), + sweep_upper_bound.get_pieces(), + scan_lower_bound.get_pieces()): + + # Assumptions inherited from the domains of the pwaffs + assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain + + # Sweep iname constraints + hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) + hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) + + # Scan iname constraints + hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) + hyp_domain &= affs[scan_iname].le_set( + scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) + + scan_lb_aff) + + hyp_domain, = (hyp_domain & assumptions).get_basic_sets() + test_domain, = (orig_domain & assumptions).get_basic_sets() + + hyp_gist_against_test = hyp_domain.gist(test_domain) + if _domain_depends_on_given_set_dims(hyp_gist_against_test, + (sweep_iname, scan_iname)): + return False, ( + "gist of hypothesis against test domain " + "has sweep or scan 
dependent constraints: '%s'" + % hyp_gist_against_test) + + test_gist_against_hyp = test_domain.gist(hyp_domain) + if _domain_depends_on_given_set_dims(test_gist_against_hyp, + (sweep_iname, scan_iname)): + return False, ( + "gist of test against hypothesis domain " + "has sweep or scan dependent constraint: '%s'" + % test_gist_against_hyp) + + return True, "ok" + + +_ScanCandidateParameters = namedtuple( + "_ScanCandidateParameters", + "sweep_iname, scan_iname, sweep_lower_bound, " + "sweep_upper_bound, scan_lower_bound, stride") + + +def _try_infer_scan_candidate_from_expr( + kernel, expr, within_inames, sweep_iname=None): + """Analyze `expr` and determine if it can be implemented as a scan. + """ + from loopy.symbolic import Reduction + assert isinstance(expr, Reduction) + + if len(expr.inames) != 1: + raise ValueError( + "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + + scan_iname, = expr.inames + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger(kernel, (scan_iname,)) + domain = dchg.get_original_domain() + + if sweep_iname is None: + try: + sweep_iname = _try_infer_sweep_iname( + domain, scan_iname, kernel.all_inames()) + except ValueError as v: + raise ValueError( + "Couldn't determine a sweep iname for the scan " + "expression '%s': %s" % (expr, v)) + + try: + sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( + _try_infer_scan_and_sweep_bounds( + kernel, scan_iname, sweep_iname, within_inames)) + except ValueError as v: + raise ValueError( + "Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + try: + stride = _try_infer_scan_stride( + kernel, scan_iname, sweep_iname, sweep_lower_bound) + except ValueError as v: + raise ValueError( + "Couldn't determine a scan stride for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + return 
_ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, + sweep_upper_bound, scan_lower_bound, stride) + + +def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): + """The sweep iname is the outer iname which guides the scan. + + E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname. + """ + constrs = domain.get_constraints() + sweep_iname_candidate = None + + for constr in constrs: + candidate_vars = set([ + var for var in constr.get_var_dict() + if var in candidate_inames]) + + # Irrelevant constraint - skip + if scan_iname not in candidate_vars: + continue + + # No additional inames - skip + if len(candidate_vars) == 1: + continue + + candidate_vars.remove(scan_iname) + + # Depends on more than one iname - error + if len(candidate_vars) > 1: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' found " + "(via constraint '%s')" % (scan_iname, constr)) + + next_candidate = candidate_vars.pop() + + if sweep_iname_candidate is None: + sweep_iname_candidate = next_candidate + defining_constraint = constr + else: + # Check next_candidate consistency + if sweep_iname_candidate != next_candidate: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' " + "found (via constraints '%s', '%s')" % + (scan_iname, defining_constraint, constr)) + + if sweep_iname_candidate is None: + raise ValueError( + "Couldn't find any sweep iname candidates for " + "scan iname '%s'" % scan_iname) + + return sweep_iname_candidate + + +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) + domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) + + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] + + domain = domain.project_out_except( + within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) + + try: + 
with isl.SuppressedWarnings(domain.get_ctx()): + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) + + return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) + + +def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): + """The stride is the number of steps the scan iname takes per iteration + of the sweep iname. This is allowed to be an integer constant. + + E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6. + """ + dim_type = isl.dim_type + + domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname])) + domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,)) + + domain_with_sweep_param = domain_with_sweep_param.project_out_except( + (sweep_iname, scan_iname), (dim_type.set, dim_type.param)) + + scan_iname_idx = domain_with_sweep_param.find_dim_by_name( + dim_type.set, scan_iname) + + # Should be equal to k * sweep_iname, where k is the stride. 
+
+    try:
+        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
+            scan_iname_range = (
+                domain_with_sweep_param.dim_max(scan_iname_idx)
+                - domain_with_sweep_param.dim_min(scan_iname_idx)
+                ).gist(domain_with_sweep_param.params())
+    except isl.Error as e:
+        raise ValueError("isl error: '%s'" % e)
+
+    scan_iname_pieces = scan_iname_range.get_pieces()
+
+    if len(scan_iname_pieces) > 1:
+        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
+    elif len(scan_iname_pieces) == 0:
+        raise ValueError("empty range found for iname '%s'" % scan_iname)
+
+    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
+
+    if not scan_iname_constr.plain_is_universe():
+        raise ValueError("found constraints: %s" % scan_iname_constr)
+
+    if scan_iname_aff.dim(dim_type.div):
+        raise ValueError("aff has div: %s" % scan_iname_aff)
+
+    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
+
+    if len(coeffs) == 0:
+        try:
+            scan_iname_aff.get_constant_val()
+        except Exception:
+            raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
+
+        # If this point is reached we're assuming the domain is of the form
+        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
+        # this function returns will be verified later by
+        # _check_reduction_is_triangular().
+ return 1 + + if sweep_iname not in coeffs: + raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) + + stride = coeffs[sweep_iname] + + if not stride.is_int(): + raise ValueError("stride not an integer: %s" % stride) + + if not stride.is_pos(): + raise ValueError("stride not positive: %s" % stride) + + return stride.to_python() + + +def _get_domain_with_iname_as_param(domain, iname): + dim_type = isl.dim_type + + if domain.find_dim_by_name(dim_type.param, iname) >= 0: + return domain + + iname_idx = domain.find_dim_by_name(dim_type.set, iname) + + assert iname_idx >= 0, (iname, domain) + + return domain.move_dims( + dim_type.param, domain.dim(dim_type.param), + dim_type.set, iname_idx, 1) + + +def _create_domain_for_sweep_tracking(orig_domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): + dim_type = isl.dim_type + + subd = isl.BasicSet.universe(orig_domain.params().space) + + # Add tracking_iname and sweep iname. + + subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + + # Here we realize the domain: + # + # [..., i] -> { + # [j]: 0 <= j - l + # and + # j - l <= k * (i - m) + # and + # k * (i - m - 1) < j - l } + # where + # * i is the sweep iname + # * j is the tracking iname + # * k is the stride for the scan + # * l is the lower bound for the scan + # * m is the lower bound for the sweep iname + # + affs = isl.affs_from_space(subd.space) + + subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) + subd &= (affs[tracking_iname] - scan_min_value)\ + .le_set(stride * (affs[sweep_iname] - sweep_min_value)) + subd &= (affs[tracking_iname] - scan_min_value)\ + .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) + + # Move tracking_iname into a set dim (NOT sweep iname). + subd = subd.move_dims( + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + # Simplify (maybe). 
+ orig_domain_with_sweep_param = ( + _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + subd = subd.gist_params(orig_domain_with_sweep_param.params()) + + subd, = subd.get_basic_sets() + + return subd + def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): """ @@ -455,10 +875,22 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): return kernel.copy(temporary_variables=new_temporary_variables, instructions=new_instructions) + +def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): + # Intersect with inames, because we could have captured some kernel params + # in here too... + dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + idx, = kernel.get_leaf_domain_indices(dependent_inames) + domains.insert(idx + 1, subdomain) + # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): +def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, + automagic_scans_ok=False, force_scan=False, + force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -469,6 +901,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): If *insn_id_filter* is not given, all reductions in all instructions will be realized. + + If *automagic_scans_ok*, this function will attempt to rewrite triangular + reductions as scans automatically. + + If *force_scan* is *True*, this function will attempt to rewrite *all* + candidate reductions as scans and raise an error if this is not possible + (this is most useful combined with *insn_id_filter*). + + If *force_outer_iname_for_scan* is not *None*, this function will attempt + to realize candidate reductions as scans using the specified iname as the + outer (sweep) iname. 
""" logger.debug("%s: realize reduction" % kernel.name) @@ -480,6 +923,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() + inames_added_for_scan = set() + inames_to_remove = set() # {{{ helpers @@ -489,8 +934,44 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): else: return val + def preprocess_scan_arguments( + insn, expr, nresults, scan_iname, track_iname, + newly_generated_insn_id_set): + """Does iname substitution within scan arguments and returns a set of values + suitable to be passed to the binary op. Returns a tuple.""" + + if nresults > 1: + inner_expr = expr + + # In the case of a multi-argument scan, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. + if not isinstance(inner_expr, tuple): + get_args_insn_id = insn_id_gen( + "%s_%s_get" % (insn.id, "_".join(expr.inames))) + + inner_expr = expand_inner_reduction( + id=get_args_insn_id, + expr=inner_expr, + nresults=nresults, + depends_on=insn.depends_on, + within_inames=insn.within_inames | expr.inames, + within_inames_is_final=insn.within_inames_is_final) + + newly_generated_insn_id_set.add(get_args_insn_id) + + updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in inner_expr) + else: + updated_inner_exprs = ( + replace_var_within_expr(expr, scan_iname, track_iname),) + + return updated_inner_exprs + def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, within_inames_is_final): + # FIXME: use make_temporaries from pymbolic.primitives import Call from loopy.symbolic import Reduction assert isinstance(expr, (Call, Reduction)) @@ -530,20 +1011,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): reduction_dtypes): outer_insn_inames = 
temp_kernel.insn_inames(insn) - from pymbolic import var - acc_var_names = [ - var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_"+"_".join(expr.inames), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) - from loopy.kernel.data import TemporaryVariable, temp_var_scope + init_insn_depends_on = frozenset() - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) + global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -553,7 +1037,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), + depends_on=init_insn_depends_on, expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) @@ -567,17 +1051,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): reduction_insn_depends_on = set([init_id]) + # In the case of a multi-argument reduction, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. 
if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( "%s_%s_get" % (insn.id, "_".join(expr.inames))) reduction_expr = expand_inner_reduction( - id=get_args_insn_id, - expr=expr.expr, - nresults=nresults, - depends_on=insn.depends_on, - within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + id=get_args_insn_id, + expr=expr.expr, + nresults=nresults, + depends_on=insn.depends_on, + within_inames=update_insn_iname_deps, + within_inames_is_final=insn.within_inames_is_final) reduction_insn_depends_on.add(get_args_insn_id) else: @@ -626,6 +1113,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): v[iname].lt_set(v[0] + size)).get_basic_sets() return bs + def _make_slab_set_from_range(iname, lbound, ubound): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[iname].ge_set(v[0] + lbound) + & + v[iname].lt_set(v[0] + ubound)).get_basic_sets() + return bs + def map_reduction_local(expr, rec, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -650,6 +1145,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): _get_int_iname_size(oiname) for oiname in outer_local_inames) + from loopy.kernel.data import temp_var_scope + + neutral_var_names = make_temporaries( + name_based_on="neutral_"+red_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+red_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + # {{{ add separate iname to carry out the reduction # Doing this sheds any odd conditionals that may be active @@ -661,32 +1174,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # }}} - neutral_var_names = [ - var_name_gen("neutral_"+red_iname) - for i in range(nresults)] - acc_var_names 
= [ - var_name_gen("acc_"+red_iname) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) - - from loopy.kernel.data import TemporaryVariable, temp_var_scope - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=outer_local_iname_sizes + (size,), - dtype=dtype, - scope=temp_var_scope.LOCAL) - for name, dtype in zip(neutral_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - base_iname_deps = outer_insn_inames - frozenset(expr.inames) neutral = expr.operation.neutral_element(*arg_dtypes) - init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -711,6 +1201,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): transfer_depends_on = set([init_neutral_id, init_id]) + # In the case of a multi-argument reduction, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. 
if nresults > 1 and not isinstance(expr.expr, tuple): get_args_insn_id = insn_id_gen( "%s_%s_get" % (insn.id, red_iname)) @@ -745,9 +1238,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(transfer_depends_on) | insn.depends_on, - no_sync_with=frozenset( - [(insn_id, "any") for insn_id in transfer_depends_on])) + depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, + no_sync_with=frozenset([(init_id, "any")])) generated_insns.append(transfer_insn) cur_size = 1 @@ -759,6 +1251,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): istage = 0 while cur_size > 1: + new_size = cur_size // 2 assert new_size * 2 == cur_size @@ -807,6 +1300,351 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} + # {{{ utils (stateful) + + from pytools import memoize + + @memoize + def get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + tracking_iname): + domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) + + inames_added_for_scan.add(tracking_iname) + + new_domain = _create_domain_for_sweep_tracking(domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + + _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) + + return tracking_iname + + def replace_var_within_expr(expr, from_var, to_var): + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + temp_kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = RuleAwareSubstitutionMapper( + rule_mapping_context, + 
make_subst_func({from_var: var(to_var)}), + within=lambda *args: True) + + return mapper(expr, temp_kernel, None) + + def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + var_names = [ + var_name_gen(name_based_on.format(index=i)) + for i in range(nvars)] + + from loopy.kernel.data import TemporaryVariable + + for name, dtype in zip(var_names, dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=shape, + dtype=dtype, + scope=scope) + + return var_names + + # }}} + + # {{{ sequential scan + + def map_scan_seq(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): + outer_insn_inames = temp_kernel.insn_inames(insn) + inames_to_remove.add(scan_iname) + + track_iname = var_name_gen( + "{sweep_iname}__seq_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride, track_iname) + + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_" + scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) + + init_id = insn_id_gen( + "%s_%s_init" % (insn.id, "_".join(expr.inames))) + + init_insn_depends_on = frozenset() + + global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset( + (sweep_iname,) + expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + expression=expr.operation.neutral_element(*arg_dtypes)) + + generated_insns.append(init_insn) + + update_insn_depends_on = set([init_insn.id]) | insn.depends_on + + 
updated_inner_exprs = ( + preprocess_scan_arguments(insn, expr.expr, nresults, + scan_iname, track_iname, update_insn_depends_on)) + + update_id = insn_id_gen( + based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | set([track_iname]) + + scan_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs)), + depends_on=frozenset(update_insn_depends_on), + within_inames=update_insn_iname_deps, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final) + + generated_insns.append(scan_insn) + + new_insn_add_depends_on.add(scan_insn.id) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0] + else: + return acc_vars + + # }}} + + # {{{ local-parallel scan + + def map_scan_local(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, + sweep_min_value, scan_min_value, stride): + + scan_size = _get_int_iname_size(sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + + outer_insn_inames = temp_kernel.insn_inames(insn) + + from loopy.kernel.data import LocalIndexTagBase + outer_local_inames = tuple( + oiname + for oiname in outer_insn_inames + if isinstance( + kernel.iname_to_tag.get(oiname), + LocalIndexTagBase) + and oiname != sweep_iname) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(oiname) + for oiname in outer_local_inames) + + track_iname = var_name_gen( + "{sweep_iname}__pre_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + 
scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + track_iname) + + # {{{ add separate iname to carry out the scan + + # Doing this sheds any odd conditionals that may be active + # on our scan_iname. + + base_exec_iname = var_name_gen(sweep_iname + "__scan") + domains.append(_make_slab_set(base_exec_iname, scan_size)) + new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + + # }}} + + from loopy.kernel.data import temp_var_scope + + read_var_names = make_temporaries( + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+scan_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (scan_size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + read_vars = tuple(var(n) for n in read_var_names) + + base_iname_deps = (outer_insn_inames + - frozenset(expr.inames) - frozenset([sweep_iname])) + + neutral = expr.operation.neutral_element(*arg_dtypes) + + init_insn_depends_on = insn.depends_on + + global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on) + generated_insns.append(init_insn) + + transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on + + updated_inner_exprs = ( + preprocess_scan_arguments(insn, expr.expr, nresults, + scan_iname, track_iname, transfer_insn_depends_on)) + + from loopy.symbolic import Reduction + + from 
loopy.symbolic import pw_aff_to_expr + sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) + + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + + (var(sweep_iname) - sweep_min_value_expr,)] + for acc_var in acc_vars), + expression=Reduction( + operation=expr.operation, + inames=(track_iname,), + expr=_strip_if_scalar(acc_vars, updated_inner_exprs), + allow_simultaneous=False, + ), + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset(transfer_insn_depends_on), + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + + generated_insns.append(transfer_insn) + + prev_id = transfer_id + + istage = 0 + cur_size = 1 + + while cur_size < scan_size: + stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) + domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (scan_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + if cur_size == 1: + # Performance hack: don't add a barrier here with transfer_insn. + # NOTE: This won't work if the way that local inames + # are lowered changes. 
+ read_stage_insn = read_stage_insn.copy( + no_sync_with=( + read_stage_insn.no_sync_with + | frozenset([(transfer_id, "any")]))) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d" % (scan_iname, istage)) + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)) + ), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + ) + + generated_insns.append(write_stage_insn) + prev_id = write_stage_id + + cur_size *= 2 + istage += 1 + + new_insn_add_depends_on.add(prev_id) + new_insn_add_within_inames.add(sweep_iname) + + output_idx = var(sweep_iname) - sweep_min_value_expr + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0][outer_local_iname_vars + (output_idx,)] + else: + return [acc_var[outer_local_iname_vars + (output_idx,)] + for acc_var in acc_vars] + + # }}} + # {{{ seq/par dispatch def map_reduction(expr, rec, nresults=1): @@ -825,31 +1663,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - n_sequential = 0 - n_local_par = 0 + iname_classes = _classify_reduction_inames(temp_kernel, expr.inames) - from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) - for iname in expr.inames: - iname_tag = kernel.iname_to_tag.get(iname) + n_sequential = len(iname_classes.sequential) + n_local_par = len(iname_classes.local_parallel) + n_nonlocal_par = 
len(iname_classes.nonlocal_parallel) + + really_force_scan = force_scan and ( + len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan) + + def _error_if_force_scan_on(cls, msg): + if really_force_scan: + raise cls(msg) - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): - # These are nominally parallel, but we can live with - # them as sequential. - n_sequential += 1 + may_be_implemented_as_scan = False + if force_scan or automagic_scans_ok: + from loopy.diagnostic import ReductionIsNotTriangularError - elif isinstance(iname_tag, LocalIndexTagBase): - n_local_par += 1 + try: + # Try to determine scan candidate information (sweep iname, scan + # iname, etc). + scan_param = _try_infer_scan_candidate_from_expr( + temp_kernel, expr, outer_insn_inames, + sweep_iname=force_outer_iname_for_scan) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname '%s' " - "tagged '%s'" - % (iname, type(iname_tag).__name__)) + except ValueError as v: + error = str(v) else: - n_sequential += 1 + # Ensures the reduction is triangular (somewhat expensive). + may_be_implemented_as_scan, error = ( + _check_reduction_is_triangular( + temp_kernel, expr, scan_param)) + + if not may_be_implemented_as_scan: + _error_if_force_scan_on(ReductionIsNotTriangularError, error) + + # {{{ sanity checks if n_local_par and n_sequential: raise LoopyError("Reduction over '%s' contains both parallel and " @@ -865,21 +1715,87 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "before code generation." 
% ", ".join(expr.inames)) - if n_sequential: - assert n_local_par == 0 - return map_reduction_seq(expr, rec, nresults, arg_dtypes, - reduction_dtypes) - elif n_local_par: - return map_reduction_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes) - else: + if n_nonlocal_par: + bad_inames = iname_classes.nonlocal_parallel + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname(s) '%s' " + "respectively tagged '%s'" + % (", ".join(bad_inames), + ", ".join(kernel.iname_to_tag[iname] + for iname in bad_inames))) + + if n_local_par == 0 and n_sequential == 0: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "empty_reduction", "Empty reduction found (no inames to reduce over). " "Eliminating.") + # We're not supposed to reduce/sum at all. (Note how this is distinct + # from an empty reduction--there is an element here, just no inames + # to reduce over. It's rather similar to an array with () shape in + # numpy.) + return expr.expr + # }}} + + if may_be_implemented_as_scan: + assert force_scan or automagic_scans_ok + + # We require the "scan" iname to be tagged sequential. + if n_sequential: + sweep_iname = scan_param.sweep_iname + sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) + + sequential = sweep_iname in sweep_class.sequential + parallel = sweep_iname in sweep_class.local_parallel + bad_parallel = sweep_iname in sweep_class.nonlocal_parallel + + if sweep_iname not in outer_insn_inames: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' was detected, but is not an iname " + "for the instruction." % sweep_iname) + elif bad_parallel: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' has an unsupported parallel tag '%s' " + "- the only parallelism allowed is 'local'." 
% + (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) + elif parallel: + return map_scan_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + elif sequential: + return map_scan_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + + # fallthrough to reduction implementation + + else: + assert n_local_par > 0 + scan_iname, = expr.inames + _error_if_force_scan_on(LoopyError, + "Scan iname '%s' is parallel tagged: this is not allowed " + "(only the sweep iname should be tagged if parallelism " + "is desired)." % scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + else: + assert n_local_par > 0 + return map_reduction_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + # }}} from loopy.symbolic import ReductionCallbackMapper @@ -985,6 +1901,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kernel = lp.tag_inames(kernel, new_iname_tags) + # TODO: remove unused inames... 
+ kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c078da2ec58dabbbf646bfcf593ea0138941cc85..57cf74b808ae1a7107e76a18a3876785ab8baabd 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1908,14 +1908,14 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if (gsize or lsize): if not kernel.options.disable_global_barriers: - logger.info("%s: barrier insertion: global" % kernel.name) + logger.debug("%s: barrier insertion: global" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, kind="global", verify_only=True) - logger.info("%s: barrier insertion: local" % kernel.name) + logger.debug("%s: barrier insertion: local" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, kind="local", verify_only=False) - logger.info("%s: barrier insertion: done" % kernel.name) + logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( schedule=gen_sched, diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5de677e72708be844a5276b3d40ace8b1dad9da0..f9b08d3434556f912107726f125abbfa110f5676 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -23,10 +23,6 @@ THE SOFTWARE. """ from loopy.kernel.data import temp_var_scope -from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, - RunInstruction, Barrier) - -from pytools import memoize_method # {{{ block boundary finder @@ -37,6 +33,7 @@ def get_block_boundaries(schedule): :class:`loopy.schedule.BlockBeginItem`s to :class:`loopy.schedule.BlockEndItem`s and vice versa. 
""" + from loopy.schedule import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] for idx, sched_item in enumerate(schedule): @@ -51,109 +48,24 @@ def get_block_boundaries(schedule): # }}} -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernels(self): - return frozenset(self.subkernel_slices.keys()) - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def temporaries_read_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].read_dependency_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_written_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].assignee_var_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_read_or_written_in_subkernel(self, subkernel): - return ( - self.temporaries_read_in_subkernel(subkernel) | - self.temporaries_written_in_subkernel(subkernel)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - subkernel_start = 
self.subkernel_slices[subkernel].start - return frozenset(self.kernel.schedule[subkernel_start].extra_inames) - - @memoize_method - def pre_and_post_barriers(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - subkernel_end = self.subkernel_slices[subkernel].stop - - def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" - - try: - pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - pre_barrier = None - - try: - post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - post_barrier = None - - return (pre_barrier, post_barrier) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), - HardwareParallelTag)) - - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. 
- if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) +# {{{ subkernel tools + +def temporaries_read_in_subkernel(kernel, subkernel): + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + + +def temporaries_written_in_subkernel(kernel, subkernel): + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].write_dependency_names() + if tv in kernel.temporary_variables) # }}} @@ -166,23 +78,27 @@ def add_extra_args_to_schedule(kernel): instructions in the schedule with global temporaries. """ new_schedule = [] - - insn_query = InstructionQuery(kernel) + from loopy.schedule import CallKernel for sched_item in kernel.schedule: if isinstance(sched_item, CallKernel): - subrange_temporaries = (insn_query - .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + subkernel = sched_item.kernel_name + + used_temporaries = ( + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) + more_args = set(tv - for tv in subrange_temporaries - if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args) + for tv in used_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + extra_args=sched_item.extra_args + sorted(more_args))) else: new_schedule.append(sched_item) diff --git 
a/loopy/statistics.py b/loopy/statistics.py index fde8643bf92b7ad56bb47975fa7ede1bda9b399c..9b15ec471fb681698b85c1dd2f92376fbc731f00 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,6 +38,7 @@ __doc__ = """ .. currentmodule:: loopy +.. autoclass:: GuardedPwQPolynomial .. autoclass:: ToCountMap .. autoclass:: Op .. autoclass:: MemAccess @@ -52,6 +53,66 @@ __doc__ = """ """ +# {{{ GuardedPwQPolynomial + +class GuardedPwQPolynomial(object): + def __init__(self, pwqpolynomial, valid_domain): + self.pwqpolynomial = pwqpolynomial + self.valid_domain = valid_domain + + def __add__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial + other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial + other, + self.valid_domain) + + __radd__ = __add__ + + def __mul__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial * other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial * other, + self.valid_domain) + + __rmul__ = __mul__ + + def eval_with_dict(self, value_dict): + space = self.pwqpolynomial.space + pt = isl.Point.zero(space.params()) + + for i in range(space.dim(dim_type.param)): + par_name = space.get_dim_name(dim_type.param, i) + pt = pt.set_coordinate_val( + dim_type.param, i, value_dict[par_name]) + + if not (isl.Set.from_point(pt) <= self.valid_domain): + raise ValueError("evaluation point outside of domain of " + "definition of piecewise quasipolynomial") + + return self.pwqpolynomial.eval(pt).to_python() + + @staticmethod + def zero(): + p = isl.PwQPolynomial('{ 0 }') + return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) + + def __str__(self): + return str(self.pwqpolynomial) + + def __repr__(self): + return repr(self.pwqpolynomial) + +# }}} + + # {{{ ToCountMap class ToCountMap(object): 
@@ -66,16 +127,17 @@ class ToCountMap(object): """ - def __init__(self, init_dict=None): + def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): if init_dict is None: init_dict = {} self.count_map = init_dict + self.val_type = val_type def __add__(self, other): result = self.count_map.copy() for k, v in six.iteritems(other.count_map): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result) + return ToCountMap(result, self.val_type) def __radd__(self, other): if other != 0: @@ -86,7 +148,7 @@ class ToCountMap(object): return self def __mul__(self, other): - if isinstance(other, isl.PwQPolynomial): + if isinstance(other, GuardedPwQPolynomial): return ToCountMap(dict( (index, self.count_map[index]*other) for index in self.keys())) @@ -101,7 +163,11 @@ class ToCountMap(object): try: return self.count_map[index] except KeyError: - return isl.PwQPolynomial('{ 0 }') + #TODO what is the best way to handle this? + if self.val_type is GuardedPwQPolynomial: + return GuardedPwQPolynomial.zero() + else: + return 0 def __setitem__(self, index, value): self.count_map[index] = value @@ -112,6 +178,9 @@ class ToCountMap(object): def __len__(self): return len(self.count_map) + def get(self, key, default=None): + return self.count_map.get(key, default) + def items(self): return self.count_map.items() @@ -122,14 +191,20 @@ class ToCountMap(object): return self.count_map.pop(item) def copy(self): - return ToCountMap(dict(self.count_map)) + return ToCountMap(dict(self.count_map), self.val_type) + + def with_set_attributes(self, **kwargs): + return ToCountMap(dict( + (key.copy(**kwargs), val) + for key, val in six.iteritems(self.count_map)), + self.val_type) def filter_by(self, **kwargs): """Remove items without specified key fields. - :parameter \*\*kwargs: Keyword arguments matching fields in the keys of - the :class:`ToCountMap`, each given a list of - allowable values for that key field. 
+ :arg kwargs: Keyword arguments matching fields in the keys of + the :class:`ToCountMap`, each given a list of + allowable values for that key field. :return: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` that match the field values @@ -149,7 +224,7 @@ class ToCountMap(object): """ - result_map = ToCountMap() + result_map = ToCountMap(val_type=self.val_type) from loopy.types import to_loopy_type if 'dtype' in kwargs.keys(): @@ -175,10 +250,10 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :parameter func: A function that takes a map key a parameter and - returns a :class:`bool`. + :arg func: A function that takes a map key a parameter and + returns a :class:`bool`. - :return: A :class:`ToCountMap` containing the subset of the items in + :arg: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` for which func(key) is true. Example usage:: @@ -197,7 +272,7 @@ class ToCountMap(object): """ - result_map = ToCountMap() + result_map = ToCountMap(val_type=self.val_type) # for each item in self.count_map, call func on the key for self_key, self_val in self.items(): @@ -210,7 +285,7 @@ class ToCountMap(object): """Group map items together, distinguishing by only the key fields passed in args. - :parameter \*args: Zero or more :class:`str` fields of map keys. + :arg args: Zero or more :class:`str` fields of map keys. :return: A :class:`ToCountMap` containing the same total counts grouped together by new keys that only contain the fields @@ -252,7 +327,7 @@ class ToCountMap(object): """ - result_map = ToCountMap() + result_map = ToCountMap(val_type=self.val_type) # make sure all item keys have same type if self.count_map: @@ -315,23 +390,36 @@ class ToCountMap(object): bytes_processed = int(key.dtype.itemsize) * val result[key] = bytes_processed + #TODO again, is this okay? 
+ result.val_type = int + return result def sum(self): """Add all counts in ToCountMap. - :return: A :class:`islpy.PwQPolynomial` containing the sum of counts. + :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of + counts. """ - total = isl.PwQPolynomial('{ 0 }') + + if self.val_type is GuardedPwQPolynomial: + total = GuardedPwQPolynomial.zero() + else: + total = 0 + for k, v in self.items(): - if not isinstance(v, isl.PwQPolynomial): - raise ValueError("ToCountMap: sum() encountered type {0} but " - "may only be used on PwQPolynomials." - .format(type(v))) total += v return total + #TODO test and document + def eval(self, params): + result = self.copy() + for key, val in self.items(): + result[key] = val.eval_with_dict(params) + result.val_type = int + return result + def eval_and_sum(self, params): """Add all counts in :class:`ToCountMap` and evaluate with provided parameter dict. @@ -364,8 +452,10 @@ def stringify_stats_mapping(m): return result +# {{{ Op descriptor + class Op(object): - """An arithmetic operation. + """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -379,6 +469,8 @@ class Op(object): """ + # FIXME: This could be done much more briefly by inheriting from Record. + def __init__(self, dtype=None, name=None): self.name = name if dtype is None: @@ -398,19 +490,15 @@ class Op(object): return hash(str(self)) def __repr__(self): - if self.dtype is None: - dtype = 'None' - else: - dtype = str(self.dtype) - if self.name is None: - name = 'None' - else: - name = self.name - return "Op("+dtype+", "+name+")" + return "Op(%s, %s)" % (self.dtype, self.name) + +# }}} +# {{{ MemAccess descriptor + class MemAccess(object): - """A memory access. + """A descriptor for a type of memory access. .. attribute:: mtype @@ -439,6 +527,8 @@ class MemAccess(object): """ + # FIXME: This could be done much more briefly by inheriting from Record. 
+ def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype @@ -461,6 +551,16 @@ class MemAccess(object): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") + def copy(self, mtype=None, dtype=None, stride=None, direction=None, + variable=None): + return MemAccess( + mtype=mtype if mtype is not None else self.mtype, + dtype=dtype if dtype is not None else self.dtype, + stride=stride if stride is not None else self.stride, + direction=direction if direction is not None else self.direction, + variable=variable if variable is not None else self.variable, + ) + def __eq__(self, other): return isinstance(other, MemAccess) and ( (self.mtype is None or other.mtype is None or @@ -501,11 +601,70 @@ class MemAccess(object): return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ + direction + ", " + variable + ")" +# }}} -# {{{ ExpressionOpCounter -class ExpressionOpCounter(CombineMapper): +# {{{ counter base + +class CounterBase(CombineMapper): + def __init__(self, knl): + self.knl = knl + from loopy.type_inference import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + def map_if(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." + % type(self).__name__) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_if_positive(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." 
+ % type(self).__name__) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_common_subexpression(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + + map_substitution = map_common_subexpression + map_derivative = map_common_subexpression + map_slice = map_common_subexpression + + # preprocessing should have removed these + def map_reduction(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + +# }}} + + +# {{{ ExpressionOpCounter +class ExpressionOpCounter(CounterBase): def __init__(self, knl): self.knl = knl from loopy.type_inference import TypeInferenceMapper @@ -620,106 +779,59 @@ class ExpressionOpCounter(CombineMapper): # }}} -# {{{ LocalSubscriptCounter - -class LocalSubscriptCounter(CombineMapper): - - def __init__(self, knl): - self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) - - def combine(self, values): - return sum(values) - - def map_constant(self, expr): - return ToCountMap() +class MemAccessCounter(CounterBase): + pass - map_tagged_variable = map_constant - map_variable = map_constant - def map_call(self, expr): - return self.rec(expr.parameters) +# {{{ LocalMemAccessCounter - def map_subscript(self, expr): +class LocalMemAccessCounter(MemAccessCounter): + def count_var_access(self, dtype, name, subscript): sub_map = ToCountMap() - name = expr.aggregate.name # name of array if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if array.is_local: - sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 - return sub_map + self.rec(expr.index) - - def map_sum(self, expr): - if expr.children: - return sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() - - map_product = map_sum - - def map_comparison(self, expr): - return 
self.rec(expr.left)+self.rec(expr.right) - - def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_common_subexpression(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") + sub_map[MemAccess(mtype='local', dtype=dtype)] = 1 + return sub_map - def map_substitution(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") + def map_variable(self, expr): + return self.count_var_access( + self.type_inf(expr), expr.name, None) - def map_derivative(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "derivative, " - "map_derivative not implemented.") + map_tagged_variable = map_variable - def map_slice(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered slice, " - "map_slice not implemented.") + def map_subscript(self, expr): + return ( + self.count_var_access( + self.type_inf(expr), expr.aggregate.name, expr.index) + + self.rec(expr.index)) # }}} -# {{{ GlobalSubscriptCounter - -class GlobalSubscriptCounter(CombineMapper): +# {{{ GlobalMemAccessCounter - def __init__(self, knl): - self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) - - def combine(self, values): - return sum(values) +class GlobalMemAccessCounter(MemAccessCounter): + def map_variable(self, expr): + name = expr.name - def map_constant(self, 
expr): - return ToCountMap() + if name in self.knl.arg_dict: + array = self.knl.arg_dict[name] + else: + # this is a temporary variable + return ToCountMap() - map_tagged_variable = map_constant - map_variable = map_constant + if not isinstance(array, lp.GlobalArg): + # this array is not in global memory + return ToCountMap() - def map_call(self, expr): - return self.rec(expr.parameters) + return ToCountMap({MemAccess(mtype='global', + dtype=self.type_inf(expr), stride=0, + variable=name): 1} + ) + self.rec(expr.index) def map_subscript(self, expr): - name = expr.aggregate.name # name of array + name = expr.aggregate.name if name in self.knl.arg_dict: array = self.knl.arg_dict[name] @@ -806,47 +918,6 @@ class GlobalSubscriptCounter(CombineMapper): stride=total_stride, variable=name): 1} ) + self.rec(expr.index) - def map_sum(self, expr): - if expr.children: - return sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() - - map_product = map_sum - - def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_gsubs", - "GlobalSubscriptCounter counting GMEM accesses as " - "sum of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs", - "GlobalSubscriptCounter counting GMEM accesses as " - "sum of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_common_subexpression(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") - - def map_substitution(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") - - def map_derivative(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered " - "derivative, " - "map_derivative 
not implemented.") - - def map_slice(self, expr): - raise NotImplementedError("GlobalSubscriptCounter encountered slice, " - "map_slice not implemented.") - # }}} @@ -919,9 +990,13 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count -def count(kernel, set): +def add_assumptions_guard(kernel, pwqpolynomial): + return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + + +def count(kernel, set, space=None): try: - return set.card() + return add_assumptions_guard(kernel, set.card()) except AttributeError: pass @@ -948,7 +1023,11 @@ def count(kernel, set): if stride is None: stride = 1 - length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride) + length_pwaff = dmax - dmin + stride + if space is not None: + length_pwaff = length_pwaff.align_params(space) + + length = isl.PwQPolynomial.from_pw_aff(length_pwaff) length = length.scale_down_val(stride) if bset_count is None: @@ -1008,46 +1087,102 @@ def count(kernel, set): "number of integer points in your loop " "domain.") - return count + return add_assumptions_guard(kernel, count) -# }}} +def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): + # FIXME: Multi-kernel support + gsize, lsize = knl.get_grid_size_upper_bounds() -# {{{ get_op_poly + g_used = set() + l_used = set() -def get_op_poly(knl, numpy_types=True): + from loopy.kernel.data import LocalIndexTag, GroupIndexTag + for iname in knl.insn_inames(insn): + tag = knl.iname_to_tag.get(iname) - """Count the number of operations in a loopy kernel. + if isinstance(tag, LocalIndexTag): + l_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + g_used.add(tag.axis) - get_op_poly is deprecated. Use get_op_map instead. + def mult_grid_factor(used_axes, size): + result = 1 + for iaxis, size in enumerate(size): + if iaxis not in used_axes: + if not isinstance(size, int): + if space is not None: + size = size.align_params(space) - """ - warn_with_kernel(knl, "depricated_get_op_poly", - "get_op_poly is deprecated. 
Use get_op_map instead.") - return get_op_map(knl, numpy_types) + size = isl.PwQPolynomial.from_pw_aff(size) + + result = result * size + + return result + + if disregard_local_axes: + result = mult_grid_factor(g_used, gsize) + else: + result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) + + return add_assumptions_guard(knl, result) + + +def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): + insn_inames = knl.insn_inames(insn) + + if disregard_local_axes: + from loopy.kernel.data import LocalIndexTag + insn_inames = [iname for iname in insn_inames if not + isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + + inames_domain = knl.get_inames_domain(insn_inames) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) + + space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, + set=[], params=knl.outer_params()) + + c = count(knl, domain, space=space) + + if count_redundant_work: + unused_fac = get_unused_hw_axes_factor(knl, insn, + disregard_local_axes=disregard_local_axes, + space=space) + return c * unused_fac + else: + return c # }}} -def get_op_map(knl, numpy_types=True): +# {{{ get_op_map + +def get_op_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of operations in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. + :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. + + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. 
+ (Likely desirable for performance modeling, but undesirable for + code optimization.) :return: A :class:`ToCountMap` of **{** :class:`Op` **:** - :class:`islpy.PwQPolynomial` **}**. + :class:`islpy.PwQPolynomial` **}**. - - The :class:`Op` specifies the characteristics of the arithmetic - operation. + - The :class:`Op` specifies the characteristics of the arithmetic + operation. - - The :class:`islpy.PwQPolynomial` holds the number of operations of - the kind specified in the key (in terms of the - :class:`loopy.LoopKernel` parameter *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of operations of + the kind specified in the key (in terms of the + :class:`loopy.LoopKernel` parameter *inames*). Example usage:: @@ -1069,14 +1204,10 @@ def get_op_map(knl, numpy_types=True): op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: - # how many times is this instruction executed? - # check domain size: - insn_inames = knl.insn_inames(insn) - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count(knl, domain) + op_map = op_map + ops*count_insn_runs( + knl, insn, + count_redundant_work=count_redundant_work) if numpy_types: op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), @@ -1085,73 +1216,36 @@ def get_op_map(knl, numpy_types=True): return op_map - -#TODO test deprecated functions? -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "depricated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. 
Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -# {{{ get_gmem_access_poly - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - # }}} -def get_mem_access_map(knl, numpy_types=True): +# {{{ get_mem_access_map + +def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of memory accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. + + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. 
This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. + :class:`islpy.PwQPolynomial` **}**. - - The :class:`MemAccess` specifies the characteristics of the - memory access. + - The :class:`MemAccess` specifies the characteristics of the + memory access. - - The :class:`islpy.PwQPolynomial` holds the number of memory - accesses with the characteristics specified in the key (in terms - of the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1196,102 +1290,74 @@ def get_mem_access_map(knl, numpy_types=True): cache_holder = CacheHolder() @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_inames, uniform=False): - if uniform: - from loopy.kernel.data import LocalIndexTag - insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) - return count(knl, domain) + def get_insn_count(knl, insn_id, uniform=False): + insn = knl.id_to_insn[insn_id] + return count_insn_runs( + knl, insn, disregard_local_axes=uniform, + count_redundant_work=count_redundant_work) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_map = ToCountMap() - subs_counter_g = GlobalSubscriptCounter(knl) - subs_counter_l = LocalSubscriptCounter(knl) + access_map = ToCountMap() + access_counter_g = GlobalMemAccessCounter(knl) + access_counter_l = LocalMemAccessCounter(knl) for insn in knl.instructions: - # count subscripts - subs_expr = subs_counter_g(insn.expression) 
\ - + subs_counter_l(insn.expression) - - # distinguish loads and stores - for key in subs_expr.count_map: - subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, direction='load', - variable=key.variable) - ] = subs_expr.pop(key) - - subs_assignee_g = subs_counter_g(insn.assignee) - for key in subs_assignee_g.count_map: - subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, - direction='store', - variable=key.variable) - ] = subs_assignee_g.pop(key) - # for now, don't count writes to local mem - - insn_inames = knl.insn_inames(insn) + access_expr = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + + access_assignee_g = access_counter_g(insn.assignee).with_set_attributes( + direction="store") + + # FIXME: (!!!!) for now, don't count writes to local mem # use count excluding local index tags for uniform accesses - for key in subs_expr.count_map: - map = ToCountMap({key: subs_expr[key]}) - if (key.mtype == 'global' and + for key, val in six.iteritems(access_expr.count_map): + is_uniform = (key.mtype == 'global' and isinstance(key.stride, int) and - key.stride == 0): - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) - #currently not counting stride of local mem access - - for key in subs_assignee_g.count_map: - map = ToCountMap({key: subs_assignee_g[key]}) - if isinstance(key.stride, int) and key.stride == 0: - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) + key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) + #currently not counting stride of local mem access + + for key, val in six.iteritems(access_assignee_g.count_map): + is_uniform = (key.mtype == 'global' and + isinstance(key.stride, int) and + 
key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) # for now, don't count writes to local mem if numpy_types: - subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype, + # FIXME: Don't modify in-place + access_map.count_map = dict((MemAccess(mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable), count) - for mem_access, count in six.iteritems(subs_map.count_map)) - - return subs_map - - -# {{{ get_synchronization_poly - -def get_synchronization_poly(knl): - """Count the number of synchronization events each thread encounters in a - loopy kernel. + for mem_access, count in six.iteritems(access_map.count_map)) - get_synchronization_poly is deprecated. Use get_synchronization_map instead. - - """ - warn_with_kernel(knl, "depricated_get_synchronization_poly", - "get_synchronization_poly is deprecated. Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) + return access_map # }}} +# {{{ get_synchronization_map + def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per @@ -1358,9 +1424,10 @@ def get_synchronization_map(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - #return result.count_map #TODO is this change okay? return result +# }}} + # {{{ gather_access_footprints @@ -1456,4 +1523,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): # }}} + +# {{{ compat goop + +def get_lmem_access_poly(knl): + """Count the number of local memory accesses in a loopy kernel. 
+ + get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['local'] option. + + """ + warn_with_kernel(knl, "deprecated_get_lmem_access_poly", + "get_lmem_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['local'] option.") + return get_mem_access_map(knl).filter_by(mtype=['local']) + + +def get_DRAM_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_gmem_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_gmem_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_synchronization_poly(knl): + """Count the number of synchronization events each thread encounters in a + loopy kernel. + + get_synchronization_poly is deprecated. Use get_synchronization_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_synchronization_poly", + "get_synchronization_poly is deprecated. Use " + "get_synchronization_map instead.") + return get_synchronization_map(knl) + + +def get_op_poly(knl, numpy_types=True): + """Count the number of operations in a loopy kernel. + + get_op_poly is deprecated. Use get_op_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_op_poly", + "get_op_poly is deprecated. 
Use get_op_map instead.") + return get_op_map(knl, numpy_types) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 68cc32e56be077c7e45d11b9e2aade86b04494cc..8f924d3aee3b9f2982006fdb7b558cccac6785e3 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -167,6 +167,13 @@ class ExpressionToCExpressionMapper(IdentityMapper): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] + def make_var(name): + from loopy import TaggedVariable + if isinstance(expr.aggregate, TaggedVariable): + return TaggedVariable(name, expr.aggregate.tag) + else: + return var(name) + from pymbolic.primitives import Variable if not isinstance(expr.aggregate, Variable): return base_impl(expr, type_context) @@ -224,16 +231,16 @@ class ExpressionToCExpressionMapper(IdentityMapper): (isinstance(ary, (ConstantArg, GlobalArg)) or (isinstance(ary, TemporaryVariable) and ary.base_storage))): # unsubscripted global args are pointers - result = var(access_info.array_name)[0] + result = make_var(access_info.array_name)[0] else: # unsubscripted temp vars are scalars # (unless they use base_storage) - result = var(access_info.array_name) + result = make_var(access_info.array_name) else: subscript, = access_info.subscripts - result = var(access_info.array_name)[self.rec(subscript, 'i')] + result = make_var(access_info.array_name)[self.rec(subscript, 'i')] if access_info.vector_index is not None: return self.codegen_state.ast_builder.add_vector_access( @@ -716,6 +723,8 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_variable(self, expr, enclosing_prec): return expr.name + map_tagged_variable = map_variable + def map_lookup(self, expr, enclosing_prec): return self.parenthesize_if_needed( "%s.%s" % ( diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 
80a69bd00c99258b709ea18b2a716c339b888b02..35dade90494906b61aad9eb66e7271f2c5d1e180 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -32,6 +32,7 @@ from loopy.diagnostic import LoopyError from loopy.symbolic import Literal from pymbolic import var import pymbolic.primitives as p +from loopy.kernel.data import temp_var_scope from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method @@ -81,7 +82,6 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) - from loopy.kernel.data import temp_var_scope if tv is not None and tv.scope == temp_var_scope.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also @@ -101,7 +101,10 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ary = self.find_array(expr) - if isinstance(ary, TemporaryVariable): + if (isinstance(ary, TemporaryVariable) + and ary.scope == temp_var_scope.PRIVATE): + # generate access code for access to private-index temporaries + gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: lsize, = lsize @@ -305,7 +308,6 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape - from loopy.kernel.data import temp_var_scope if temp_var.scope == temp_var_scope.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? 
(See also diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 467bc8ee801a090641c7e7f8d7f2e7c12a921232..f24b115fd5a35af94e4a6d437550bccf86b5bee0 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -335,6 +335,8 @@ class PyOpenCLTarget(OpenCLTarget): % dev_id) def preprocess(self, kernel): + if self.device is not None: + kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel def pre_codegen_check(self, kernel): @@ -363,14 +365,26 @@ class PyOpenCLTarget(OpenCLTarget): raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor) def is_vector_dtype(self, dtype): - from pyopencl.array import vec + try: + import pyopencl.cltypes as cltypes + vec_types = cltypes.vec_types + except ImportError: + from pyopencl.array import vec + vec_types = vec.types + return (isinstance(dtype, NumpyType) - and dtype.numpy_dtype in list(vec.types.values())) + and dtype.numpy_dtype in list(vec_types.values())) def vector_dtype(self, base, count): - from pyopencl.array import vec + try: + import pyopencl.cltypes as cltypes + vec_types = cltypes.vec_types + except ImportError: + from pyopencl.array import vec + vec_types = vec.types + return NumpyType( - vec.types[base.numpy_dtype, count], + vec_types[base.numpy_dtype, count], target=self) def alignment_requirement(self, type_decl): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index df2067bfa352c9e6a66e74d280c3c14b8924036c..0da502fbab8aa45ed58a0491e0f43323ecf9aeff 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -26,8 +26,8 @@ from six.moves import range, zip from pytools import memoize_method from pytools.py_codegen import Indentation -from loopy.target.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase, - _KernelInfo, _Kernels) +from loopy.target.execution import ( + KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels) import logging logger = 
logging.getLogger(__name__) diff --git a/loopy/target/python.py b/loopy/target/python.py index 99ec42f44b49f546cda324dfdb3c6a5b001d2222..11951abcf17e94c0fdba51042e3060735215b423 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -133,15 +133,17 @@ class ExpressionToPythonMapper(StringifyMapper): def map_if(self, expr, enclosing_prec): # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the # operator precedence hierarchy (right above "or"). - from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + from pymbolic.mapper.stringifier import PREC_LOGICAL_OR PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 # noqa return self.parenthesize_if_needed( "{then} if {cond} else {else_}".format( - then=self.rec(expr.then, PREC_IFTHENELSE), - cond=self.rec(expr.condition, PREC_IFTHENELSE), - else_=self.rec(expr.else_, PREC_IFTHENELSE)), - enclosing_prec, PREC_NONE) + # "1 if 0 if 1 else 2 else 3" is not valid Python. + # So force parens by using an artificially higher precedence. 
+ then=self.rec(expr.then, PREC_LOGICAL_OR), + cond=self.rec(expr.condition, PREC_LOGICAL_OR), + else_=self.rec(expr.else_, PREC_LOGICAL_OR)), + enclosing_prec, PREC_IFTHENELSE) # }}} @@ -257,7 +259,7 @@ class PythonASTBuilderBase(ASTBuilderBase): lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For return For( @@ -265,7 +267,7 @@ class PythonASTBuilderBase(ASTBuilderBase): "range(%s, %s + 1)" % ( ecm(lbound, PREC_NONE, "i"), - ecm(ubound, PREC_NONE, "i"), + ecm(ubound, PREC_SUM, "i"), ), inner) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index a19e06ecdf7c9966501ebb9600ea4e01614363f4..6077332c4fc4322ac7ffb02ade4a0e24c7066245 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -681,12 +681,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, dt, dim_idx = var_dict[primed_non1_saxis_names[i]] mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis) + def add_assumptions(d): + assumption_non_param = isl.BasicSet.from_params(kernel.assumptions) + assumptions, domain = isl.align_two(assumption_non_param, d) + return assumptions & domain + # {{{ check that we got the desired domain - check_domain = check_domain.project_out_except( - primed_non1_saxis_names, [isl.dim_type.set]) + check_domain = add_assumptions( + check_domain.project_out_except( + primed_non1_saxis_names, [isl.dim_type.set])) - mod_check_domain = mod_domain + mod_check_domain = add_assumptions(mod_domain) # re-add the prime from the new variable var_dict = mod_check_domain.get_var_dict(isl.dim_type.set) @@ -716,10 +722,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # project out the new names from the modified domain orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set)) - mod_check_domain = mod_domain.project_out_except( - 
orig_domain_inames, [isl.dim_type.set]) + mod_check_domain = add_assumptions( + mod_domain.project_out_except( + orig_domain_inames, [isl.dim_type.set])) - check_domain = domch.domain + check_domain = add_assumptions(domch.domain) mod_check_domain, check_domain = isl.align_two( mod_check_domain, check_domain) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 8afc1695a38a37baf165f6ec6ef6567e2012173b..3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -25,6 +25,7 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError import loopy as lp +import six from loopy.kernel.data import auto, temp_var_scope from pytools import memoize_method, Record @@ -32,7 +33,7 @@ from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) -from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) +from loopy.schedule.tools import get_block_boundaries import logging @@ -135,7 +136,7 @@ class LivenessAnalysis(object): @memoize_method def liveness(self): - logging.info("running liveness analysis") + logger.info("running liveness analysis") successors = self.get_successor_relation() gen, kill = self.get_gen_and_kill_sets() @@ -152,7 +153,7 @@ class LivenessAnalysis(object): lr[idx].live_out.update(lr[succ].live_in) lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx]) - logging.info("done running liveness analysis") + logger.info("done running liveness analysis") return lr @@ -193,13 +194,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. + The name of original temporary variable object. .. 
attribute:: hw_dims @@ -207,6 +204,10 @@ class TemporarySaver(object): of the promoted temporary value, corresponding to hardware dimensions + .. attribute:: hw_tags + + The tags for the inames associated with hw_dims + .. attribute:: non_hw_dims A list of expressions, to be added in front of the shape @@ -214,9 +215,15 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = """ + name + orig_temporary_name + hw_dims + hw_tags + non_hw_dims""".split() + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -230,16 +237,215 @@ class TemporarySaver(object): def __init__(self, kernel): self.kernel = kernel - self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. 
self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - self.saves_or_reloads_added = {} + + # temporary name -> save or reload insn ids + from collections import defaultdict + self.temporary_to_save_ids = defaultdict(set) + self.temporary_to_reload_ids = defaultdict(set) + self.subkernel_to_newly_added_insn_ids = defaultdict(set) + + # Maps names of base_storage to the name of the temporary + # representative chosen for saves/reloads + self.base_storage_to_representative = {} + + from loopy.kernel.data import ValueArg + import islpy as isl + self.new_subdomain = ( + isl.BasicSet.universe( + isl.Space.create_from_names( + isl.DEFAULT_CONTEXT, + set=[], + params=set( + arg.name for arg in kernel.args + if isinstance(arg, ValueArg))))) + + def find_accessing_instructions_in_subkernel(self, temporary, subkernel): + # Find all accessing instructions in the subkernel. If base_storage is + # present, this includes instructions that access aliasing memory. 
+ + aliasing_names = set([temporary]) + base_storage = self.kernel.temporary_variables[temporary].base_storage + + if base_storage is not None: + aliasing_names |= self.base_storage_to_temporary_map[base_storage] + + from loopy.kernel.tools import get_subkernel_to_insn_id_map + accessing_insns_in_subkernel = set() + subkernel_insns = get_subkernel_to_insn_id_map(self.kernel)[subkernel] + + for name in aliasing_names: + try: + accessing_insns_in_subkernel |= ( + self.kernel.reader_map()[name] & subkernel_insns) + except KeyError: + pass + + try: + accessing_insns_in_subkernel |= ( + self.kernel.writer_map()[name] & subkernel_insns) + except KeyError: + pass + + return frozenset(accessing_insns_in_subkernel) + + @property + @memoize_method + def base_storage_to_temporary_map(self): + from collections import defaultdict + + result = defaultdict(set) + + for temporary in six.itervalues(self.kernel.temporary_variables): + if temporary.base_storage is None: + continue + result[temporary.base_storage].add(temporary.name) + + return result + + @property + @memoize_method + def subkernel_to_slice_indices(self): + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + start_idx = sched_item_idx + elif isinstance(sched_item, ReturnFromKernel): + result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx) + + return result + + @property + @memoize_method + def subkernel_to_surrounding_inames(self): + current_outer_inames = set() + within_subkernel = False + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + within_subkernel = True + result[sched_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + elif isinstance(sched_item, EnterLoop): + if not within_subkernel: + current_outer_inames.add(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + 
current_outer_inames.discard(sched_item.iname) + + return result + + @memoize_method + def get_enclosing_global_barrier_pair(self, subkernel): + subkernel_start, subkernel_end = ( + self.subkernel_to_slice_indices[subkernel]) + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): + """ + This is used for determining the amount of global storage needed for saving + and restoring the temporary across kernel calls, due to hardware + parallel inames (the inferred axes get prefixed to the number of + dimensions in the temporary). + + In the case of local temporaries, inames that are tagged + hw-local do not contribute to the global storage shape. 
+ """ + accessor_insn_ids = frozenset( + self.kernel.reader_map()[temporary.name] + | self.kernel.writer_map()[temporary.name]) + + group_tags = None + local_tags = None + + def _sortedtags(tags): + return sorted(tags, key=lambda tag: tag.axis) + + for insn_id in accessor_insn_ids: + insn = self.kernel.id_to_insn[insn_id] + + my_group_tags = [] + my_local_tags = [] + + for iname in insn.within_inames: + tag = self.kernel.iname_to_tag.get(iname) + + if tag is None: + continue + + from loopy.kernel.data import ( + GroupIndexTag, LocalIndexTag, ParallelTag) + + if isinstance(tag, GroupIndexTag): + my_group_tags.append(tag) + elif isinstance(tag, LocalIndexTag): + my_local_tags.append(tag) + elif isinstance(tag, ParallelTag): + raise LoopyError( + "iname '%s' is tagged with '%s' - only " + "group and local tags are supported for " + "auto save/reload of temporaries" % + (iname, tag)) + + if group_tags is None: + group_tags = _sortedtags(my_group_tags) + local_tags = _sortedtags(my_local_tags) + group_tags_originating_insn_id = insn_id + + if ( + group_tags != _sortedtags(my_group_tags) + or local_tags != _sortedtags(my_local_tags)): + raise LoopyError( + "inconsistent parallel tags across instructions that access " + "'%s' (specifically, instruction '%s' has tags '%s' but " + "instruction '%s' has tags '%s')" + % (temporary.name, + group_tags_originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) + + if group_tags is None: + assert local_tags is None + return (), () + + group_sizes, local_sizes = ( + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + + if temporary.scope == lp.temp_var_scope.LOCAL: + # Elide local axes in the save slot for local temporaries. + del local_tags[:] + local_sizes = () + + # We set hw_dims to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... 
+ return (group_sizes + local_sizes), tuple(group_tags + local_tags) @memoize_method def auto_promote_temporary(self, temporary_name): @@ -255,52 +461,16 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = self.insn_query.common_hw_inames( - self.insn_query.insns_reading_or_writing(temporary.name)) - - # We want hw_inames to be arranged according to the order: - # g.0 < g.1 < ... < l.0 < l.1 < ... - # Sorting lexicographically accomplishes this. - hw_inames = sorted(hw_inames, - key=lambda iname: str(self.kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - hw_dims = [] - - backing_hw_inames = [] - - for iname in hw_inames: - tag = self.kernel.iname_to_tag[iname] - from loopy.kernel.data import LocalIndexTag - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. 
- continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - hw_dims.append( - aff_to_expr( - static_max_of_pw_aff( - self.kernel.get_iname_bounds(iname).size, False))) + base_storage_conflict = ( + self.base_storage_to_representative.get( + temporary.base_storage, temporary) is not temporary) + if base_storage_conflict: + raise NotImplementedError( + "tried to save/reload multiple temporaries with the " + "same base_storage; this is currently not supported") + + hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape if len(non_hw_dims) == 0 and len(hw_dims) == 0: @@ -309,10 +479,14 @@ class TemporarySaver(object): backing_temporary = self.PromotedTemporary( name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, - hw_dims=tuple(hw_dims), - non_hw_dims=non_hw_dims, - hw_inames=backing_hw_inames) + orig_temporary_name=temporary.name, + hw_dims=hw_dims, + hw_tags=hw_tags, + non_hw_dims=non_hw_dims) + + if temporary.base_storage is not None: + self.base_storage_to_representative[temporary.base_storage] = ( + backing_temporary) return backing_temporary @@ -326,23 +500,16 @@ class TemporarySaver(object): if promoted_temporary is None: return - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger( - self.kernel, - frozenset( - self.insn_query.inames_in_subkernel(subkernel) | - set(promoted_temporary.hw_inames))) - - domain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = ( self.augment_domain_for_save_or_reload( - dchg.domain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel)) - self.kernel = dchg.get_kernel_with(domain) + self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( "{name}.{mode}".format(name=temporary, mode=mode)) - def subscript_or_var(agg, subscript=()): + def 
add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -351,20 +518,22 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( - subscript_or_var( - temporary, dim_inames_trunc), - subscript_or_var( - promoted_temporary.name, hw_inames + dim_inames)) + add_subscript_if_subscript_nonempty( + temporary, subscript=dim_inames_trunc), + add_subscript_if_subscript_nonempty( + promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": args = reversed(args) - accessing_insns_in_subkernel = ( - self.insn_query.insns_reading_or_writing(temporary) & - self.insn_query.insns_in_subkernel(subkernel)) + accessing_insns_in_subkernel = self.find_accessing_instructions_in_subkernel( + temporary, subkernel) if mode == "save": depends_on = accessing_insns_in_subkernel @@ -373,7 +542,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) @@ -387,16 +556,19 @@ class TemporarySaver(object): *args, id=save_or_load_insn_id, within_inames=( - self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames)), + self.subkernel_to_surrounding_inames[subkernel] + | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, boostable=False, boostable_into=frozenset()) - if temporary not in self.saves_or_reloads_added: - self.saves_or_reloads_added[temporary] = set() - 
self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + if mode == "save": + self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) + else: + self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id) + + self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id) self.insns_to_insert.append(save_or_load_insn) @@ -405,8 +577,8 @@ class TemporarySaver(object): self.insns_to_update[insn_id] = insn.copy( depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) - self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + self.updated_temporary_variables[promoted_temporary.name] = ( + promoted_temporary.as_kernel_temporary(self.kernel)) self.updated_iname_to_tag.update(iname_to_tag) @@ -416,15 +588,6 @@ class TemporarySaver(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and saves - from six import iteritems - for temporary, added_insns in iteritems(self.saves_or_reloads_added): - for insn_id in added_insns: - insn = insns_to_insert[insn_id] - insns_to_insert[insn_id] = insn.copy( - no_sync_with=frozenset( - (added_insn, "global") for added_insn in added_insns)) - for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) @@ -436,12 +599,31 @@ class TemporarySaver(object): self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) + new_domains = list(self.kernel.domains) + import islpy as isl + if self.new_subdomain.dim(isl.dim_type.set) > 0: + new_domains.append(self.new_subdomain) + kernel = self.kernel.copy( + domains=new_domains, instructions=new_instructions, iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) + # Add nosync directives to any saves 
or reloads that were added with a + # potential dependency chain. + from loopy.kernel.tools import get_subkernels + for subkernel in get_subkernels(kernel): + relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel] + + from itertools import product + for temporary in self.temporary_to_reload_ids: + for source, sink in product( + relevant_insns & self.temporary_to_reload_ids[temporary], + relevant_insns & self.temporary_to_save_ids[temporary]): + kernel = lp.add_nosync(kernel, "global", source, sink) + from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) @@ -456,22 +638,28 @@ class TemporarySaver(object): """ Add new axes to the domain corresponding to the dimensions of `promoted_temporary`. These axes will be used in the save/ - reload stage. + reload stage. These get prefixed onto the already existing axes. """ assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames iname_to_tag = {} + from loopy.symbolic import aff_from_expr + # FIXME: Restrict size of new inames to access footprint. # Add dimension-dependent inames. dim_inames = [] - domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + domain = domain.add(isl.dim_type.set, + len(promoted_temporary.non_hw_dims) + + len(promoted_temporary.hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". @@ -493,25 +681,31 @@ class TemporarySaver(object): # Add size information. 
aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) - # FIXME: Use promoted_temporary.hw_inames - hw_inames = [] + dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + hw_inames = [] + # Add hardware dims. + for hw_iname_idx, (hw_tag, dim) in enumerate( + zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)): new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=t_idx, + dim=hw_iname_idx, sk=subkernel)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + domain = domain.set_dim_name( + isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) + aff = isl.affs_from_space(domain.space) + domain = (domain + & + aff[0].le_set(aff[new_iname]) + & + aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) + + self.updated_iname_to_tag[new_iname] = hw_tag + hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the # underlying domain should be expressible as a single BasicSet. 
@@ -551,7 +745,8 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - insn_query = InstructionQuery(knl) + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, temporaries_written_in_subkernel) for sched_idx, sched_item in enumerate(knl.schedule): @@ -562,9 +757,10 @@ def save_and_reload_temporaries(knl): # Kernel entry: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -576,9 +772,9 @@ def save_and_reload_temporaries(knl): # Kernel exit: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {0} before return of {1}" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b8b0cbcbf1236cdf712da998922ac238261a3e6e..78d817ce73724d90a6cc6f380b24290971f6c1e7 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -332,7 +332,20 @@ class TypeInferenceMapper(CombineMapper): if not agg_result: return agg_result - field = agg_result[0].numpy_dtype.fields[expr.name] + numpy_dtype = agg_result[0].numpy_dtype + fields = numpy_dtype.fields + if fields is None: + raise LoopyError("cannot look up attribute '%s' in " + "non-aggregate expression '%s'" + % (expr.aggregate, expr.name)) + + try: + field = fields[expr.name] + except KeyError: + raise LoopyError("cannot look up attribute '%s' in " + "aggregate expression '%s' of dtype '%s'" + % 
(expr.aggregate, expr.name, numpy_dtype)) + dtype = field[0] return [NumpyType(dtype)] diff --git a/loopy/version.py b/loopy/version.py index 77d0e21bdd2ef5383c5f874656c25fe1ede21a70..02244f55d0dbf207a4641c3ebf6cc33b536f0421 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index a941eecd2b58daf413830fc22500179d3e8a8cf1..67d943af3be4446834bf7262a91b8596b601ca85 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2016.2.6", + "pytools>=2017.3", "pymbolic>=2016.2", "genpy>=2016.1.2", "cgen>=2016.1", diff --git a/test/test_loopy.py b/test/test_loopy.py index c9328a252b708770e6f83590275be92e0c5d8b05..2ac1026c0660573d97cd2f65c0502ec69b63803d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1146,7 +1146,7 @@ def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): 1/0 _, (out,) = knl(queue, out_host=True) - assert (out == out_expect).all() + assert (out == out_expect).all(), (out, out_expect) @pytest.mark.parametrize("hw_loop", [True, False]) @@ -1338,6 +1338,73 @@ def test_save_local_multidim_array(ctx_factory, debug=False): save_and_reload_temporaries_test(queue, knl, 1, debug) +def test_save_with_base_storage(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + <>a[i] = 0 + <>b[i] = i + ... 
gbarrier + out[i] = a[i] + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.set_temporary_scope(knl, "a", "local") + knl = lp.set_temporary_scope(knl, "b", "local") + + knl = lp.alias_temporaries(knl, ["a", "b"], + synchronize_for_exclusive_use=False) + + save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) + + +def test_save_ambiguous_storage_requirements(): + knl = lp.make_kernel( + "{[i,j]: 0 <= i < 10 and 0 <= j < 10}", + """ + <>a[j] = j + ... gbarrier + out[i,j] = a[j] + """, + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) + knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"}) + knl = lp.set_temporary_scope(knl, "a", "local") + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.save_and_reload_temporaries(knl) + + +def test_save_across_inames_with_same_tag(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + <>a[i] = i + ... 
gbarrier + out[i] = a[i] + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"}) + + save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) + + def test_missing_temporary_definition_detection(): knl = lp.make_kernel( "{ [i]: 0<=i<10 }", @@ -2117,6 +2184,41 @@ def test_barrier_insertion_near_bottom_of_loop(): assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) +def test_multi_argument_reduction_type_inference(): + from loopy.type_inference import TypeInferenceMapper + from loopy.library.reduction import SegmentedSumReductionOperation + from loopy.types import to_loopy_type + op = SegmentedSumReductionOperation() + + knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "") + + int32 = to_loopy_type(np.int32) + + expr = lp.symbolic.Reduction( + operation=op, + inames=("i",), + expr=lp.symbolic.Reduction( + operation=op, + inames="j", + expr=(1, 2), + allow_simultaneous=True), + allow_simultaneous=True) + + t_inf_mapper = TypeInferenceMapper(knl) + + assert ( + t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) + == [(int32, int32)]) + + +def test_multi_argument_reduction_parsing(): + from loopy.symbolic import parse, Reduction + + assert isinstance( + parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr, + Reduction) + + def test_global_barrier_order_finding(): knl = lp.make_kernel( "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}", @@ -2161,41 +2263,6 @@ def test_global_barrier_error_if_unordered(): lp.get_global_barrier_order(knl) -def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper - from loopy.library.reduction import SegmentedSumReductionOperation - from loopy.types import to_loopy_type - op = SegmentedSumReductionOperation() - - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "") - - int32 = to_loopy_type(np.int32) - - expr = lp.symbolic.Reduction( - 
operation=op, - inames=("i",), - expr=lp.symbolic.Reduction( - operation=op, - inames="j", - expr=(1, 2), - allow_simultaneous=True), - allow_simultaneous=True) - - t_inf_mapper = TypeInferenceMapper(knl) - - assert ( - t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) - == [(int32, int32)]) - - -def test_multi_argument_reduction_parsing(): - from loopy.symbolic import parse, Reduction - - assert isinstance( - parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr, - Reduction) - - def test_struct_assignment(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -2231,6 +2298,43 @@ def test_struct_assignment(ctx_factory): knl(queue, N=200) +def test_inames_conditional_generation(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i,j,k]: 0 < k < i and 0 < j < 10 and 0 < i < 10}", + """ + for k + ... gbarrier + <>tmp1 = 0 + end + for j + ... gbarrier + <>tmp2 = i + end + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="g.0")) + + with cl.CommandQueue(ctx) as queue: + knl(queue) + + +def test_kernel_var_name_generator(): + knl = lp.make_kernel( + "{[i]: 0 <= i <= 10}", + """ + <>a = 0 + <>b_s0 = 0 + """) + + vng = knl.get_var_name_generator() + + assert vng("a_s0") != "a_s0" + assert vng("b") != "b" + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_scan.py b/test/test_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..08754819c9a156403aba689cb3e9c238144e7905 --- /dev/null +++ b/test/test_scan.py @@ -0,0 +1,432 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2016, 2017 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# More things to test. 
+# - scan(a) + scan(b) +# - test for badly tagged inames + +@pytest.mark.parametrize("n", [1, 2, 3, 16]) +@pytest.mark.parametrize("stride", [1, 2]) +def test_sequential_scan(ctx_factory, n, stride): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride, + """ + a[i] = sum(j, j**2) + """ + ) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.realize_reduction(knl, force_scan=True) + + evt, (a,) = knl(queue) + + assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all() + + +@pytest.mark.parametrize("sweep_lbound, scan_lbound", [ + (4, 0), + (3, 1), + (2, 2), + (1, 3), + (0, 4), + (5, -1), + ]) +def test_scan_with_different_lower_bound_from_sweep( + ctx_factory, sweep_lbound, scan_lbound): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n, sweep_lbound, scan_lbound] -> " + "{[i,j]: sweep_lbound<=i<n+sweep_lbound " + "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}", + """ + out[i-sweep_lbound] = sum(j, j**2) + """ + ) + + n = 10 + + knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (out,) = knl(queue, n=n) + + assert (out.get() + == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all() + + +def test_automatic_scan_detection(): + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0<=i<n}", + "{[j]: 0<=j<=2*i}" + ], + """ + a[i] = sum(j, j**2) + """ + ) + + cgr = lp.generate_code_v2(knl) + assert "scan" not in cgr.device_code() + + +def test_selective_scan_realization(): + pass + + +def test_force_outer_iname_for_scan(): + knl = lp.make_kernel( + "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}", + "out[i] = product(j, a[j]) {inames=i:k}") + + knl = lp.add_dtypes(knl, dict(a=np.float32)) + + # TODO: Maybe this deserves to work? 
+ with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + lp.realize_reduction(knl, force_scan=True) + + knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i") + + +def test_dependent_domain_scan(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0<=i<n}", + "{[j]: 0<=j<=2*i}" + ], + """ + a[i] = sum(j, j**2) {id=scan} + """ + ) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (a,) = knl(queue, n=100) + + assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all() + + +@pytest.mark.parametrize("i_tag, j_tag", [ + ("for", "for") + ]) +def test_nested_scan(ctx_factory, i_tag, j_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0 <= i < n}", + "[i] -> {[j]: 0 <= j <= i}", + "[i] -> {[k]: 0 <= k <= i}" + ], + """ + <>tmp[i] = sum(k, 1) + out[i] = sum(j, tmp[j]) + """) + + knl = lp.fix_parameters(knl, n=10) + knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag)) + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + evt, (out,) = knl(queue) + + print(out) + + +def test_scan_not_triangular(): + knl = lp.make_kernel( + "{[i,j]: 0<=i<100 and 1<=j<=2*i}", + """ + a[i] = sum(j, j**2) + """ + ) + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction(knl, force_scan=True) + + +@pytest.mark.parametrize("n", [1, 2, 3, 16, 17]) +def test_local_parallel_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}", + """ + out[i] = sum(j, a[j]**2) + """, + "..." 
+ ) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.realize_reduction(knl) + + knl = lp.add_dtypes(knl, dict(a=int)) + + print(knl) + + evt, (a,) = knl(queue, a=np.arange(n)) + assert (a == np.cumsum(np.arange(n)**2)).all() + + +def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}", + """ + out[i-1] = sum(j, a[j]**2) + """, + "..." + ) + + knl = lp.fix_parameters(knl, n=16) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.realize_reduction(knl, force_scan=True) + knl = lp.realize_reduction(knl) + + knl = lp.add_dtypes(knl, dict(a=int)) + evt, (out,) = knl(queue, a=np.arange(1, 17)) + + assert (out == np.cumsum(np.arange(1, 17)**2)).all() + + +def test_scan_extra_constraints_on_domain(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}", + "out[i] = sum(j, a[j])") + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction( + knl, force_scan=True, force_outer_iname_for_scan="i") + + +@pytest.mark.parametrize("sweep_iname_tag", ["for", "l.1"]) +def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[k]: 0<=k<=1}", + "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}" + ], + "out[k,i] = k + sum(j, j**2)" + ) + + knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag)) + n = 10 + knl = lp.fix_parameters(knl, n=n) + knl = lp.realize_reduction(knl, force_scan=True) + + evt, (out,) = knl(queue) + + inner = np.cumsum(np.arange(n)**2) + + assert (out.get() == np.array([inner, 1 + inner])).all() + + +@pytest.mark.parametrize("dtype", [ + np.int32, np.int64, np.float32, np.float64]) +def test_scan_data_types(ctx_factory, dtype): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = 
lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i }", + "res[i] = reduce(sum, j, a[j])", + assumptions="n>=1") + + a = np.random.randn(20).astype(dtype) + knl = lp.add_dtypes(knl, dict(a=dtype)) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np.cumsum(a)) + + +@pytest.mark.parametrize(("op_name", "np_op"), [ + ("sum", np.sum), + ("product", np.prod), + ("min", np.min), + ("max", np.max), + ]) +def test_scan_library(ctx_factory, op_name, np_op): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i }", + "res[i] = reduce(%s, j, a[j])" % op_name, + assumptions="n>=1") + + a = np.random.randn(20) + knl = lp.add_dtypes(knl, dict(a=np.float)) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np.array( + [np_op(a[:i+1]) for i in range(len(a))])) + + +def test_scan_unsupported_tags(): + pass + + +@pytest.mark.parametrize("i_tag", ["for", "l.0"]) +def test_argmax(ctx_factory, i_tag): + logging.basicConfig(level=logging.INFO) + + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 128 + + knl = lp.make_kernel( + "{[i,j]: 0<=i<%d and 0<=j<=i}" % n, + """ + max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j) + """) + + knl = lp.tag_inames(knl, dict(i=i_tag)) + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + knl = lp.realize_reduction(knl, force_scan=True) + + a = np.random.randn(n).astype(dtype) + evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True) + + assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all() + assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all() + + +def check_segmented_scan_output(arr, segment_boundaries_indices, out): + class SegmentGrouper(object): + + def __init__(self): + self.seg_idx = 0 + self.idx = 0 + + def __call__(self, key): + if self.idx in segment_boundaries_indices: + 
self.seg_idx += 1 + self.idx += 1 + return self.seg_idx + + from itertools import groupby + + expected = [np.cumsum(list(group)) + for _, group in groupby(arr, SegmentGrouper())] + actual = [np.array(list(group)) + for _, group in groupby(out, SegmentGrouper())] + + assert len(expected) == len(actual) == len(segment_boundaries_indices) + assert [(e == a).all() for e, a in zip(expected, actual)] + + +@pytest.mark.parametrize("n, segment_boundaries_indices", [ + (1, (0,)), + (2, (0,)), + (2, (0, 1)), + (3, (0,)), + (3, (0, 1)), + (3, (0, 2)), + (3, (0, 1, 2)), + (16, (0, 4, 8, 12))]) +@pytest.mark.parametrize("iname_tag", ("for", "l.0")) +def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + arr = np.ones(n, dtype=np.float32) + segment_boundaries = np.zeros(n, dtype=np.int32) + segment_boundaries[(segment_boundaries_indices,)] = 1 + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i}", + "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])", + [ + lp.GlobalArg("arr", np.float32, shape=("n",)), + lp.GlobalArg("segflag", np.int32, shape=("n",)), + "..." 
+ ]) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i=iname_tag)) + knl = lp.realize_reduction(knl, force_scan=True) + + (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries) + + check_segmented_scan_output(arr, segment_boundaries_indices, out) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_statistics.py b/test/test_statistics.py index 5e363f13594ee8e4cf170faa232b0783cca9d018..a72b62af90050008f837e144f1f28d4a4de1c730 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -49,7 +49,7 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -74,7 +74,7 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -100,7 +100,7 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -130,7 +130,7 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -166,7 +166,7 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -205,7 +205,7 @@ def test_op_counter_triangular_domain(): 
else: expect_fallback = False - op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -229,7 +229,7 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -269,7 +269,7 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -307,7 +307,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -343,7 +343,7 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -437,11 +437,11 @@ def test_mem_access_counter_mixed(): knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) - threads = 16 - knl = lp.split_iname(knl, "j", threads) + bsize = 16 + knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map 
= lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -463,8 +463,8 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) - assert f64uniform == 2*n*m - assert f32uniform == n*m*l/threads + assert f64uniform == 2*n*m*l/bsize + assert f32uniform == n*m*l/bsize assert f32nonconsec == 3*n*m*l f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -474,7 +474,7 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) - assert f64uniform == n*m + assert f64uniform == n*m*l/bsize assert f32nonconsec == n*m*l @@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -545,7 +545,7 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -563,7 +563,7 @@ def test_mem_access_counter_consec(): f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') ].eval_with_dict(params) - assert f64consec == 2*n*m + assert f64consec == 2*n*m*l assert f32consec == 3*n*m*l f64consec = mem_map[lp.MemAccess('global', np.float64, @@ -572,7 +572,7 @@ def test_mem_access_counter_consec(): f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) - assert f64consec == n*m + assert f64consec == n*m*l assert f32consec == n*m*l @@ -628,6 +628,7 @@ def test_barrier_counter_barriers(): def 
test_all_counters_parallel_matmul(): + bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ @@ -635,9 +636,9 @@ def test_all_counters_parallel_matmul(): ], name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 16) + knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1") + knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0") + knl = lp.split_iname(knl, "k", bsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) @@ -649,9 +650,9 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16 + assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ lp.Op(np.float32, 'mul') ].eval_with_dict(params) @@ -667,16 +668,17 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - op_map = lp.get_mem_access_map(knl) + op_map = lp.get_mem_access_map(knl, count_redundant_work=True) - f32coal = op_map[lp.MemAccess('global', np.float32, + f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') - ].eval_with_dict(params) + f32s1la = op_map[lp.MemAccess('global', np.float32, + stride=1, direction='load', variable='a') + ].eval_with_dict(params) - assert f32coal == n*m+m*l + assert f32s1lb == n*m*l/bsize + assert f32s1la == n*m*l/bsize f32coal = 
op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') @@ -684,7 +686,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local']) + local_mem_map = lp.get_mem_access_map(knl, + count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) @@ -742,7 +745,7 @@ def test_summations_and_filters(): l = 128 params = {'n': n, 'm': m, 'l': l} - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) loads_a = mem_map.filter_by(direction=['load'], variable=['a'] ).eval_and_sum(params) @@ -768,7 +771,7 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)