diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL: - pocl except: - tags + retry: 2 Python 3.6 POCL: script: diff --git a/doc/index.rst b/doc/index.rst index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu (See the full example for how to print the generated code.) +.. _static-binary: + Want to try out loopy? ---------------------- There's no need to go through :ref:`installation` if you'd just like to get a feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. +`download a self-contained Linux binary `_. This is purposefully built on an ancient Linux distribution, so it should work on most versions of Linux that are currently out there. diff --git a/doc/misc.rst b/doc/misc.rst index 347b5d098c8dc0e37bb72659c0b0de5a8b4e3704..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -3,6 +3,18 @@ Installation ============ +Option 0: Static Binary +----------------------- + +If you would just like to experiment with :mod:`loopy`'s code transformation +abilities, the easiest way to get loopy is to download a statically-linked +Linux binary. + +See :ref:`static-binary` for details. 
+ +Option 1: From Source, no PyOpenCL integration + ----------------------------------------------- + This command should install :mod:`loopy`:: pip install loo.py @@ -26,10 +38,59 @@ You may also clone its git repository:: git clone --recursive git://github.com/inducer/loopy git clone --recursive http://git.tiker.net/trees/loopy.git +Option 2: From Conda Forge, with PyOpenCL integration +----------------------------------------------------- + +This set of instructions is intended for 64-bit Linux and +macOS computers: + +#. Make sure your system has the basics to build software. + + On Debian derivatives (Ubuntu and many more), + installing ``build-essential`` should do the trick. + + Everywhere else, just making sure you have the ``g++`` package should be + enough. + +#. Install `miniconda `_. + (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.) + +#. ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3`` + + If you accepted the default location, this should work: + + ``export CONDA=$HOME/miniconda3`` + +#. ``$CONDA/bin/conda create -n dev`` + +#. ``source $CONDA/bin/activate dev`` + +#. ``conda config --add channels conda-forge`` + +#. ``conda install git pip pocl islpy pyopencl`` (Linux) + + or + + ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X) + +#. Type the following command:: + + pip install git+https://github.com/inducer/loopy + +Next time you want to use :mod:`loopy`, just run the following command:: + + source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev + +You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it. + +See the `PyOpenCL installation instructions +`_ for options +regarding OpenCL drivers. + User-visible Changes ==================== -Version 2016.2 +Version 2017.2 -------------- .. note:: @@ -57,7 +118,7 @@ Licensing Loopy is licensed to you under the MIT/X Consortium license: -Copyright (c) 2009-13 Andreas Klöckner and Contributors. 
+Copyright (c) 2009-17 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation @@ -342,6 +403,11 @@ Here's a Bibtex entry for your convenience:: doi = "{10.1145/2627373.2627387}", } +Getting help +============ + +Email the friendly folks on the `loopy mailing list `_. + Acknowledgments =============== diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 9138d9a41d7b33db956fd8aba55c0b3b788db064..07b7836d82596892f1d94e336dfa81e1b5a7a881 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -130,6 +130,7 @@ Iname Implementation Tags Tag Meaning =============================== ==================================================== ``None`` | ``"for"`` Sequential loop +``"ord"`` Forced-order sequential loop ``"l.N"`` Local (intra-group) axis N ("local") ``"g.N"`` Group-number axis N ("group") ``"unr"`` Unroll @@ -326,15 +327,25 @@ Expressions Loopy's expressions are a slight superset of the expressions supported by :mod:`pymbolic`. -* ``if`` -* ``elif`` (following an ``if``) -* ``else`` (following an ``if`` / ``elif``) +* ``if(cond, then, else_)`` + +* ``a[[ 8*i + j ]]``: Linear subscripts. + See :class:`loopy.symbolic.LinearSubscript`. + * ``reductions`` - * duplication of reduction inames + See :class:`loopy.symbolic.Reduction`. + * ``reduce`` vs ``simul_reduce`` + * complex-valued arithmetic + * tagging of array access and substitution rule use ("$") + See :class:`loopy.symbolic.TaggedVariable`. + * ``indexof``, ``indexof_vec`` +* ``cast(type, value)``: No parse syntax currently. + See :class:`loopy.symbolic.TypeCast`. + TODO: Functions TODO: Reductions @@ -579,4 +590,15 @@ Do not create :class:`LoopKernel` objects directly. 
Instead, refer to :members: :undoc-members: +Implementation Detail: The Base Array +------------------------------------- + +All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and +:class:`TemporaryVariable`) derive from a single, shared base array type, +described next. + +.. currentmodule:: loopy.kernel.array + +.. autoclass:: ArrayBase + .. vim: tw=75:spell:fdm=marker diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index d293e3ebe998a632bd547f94a67e675ff0592bfb..8bdd17b6295e9328bbbb4acbadd2be7e14ae625b 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -74,6 +74,8 @@ Manipulating Instructions .. autofunction:: add_nosync +.. autofunction:: add_barrier + Registering Library Routines ---------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 92ec799f7045cf63dc75d1386d8a51fd7d42954c..69f89548618e86b408a31af240bee84678c859c1 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it. i: None --------------------------------------------------------------------------- INSTRUCTIONS: - [i] out[i] <- 2*a[i] # insn + for i + out[i] = 2*a[i] {id=insn} + end i --------------------------------------------------------------------------- You'll likely have noticed that there's quite a bit more information here @@ -1105,11 +1107,12 @@ work item: :mod:`loopy` supports two kinds of barriers: -* *Local barriers* ensure consistency of local memory accesses to items within +* *Local barriers* ensure consistency of memory accesses to items within *the same* work group. This synchronizes with all instructions in the work - group. + group. The type of memory (local or global) may be specified by the + :attr:`loopy.instruction.BarrierInstruction.mem_kind`. -* *Global barriers* ensure consistency of global memory accesses +* *Global barriers* ensure consistency of memory accesses across *all* work groups, i.e. 
it synchronizes with every work item executing the kernel. Note that there is no exact equivalent for this kind of barrier in OpenCL. [#global-barrier-note]_ @@ -1118,14 +1121,17 @@ Once a work item has reached a barrier, it waits for everyone that it synchronizes with to reach the barrier before continuing. This means that unless all work items reach the same barrier, the kernel will hang during execution. +Barrier insertion +~~~~~~~~~~~~~~~~~ + By default, :mod:`loopy` inserts local barriers between two instructions when it detects that a dependency involving local memory may occur across work items. To see this in action, take a look at the section on :ref:`local_temporaries`. -In contrast, :mod:`loopy` will *not* insert global barriers automatically. -Global barriers require manual intervention along with some special -post-processing which we describe below. Consider the following kernel, which -attempts to rotate its input to the right by 1 in parallel: +In contrast, :mod:`loopy` will *not* insert global barriers automatically and +instead will report an error if it detects the need for a global barrier. As an +example, consider the following kernel, which attempts to rotate its input to +the right by 1 in parallel: .. doctest:: @@ -1153,8 +1159,22 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: ... MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) -The syntax for a global barrier instruction is ``... gbarrier``. This needs to -be added between the pair of offending instructions. +The syntax for inserting a global barrier instruction is +``... gbarrier``. :mod:`loopy` also supports manually inserting local +barriers. The syntax for a local barrier instruction is ``... lbarrier``. 
+ +Saving temporaries across global barriers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For some platforms (currently only PyOpenCL), :mod:`loopy` implements global +barriers by splitting the kernel into a host side kernel and multiple +device-side kernels. On such platforms, it will be necessary to save non-global +temporaries that are live across kernel calls. This section presents an example +of how to use :func:`loopy.save_and_reload_temporaries` which is helpful for +that purpose. + +Let us start with an example. Consider the kernel from above with a +``... gbarrier`` instruction that has already been inserted. .. doctest:: @@ -1175,17 +1195,16 @@ be added between the pair of offending instructions. ... assumptions="n mod 16 = 0") >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") -When we try to generate code for this, it will still not work. +Here is what happens when we try to generate code for the kernel: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) -To understand what is going on, you need to know that :mod:`loopy` implements -global barriers by splitting the kernel into multiple device-side kernels. The -splitting happens when the instruction schedule is generated. To see the -schedule, we must first call :func:`loopy.get_one_scheduled_kernel`: +This happens due to the kernel splitting done by :mod:`loopy`. The splitting +happens when the instruction schedule is generated. 
To see the schedule, we +should call :func:`loopy.get_one_scheduled_kernel`: >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> print(knl) @@ -1196,11 +1215,11 @@ schedule, we must first call :func:`loopy.get_one_scheduled_kernel`: --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 - 3: ---BARRIER:global--- + 3: ... gbarrier 4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[]) - 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 5: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 6: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1234,13 +1253,13 @@ put those instructions into the schedule. --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} + 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} 3: RETURN FROM KERNEL rotate_v2 - 4: ---BARRIER:global--- + 4: ... 
gbarrier 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] - 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload} + 7: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 8: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1280,7 +1299,7 @@ The kernel translates into two OpenCL kernels. arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } -Executing the kernel does what we expect. +Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) diff --git a/loopy/__init__.py b/loopy/__init__.py index aa1d43172a4bd6472f5974c292c4256946fcf542..7a853d11570226a7a3fe35539f590e7f78ea3f44 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -27,7 +27,7 @@ import six from six.moves import range, zip from loopy.symbolic import ( - TaggedVariable, Reduction, LinearSubscript, ) + TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning @@ -112,7 +112,7 @@ from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries - +from loopy.transform.add_barrier import add_barrier # }}} from loopy.type_inference import infer_unknown_types @@ -145,7 +145,7 @@ from loopy.target.numba import NumbaTarget, NumbaCudaTarget __all__ = [ - "TaggedVariable", "Reduction", "LinearSubscript", + "TaggedVariable", "Reduction", "LinearSubscript", "TypeCast", "auto", @@ -215,6 +215,8 @@ __all__ = [ "save_and_reload_temporaries", + "add_barrier", + # }}} "get_dot_dependency_graph", diff --git a/loopy/check.py b/loopy/check.py index 
741195ae6ac87d01de3a4ac620ce510fd62ff470..7e661b566b15c47ec99e03ffdeb035057602da76 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -60,6 +60,12 @@ def check_identifiers_in_subst_rules(knl): # {{{ sanity checks run pre-scheduling + +# FIXME: Replace with an enum. See +# https://gitlab.tiker.net/inducer/loopy/issues/85 +VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) + + def check_insn_attributes(kernel): all_insn_ids = set(insn.id for insn in kernel.instructions) @@ -76,6 +82,30 @@ def check_insn_attributes(kernel): % (insn.id, ", ".join( insn.depends_on - all_insn_ids))) + no_sync_with_insn_ids = set(id for id, scope in insn.no_sync_with) + if not no_sync_with_insn_ids <= all_insn_ids: + raise LoopyError("insn '%s' has nosync directive with unknown " + "instruction ids: %s" + % (insn.id, + ", ".join(no_sync_with_insn_ids - all_insn_ids))) + + no_sync_with_scopes = set(scope for id, scope in insn.no_sync_with) + if not no_sync_with_scopes <= VALID_NOSYNC_SCOPES: + raise LoopyError("insn '%s' has invalid nosync scopes: %s" + % (insn.id, + ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES))) + + +def check_for_duplicate_insn_ids(knl): + insn_ids = set() + + for insn in knl.instructions: + if not isinstance(insn.id, str): + raise LoopyError("instruction id %r is not a string" % insn.id) + if insn.id in insn_ids: + raise LoopyError("duplicate instruction id: '%s'" % insn.id) + insn_ids.add(insn.id) + def check_loop_priority_inames_known(kernel): for prio in kernel.loop_priority: @@ -114,20 +144,20 @@ def check_for_inactive_iname_access(kernel): def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (temp_var_scope, - LocalIndexTagBase, GroupIndexTag, ParallelTag, auto) + LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) if tv.scope == temp_var_scope.PRIVATE: return ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) elif tv.scope == temp_var_scope.LOCAL: return ( 
- isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) elif tv.scope == temp_var_scope.GLOBAL: - return isinstance(tag, ParallelTag) + return isinstance(tag, ConcurrentTag) elif tv.scope == auto: raise LoopyError("scope of temp var '%s' has not yet been" @@ -139,7 +169,7 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: @@ -160,7 +190,7 @@ def check_for_write_races(kernel): raceable_parallel_insn_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(iname_to_tag(iname), ParallelTag)) + if isinstance(iname_to_tag(iname), ConcurrentTag)) elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] @@ -200,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel): def check_for_data_dependent_parallel_bounds(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag for i, dom in enumerate(kernel.domains): dom_inames = set(dom.get_var_names(dim_type.set)) par_inames = set(iname for iname in dom_inames - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) if not par_inames: continue @@ -356,6 +386,7 @@ def pre_schedule_checks(kernel): try: logger.debug("%s: pre-schedule check: start" % kernel.name) + check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) check_insn_attributes(kernel) @@ -370,7 +401,7 @@ def pre_schedule_checks(kernel): logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: raise - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule check:") print(75*"=") @@ -628,7 +659,7 @@ def pre_codegen_checks(kernel): 
check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule check:") print(75*"=") @@ -677,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None): (insn_impl_domain & assumptions) .project_out_except(insn_inames, [dim_type.set])) + from loopy.kernel.instruction import BarrierInstruction + from loopy.kernel.data import LocalIndexTag + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + non_lid_inames = frozenset( + [iname for iname in insn_inames if not isinstance( + kernel.iname_to_tag.get(iname), LocalIndexTag)]) + insn_impl_domain = insn_impl_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_domain = kernel.get_inames_domain(insn_inames) insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param)) assumptions, insn_domain = align_two(assumption_non_param, insn_domain) @@ -684,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None): .project_out_except(insn_inames, [dim_type.set]) .project_out_except(insn_parameters, [dim_type.param])) + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + desired_domain = desired_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_impl_domain = (insn_impl_domain .project_out_except(insn_parameters, [dim_type.param])) insn_impl_domain, desired_domain = align_two( diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 009dadc1a0d6236f092029dbc03ad0c035c7b8f8..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import 
WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -357,8 +357,9 @@ class CodeGenerationState(object): # }}} -code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +code_gen_cache = WriteOncePersistentDict( + "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) class PreambleInfo(ImmutableRecord): @@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord): .. attribute:: seen_dtypes .. attribute:: seen_functions .. attribute:: seen_atomic_dtypes + .. attribute:: codegen_state """ @@ -495,7 +497,9 @@ def generate_code_v2(kernel): seen_dtypes=seen_dtypes, seen_functions=seen_functions, # a set of LoopyTypes (!) - seen_atomic_dtypes=seen_atomic_dtypes) + seen_atomic_dtypes=seen_atomic_dtypes, + codegen_state=codegen_state + ) preamble_generators = (kernel.preamble_generators + kernel.target.get_device_ast_builder().preamble_generators()) @@ -507,15 +511,15 @@ def generate_code_v2(kernel): # }}} # For faster unpickling in the common case when implemented_domains isn't needed. 
- from loopy.tools import LazilyUnpicklingDictionary + from loopy.tools import LazilyUnpicklingDict codegen_result = codegen_result.copy( - implemented_domains=LazilyUnpicklingDictionary( + implemented_domains=LazilyUnpicklingDict( codegen_result.implemented_domains)) logger.info("%s: generate code: done" % kernel.name) if CACHING_ENABLED: - code_gen_cache[input_kernel] = codegen_result + code_gen_cache.store_if_not_present(input_kernel, codegen_result) return codegen_result diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) - from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag + from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): # at the innermost level of nesting. 
if ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 789c00d33b7bb41816e6901e24046d4b0eefb27d..e3e209726879741c31d686f2a6530e1b7ec67b97 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): kernel = codegen_state.kernel - from loopy.kernel.data import LocalIndexTag, HardwareParallelTag + from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag from loopy.schedule import find_active_inames_at, has_barrier_within result = find_active_inames_at(kernel, sched_index) @@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): has_barrier = has_barrier_within(kernel, sched_index) for iname, tag in six.iteritems(kernel.iname_to_tag): - if (isinstance(tag, HardwareParallelTag) + if (isinstance(tag, HardwareConcurrentTag) and codegen_state.is_generating_device_code): if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) @@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index): generate_sequential_loop_dim_code) from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, - LoopedIlpTag, VectorizeTag) + LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag) if isinstance(tag, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop elif isinstance(tag, VectorizeTag): func = generate_vectorize_loop - elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)): + elif tag is None or isinstance(tag, ( + LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop " @@ -155,7 +156,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): if 
codegen_state.is_generating_device_code: barrier_ast = codegen_state.ast_builder.emit_barrier( - sched_item.kind, sched_item.comment) + sched_item.synchronization_kind, sched_item.mem_kind, + sched_item.comment) if sched_item.originating_insn_id: return CodeGenerationResult.new( codegen_state, @@ -166,7 +168,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): return barrier_ast else: # host code - if sched_item.kind in ["global", "local"]: + if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( host_program=None, @@ -175,8 +177,9 @@ def generate_code_for_sched_index(codegen_state, sched_index): implemented_data_info=codegen_state.implemented_data_info) else: - raise LoopyError("do not know how to emit code for barrier kind '%s'" - "in host code" % sched_item.kind) + raise LoopyError("do not know how to emit code for barrier " + "synchronization kind '%s'" "in host code" + % sched_item.synchronization_kind) # }}} @@ -240,6 +243,15 @@ def build_loop_nest(codegen_state, schedule_index): kernel = codegen_state.kernel + # If the AST builder does not implement conditionals, we can save us + # some work about hoisting conditionals and directly go into recursion. + if not codegen_state.ast_builder.can_implement_conditionals: + result = [] + inner = generate_code_for_sched_index(codegen_state, schedule_index) + if inner is not None: + result.append(inner) + return merge_codegen_results(codegen_state, result) + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0110a06095fa0bd690045f050136027d7bed3a28..1db7b0445efd2a2e27e761164fa919647df37a07 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -90,7 +90,7 @@ def get_slab_decomposition(kernel, iname): iname_rel_aff(space, iname, "<=", upper_bound_aff-upper_incr))) else: - lower_slab = None + upper_slab = None slabs = [] @@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, kernel = codegen_state.kernel from loopy.kernel.data import ( - UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) + UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left = [iname for iname in all_inames_by_insns - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] + if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)] if not hw_inames_left: return next_func(codegen_state) @@ -446,7 +446,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.symbolic import pw_aff_to_expr - if ubound.is_equal(lbound): + if impl_ubound.is_equal(impl_lbound): # single-trip, generate just a variable assignment, not a loop inner = merge_codegen_results(codegen_state, [ astb.emit_initializer( diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 5f0884fd44ed5064f3f195d103b164f2163d1d19..5a747d070a47ff89336c22c8237ff03e567d0a8a 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -203,7 +203,7 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) if len(pieces) == 1: (_, result), = pieces if constants_only and not result.is_cst(): - raise ValueError("a numeric %s was not found for PwAff '%s'" + raise StaticValueFindingError("a numeric %s was not 
found for PwAff '%s'" % (what, pw_aff)) return result @@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set): from loopy.symbolic import aff_from_expr try: aff = aff_from_expr(space, -expr-1) - except: + except Exception: return None expr_neg_set = isl.BasicSet.universe(space).add_constraint( isl.Constraint.inequality_from_aff(aff)) @@ -616,10 +616,12 @@ def get_simple_strides(bset, key_by="name"): # recognizes constraints of the form # -i0 + 2*floor((i0)/2) == 0 - if aff.dim(dim_type.div) != 1: + divs_with_coeffs = _get_indices_and_coeffs(aff, [dim_type.div]) + if len(divs_with_coeffs) != 1: continue - idiv = 0 + (_, idiv, div_coeff), = divs_with_coeffs + div = aff.get_div(idiv) # check for sub-divs @@ -630,7 +632,7 @@ def get_simple_strides(bset, key_by="name"): denom = div.get_denominator_val().to_python() # if the coefficient in front of the div is not the same as the denominator - if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one(): + if not div_coeff.div(denom).is_one(): # not supported continue diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 622f5e49be1e40b4156113d92907fe8b1d9fb859..88a5717642af6d9ebc1bd7770936ae44e8cbf44b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -40,6 +40,8 @@ from loopy.library.function import ( single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import natsorted +from loopy.diagnostic import StaticValueFindingError # {{{ unique var names @@ -212,45 +214,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=kernel_state.INITIAL, target=None, - # When kernels get intersected in slab decomposition, - # their grid sizes shouldn't change. This provides - # a way to forward sub-kernel grid size requests. overridden_get_grid_sizes_for_insn_ids=None): + """ + :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get + intersected in slab decomposition, their grid sizes shouldn't + change. 
This provides a way to forward sub-kernel grid size requests. + """ if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() - # {{{ make instruction ids unique - - from loopy.kernel.creation import UniqueName - - insn_ids = set() - for insn in instructions: - if insn.id is not None and not isinstance(insn.id, UniqueName): - if insn.id in insn_ids: - raise RuntimeError("duplicate instruction id: %s" % insn.id) - insn_ids.add(insn.id) - - insn_id_gen = UniqueNameGenerator(insn_ids) - - new_instructions = [] - - for insn in instructions: - if insn.id is None: - new_instructions.append( - insn.copy(id=insn_id_gen("insn"))) - elif isinstance(insn.id, UniqueName): - new_instructions.append( - insn.copy(id=insn_id_gen(insn.id.name))) - else: - new_instructions.append(insn) - - instructions = new_instructions - del new_instructions - - # }}} - # {{{ process assumptions if assumptions is None: @@ -729,12 +703,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): tag_key_uses = {} - from loopy.kernel.data import HardwareParallelTag + from loopy.kernel.data import HardwareConcurrentTag for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag): + if isinstance(tag, HardwareConcurrentTag): tag_key_uses.setdefault(tag.key, []).append(iname) multi_use_keys = set( @@ -744,7 +718,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): multi_use_inames = set() for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: + if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys: multi_use_inames.add(iname) return frozenset(cond_inames - multi_use_inames) @@ -986,8 +960,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) - except ValueError: + 
constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) + except StaticValueFindingError: pass tgt_dict[tag.axis] = size @@ -1156,20 +1131,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): else: sep = [] - def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - if "name" in what: lines.extend(sep) lines.append("KERNEL: " + kernel.name) @@ -1207,7 +1168,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if "rules" in what and kernel.substitutions: lines.extend(sep) if show_labels: - lines.append("SUBSTIUTION RULES:") + lines.append("SUBSTITUTION RULES:") for rule_name in natsorted(six.iterkeys(kernel.substitutions)): lines.append(str(kernel.substitutions[rule_name])) @@ -1215,113 +1176,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): lines.extend(sep) if show_labels: lines.append("INSTRUCTIONS:") - loop_list_width = 35 - - # {{{ topological sort - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - # }}} - - import loopy as lp - - Fore = self.options._fore # noqa - Style = self.options._style # noqa - - from loopy.kernel.tools import draw_dependencies_as_unicode_arrows - for insn, (arrows, extender) in zip( - printed_insn_order, - draw_dependencies_as_unicode_arrows( - printed_insn_order, fore=Fore, style=Style)): - - if 
isinstance(insn, lp.MultiAssignmentBase): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = str(insn.expression) - trailing = [] - elif isinstance(insn, lp.CInstruction): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = "CODE(%s|%s)" % ( - ", ".join(str(x) for x in insn.read_variables), - ", ".join("%s=%s" % (name, expr) - for name, expr in insn.iname_exprs)) - - trailing = [" "+l for l in insn.code.split("\n")] - elif isinstance(insn, lp.BarrierInstruction): - lhs = "" - rhs = "... %sbarrier" % insn.kind[0] - trailing = [] - - elif isinstance(insn, lp.NoOpInstruction): - lhs = "" - rhs = "... nop" - trailing = [] - - else: - raise LoopyError("unexpected instruction type: %s" - % type(insn).__name__) - - order = self._get_iname_order_for_printing() - loop_list = ",".join( - sorted(kernel.insn_inames(insn), key=lambda iname: order[iname])) - - options = [Fore.GREEN+insn.id+Style.RESET_ALL] - if insn.priority: - options.append("priority=%d" % insn.priority) - if insn.tags: - options.append("tags=%s" % ":".join(insn.tags)) - if isinstance(insn, lp.Assignment) and insn.atomicity: - options.append("atomic=%s" % ":".join( - str(a) for a in insn.atomicity)) - if insn.groups: - options.append("groups=%s" % ":".join(insn.groups)) - if insn.conflicts_with_groups: - options.append( - "conflicts=%s" % ":".join(insn.conflicts_with_groups)) - if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join( - "%s@%s" % entry for entry in sorted(insn.no_sync_with))) - - if lhs: - core = "%s <- %s" % ( - Fore.CYAN+lhs+Style.RESET_ALL, - Fore.MAGENTA+rhs+Style.RESET_ALL, - ) - else: - core = Fore.MAGENTA+rhs+Style.RESET_ALL - - if len(loop_list) > loop_list_width: - lines.append("%s [%s]" % (arrows, loop_list)) - lines.append("%s %s%s # %s" % ( - extender, - (loop_list_width+2)*" ", - core, - ", ".join(options))) - else: - lines.append("%s [%s]%s%s # %s" % ( - arrows, - loop_list, " "*(loop_list_width-len(loop_list)), - core, - ",".join(options))) - - 
lines.extend(trailing) - - if insn.predicates: - lines.append(10*" " + "if (%s)" % " && ".join( - [str(x) for x in insn.predicates])) + from loopy.kernel.tools import stringify_instruction_list + lines.extend(stringify_instruction_list(kernel)) dep_lines = [] for insn in kernel.instructions: @@ -1502,6 +1359,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return hash(key_hash.digest()) def __eq__(self, other): + if self is other: + return True + if not isinstance(other, LoopKernel): return False @@ -1515,7 +1375,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return False elif field_name == "assumptions": - if not self.assumptions.plain_is_equal(other.assumptions): + if not ( + self.assumptions.plain_is_equal(other.assumptions) + or self.assumptions.is_equal(other.assumptions)): return False elif getattr(self, field_name) != getattr(other, field_name): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 531cc822e1bc76573ef6e0812970d16bd6df0b17..5d4240b9ab3e1ce2ad356a93b5e21b3bbf4d499e 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -567,6 +567,14 @@ class ArrayBase(ImmutableRecord): informational/documentational purpose. On occasion, they are used to generate more informative names than could be achieved by axis numbers. + + .. automethod:: __init__ + .. automethod:: __eq__ + .. automethod:: num_user_axes + .. automethod:: num_target_axes + .. automethod:: vector_size + + (supports persistent hashing) """ # Note that order may also wind up in attributes, if the @@ -579,7 +587,8 @@ class ArrayBase(ImmutableRecord): target=None, **kwargs): """ - All of the following are optional. Specify either strides or shape. + All of the following (except *name*) are optional. + Specify either strides or shape. 
:arg name: May contain multiple names separated by commas, in which case multiple arguments, @@ -643,8 +652,9 @@ class ArrayBase(ImmutableRecord): :arg offset: Offset from the beginning of the buffer to the point from which the strides are counted. May be one of - * 0 + * 0 or None * a string (that is interpreted as an argument name). + * a pymbolic expression * :class:`loopy.auto`, in which case an offset argument is added automatically, immediately following this argument. :class:`loopy.CompiledKernel` is even smarter in its treatment of @@ -877,6 +887,7 @@ class ArrayBase(ImmutableRecord): :class:`pytools.persistent_dict.PersistentDict`. """ + key_builder.rec(key_hash, type(self).__name__.encode("utf-8")) key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.dtype) self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape) @@ -1039,7 +1050,9 @@ class ArrayBase(ImmutableRecord): is_written=is_written) - if self.offset: + import loopy as lp + + if self.offset is lp.auto: offset_name = full_name+"_offset" yield ImplementedDataInfo( target=target, @@ -1205,12 +1218,16 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): return result def apply_offset(sub): - if ary.offset: - offset_name = ary.offset - if offset_name is lp.auto: - offset_name = array_name+"_offset" + import loopy as lp - return var(offset_name) + sub + if ary.offset: + if ary.offset is lp.auto: + return var(array_name+"_offset") + sub + elif isinstance(ary.offset, str): + return var(ary.offset) + sub + else: + # assume it's an expression + return ary.offset + sub else: return sub diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 89cb5f26a4940656cca1ab09841311148e113275..fb935476d54b3f9eb0a3bf858c883fe4c75eaa5a 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -51,9 +51,14 @@ logger = logging.getLogger(__name__) _IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b") +# source: check_keywords() in 
isl_stream.c, ISL version 0.17 +_ISL_KEYWORDS = frozenset(""" + exists and or implies not infty infinity NaN min max rat true false ceild + floord mod ceil floor""".split()) + def _gather_isl_identifiers(s): - return set(_IDENTIFIER_RE.findall(s)) - set(["and", "or", "exists"]) + return set(_IDENTIFIER_RE.findall(s)) - _ISL_KEYWORDS class UniqueName: @@ -352,6 +357,14 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): % v) del assignee_name + elif opt_key == "mem_kind": + opt_value = opt_value.lower().strip() + if opt_value not in ['local', 'global']: + raise LoopyError("Unknown memory synchronization type %s specified" + " expected, 'local' or 'global'." + % opt_value) + result["mem_kind"] = opt_value + else: raise ValueError( "unrecognized instruction option '%s' " @@ -420,6 +433,17 @@ SUBST_RE = re.compile( r"^\s*(?P.+?)\s*:=\s*(?P.+)\s*$") +def check_illegal_options(insn_options, insn_type): + illegal_options = [] + if insn_type not in ['gbarrier', 'lbarrier']: + illegal_options.append('mem_kind') + + bad_options = [x for x in illegal_options if x in insn_options] + if bad_options: + raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" % + ', '.join(bad_options), insn_type) + + def parse_insn(groups, insn_options): """ :return: a tuple ``(insn, inames_to_dup)``, where insn is a @@ -434,7 +458,7 @@ def parse_insn(groups, insn_options): if "lhs" in groups: try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise @@ -443,7 +467,7 @@ def parse_insn(groups, insn_options): try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -493,6 +517,9 @@ def parse_insn(groups, insn_options): groups["options"], assignee_names=assignee_names) + # check for bad options + check_illegal_options(insn_options, 'assignment') + insn_id = 
insn_options.pop("insn_id", None) inames_to_dup = insn_options.pop("inames_to_dup", []) @@ -517,14 +544,14 @@ def parse_subst_rule(groups): from loopy.symbolic import parse try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -578,13 +605,15 @@ def parse_special_insn(groups, insn_options): from loopy.kernel.instruction import NoOpInstruction, BarrierInstruction special_insn_kind = groups["kind"] + # check for bad options + check_illegal_options(insn_options, special_insn_kind) if special_insn_kind == "gbarrier": cls = BarrierInstruction - kwargs["kind"] = "global" + kwargs["synchronization_kind"] = "global" elif special_insn_kind == "lbarrier": cls = BarrierInstruction - kwargs["kind"] = "local" + kwargs["synchronization_kind"] = "local" elif special_insn_kind == "nop": cls = NoOpInstruction else: @@ -792,6 +821,8 @@ def parse_instructions(instructions, defines): parse_insn_options( insn_options_stack[-1], with_options_match.group("options"))) + # check for bad options + check_illegal_options(insn_options_stack[-1], 'with-block') continue for_match = FOR_RE.match(insn) @@ -896,7 +927,8 @@ def parse_instructions(instructions, defines): obj = insn_options_stack.pop() #if this object is the end of an if statement if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\ - if_predicates_stack[-1]["insn_predicates"]: + if_predicates_stack[-1]["insn_predicates"] and\ + obj['within_inames'] == if_predicates_stack[-1]['within_inames']: if_predicates_stack.pop() continue @@ -991,7 +1023,7 @@ def parse_domains(domains, defines): try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) - except: + except Exception: print("failed to parse domain '%s'" % dom) raise else: @@ -1859,6 
+1891,13 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg seq_dependencies: If *True*, dependencies that sequentially connect the given *instructions* will be added. Defaults to *False*. + :arg fixed_parameters: A dictionary of *name*/*value* pairs, where *name* + will be fixed to *value*. *name* may refer to :ref:`domain-parameters` + or :ref:`arguments`. See also :func:`loopy.fix_parameters`. + + .. versionchanged:: 2017.2 + + *fixed_parameters* added. .. versionchanged:: 2016.3 @@ -1876,6 +1915,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): flags = kwargs.pop("flags", None) target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) + fixed_parameters = kwargs.pop("fixed_parameters", {}) if defines: from warnings import warn @@ -1976,6 +2016,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target=target, **kwargs) + from loopy.transform.instruction import uniquify_instruction_ids + knl = uniquify_instruction_ids(knl) + from loopy.check import check_for_duplicate_insn_ids + check_for_duplicate_insn_ids(knl) + if seq_dependencies: knl = add_sequential_dependencies(knl) @@ -1996,11 +2041,14 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # ------------------------------------------------------------------------- # Must create temporaries before inferring inames (because those temporaries # mediate dependencies that are then used for iname propagation.) + # Must create temporaries before fixing parameters. # ------------------------------------------------------------------------- knl = add_used_inames(knl) # NOTE: add_inferred_inames will be phased out and throws warnings if it # does something. 
knl = add_inferred_inames(knl) + from loopy.transform.parameter import fix_parameters + knl = fix_parameters(knl, **fixed_parameters) # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord): return type(self).__name__ -class ParallelTag(IndexTag): +class ConcurrentTag(IndexTag): pass -class HardwareParallelTag(ParallelTag): +class HardwareConcurrentTag(ConcurrentTag): pass +# deprecated aliases +ParallelTag = ConcurrentTag +HardwareParallelTag = HardwareConcurrentTag + + class UniqueTag(IndexTag): pass @@ -105,11 +110,11 @@ class AxisTag(UniqueTag): self.print_name, self.axis) -class GroupIndexTag(HardwareParallelTag, AxisTag): +class GroupIndexTag(HardwareConcurrentTag, AxisTag): print_name = "g" -class LocalIndexTagBase(HardwareParallelTag): +class LocalIndexTagBase(HardwareConcurrentTag): pass @@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase): # {{{ ilp-like -class IlpBaseTag(ParallelTag): +class IlpBaseTag(ConcurrentTag): pass @@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag): return "forceseq" +class InOrderSequentialSequentialTag(IndexTag): + def __str__(self): + return "ord" + + def parse_tag(tag): if tag is None: return tag @@ -173,6 +183,8 @@ def parse_tag(tag): if tag == "for": return None + elif tag == "ord": + return InOrderSequentialSequentialTag() elif tag in ["unr"]: return UnrollTag() elif tag in ["vec"]: @@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase): A :class:`bool` indicating whether the variable may be written during its lifetime. If *True*, *initializer* must be given. + + .. 
attribute:: _base_storage_access_may_be_aliasing + + Whether the temporary is used to alias the underlying base storage. + Defaults to *False*. If *False*, C-based code generators will declare + the temporary as a ``restrict`` const pointer to the base storage + memory location. If *True*, the restrict part is omitted on this + declaration. """ min_target_axes = 0 @@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase): "base_storage", "initializer", "read_only", + "_base_storage_access_may_be_aliasing", ] def __init__(self, name, dtype=None, shape=(), scope=auto, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, - base_storage=None, initializer=None, read_only=False, **kwargs): + base_storage=None, initializer=None, read_only=False, + _base_storage_access_may_be_aliasing=False, **kwargs): """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase): "mutually exclusive" % name) + if base_storage is None and _base_storage_access_may_be_aliasing: + raise LoopyError( + "temporary variable '%s': " + "_base_storage_access_may_be_aliasing option, but no " + "base_storage given!" 
+ % name) + ArrayBase.__init__(self, name=intern(name), dtype=dtype, shape=shape, dim_tags=dim_tags, offset=offset, dim_names=dim_names, @@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase): base_storage=base_storage, initializer=initializer, read_only=read_only, + _base_storage_access_may_be_aliasing=( + _base_storage_access_may_be_aliasing), **kwargs) @property @@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase): and ( (self.initializer is None and other.initializer is None) or np.array_equal(self.initializer, other.initializer)) - and self.read_only == other.read_only) + and self.read_only == other.read_only + and (self._base_storage_access_may_be_aliasing + == other._base_storage_access_may_be_aliasing) + ) def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase): self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) + key_builder.rec(key_hash, self.scope) + key_builder.rec(key_hash, self.base_storage) initializer = self.initializer if initializer is not None: @@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase): key_builder.rec(key_hash, initializer) key_builder.rec(key_hash, self.read_only) + key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing) # }}} +def iname_tag_to_temp_var_scope(iname_tag): + iname_tag = parse_tag(iname_tag) + + if isinstance(iname_tag, GroupIndexTag): + return temp_var_scope.GLOBAL + elif isinstance(iname_tag, LocalIndexTag): + return temp_var_scope.LOCAL + else: + return temp_var_scope.PRIVATE + + # {{{ substitution rule class SubstitutionRule(ImmutableRecord): diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d5c388af60a39987c09092fc93325f067a8f4cf7..dbd99e85016b00b3df4827ad7999e7b57e58af24 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -152,6 +152,12 @@ class 
InstructionBase(ImmutableRecord): "within_inames_is_final within_inames " "priority boostable boostable_into".split()) + # Names of fields that are pymbolic expressions. Needed for key building + pymbolic_fields = set("") + + # Names of fields that are sets of pymbolic expressions. Needed for key building + pymbolic_set_fields = set(["predicates"]) + def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, @@ -407,7 +413,27 @@ class InstructionBase(ImmutableRecord): return result - # {{{ comparison, hashing + # {{{ hashing and key building + + @property + @memoize_method + def _key_builder(self): + from loopy.tools import LoopyEqKeyBuilder + key_builder = LoopyEqKeyBuilder() + key_builder.update_for_class(self.__class__) + + for field_name in self.fields: + field_value = getattr(self, field_name) + if field_name in self.pymbolic_fields: + key_builder.update_for_pymbolic_field(field_name, field_value) + elif field_name in self.pymbolic_set_fields: + # First sort the fields, as a canonical form + items = tuple(sorted(field_value, key=str)) + key_builder.update_for_pymbolic_field(field_name, items) + else: + key_builder.update_for_field(field_name, field_value) + + return key_builder def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -416,9 +442,7 @@ class InstructionBase(ImmutableRecord): Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
""" - # Order matters for hash forming--sort the field names - for field_name in sorted(self.fields): - key_builder.rec(key_hash, getattr(self, field_name)) + key_builder.rec(key_hash, self._key_builder.hash_key()) # }}} @@ -648,6 +672,7 @@ class MultiAssignmentBase(InstructionBase): """An assignment instruction with an expression as a right-hand side.""" fields = InstructionBase.fields | set(["expression"]) + pymbolic_fields = InstructionBase.pymbolic_fields | set(["expression"]) @memoize_method def read_dependency_names(self): @@ -734,6 +759,7 @@ class Assignment(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"]) def __init__(self, assignee, expression, @@ -818,26 +844,6 @@ class Assignment(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. 
- for field_name in sorted(self.fields): - if field_name in ["assignee", "expression"]: - key_builder.update_for_pymbolic_expression( - key_hash, getattr(self, field_name)) - elif field_name == "predicates": - preds = sorted(self.predicates, key=str) - for pred in preds: - key_builder.update_for_pymbolic_expression( - key_hash, pred) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - # {{{ for interface uniformity with CallInstruction @property @@ -886,6 +892,7 @@ class CallInstruction(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignees temp_var_types".split()) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignees"]) def __init__(self, assignees, expression, @@ -987,26 +994,6 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. 
- for field_name in sorted(self.fields): - if field_name in ["assignees", "expression"]: - key_builder.update_for_pymbolic_expression( - key_hash, getattr(self, field_name)) - elif field_name == "predicates": - preds = sorted(self.predicates, key=str) - for pred in preds: - key_builder.update_for_pymbolic_expression( - key_hash, pred) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment @@ -1086,6 +1073,10 @@ class CInstruction(InstructionBase): fields = InstructionBase.fields | \ set("iname_exprs code read_variables assignees".split()) + pymbolic_fields = InstructionBase.pymbolic_fields | \ + set("iname_exprs assignees".split()) + pymbolic_set_fields = InstructionBase.pymbolic_set_fields | \ + set(["read_variables"]) def __init__(self, iname_exprs, code, @@ -1210,25 +1201,6 @@ class CInstruction(InstructionBase): return first_line + "\n " + "\n ".join( self.code.split("\n")) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. - for field_name in sorted(self.fields): - if field_name == "assignees": - for a in self.assignees: - key_builder.update_for_pymbolic_expression(key_hash, a) - elif field_name == "iname_exprs": - for name, val in self.iname_exprs: - key_builder.rec(key_hash, name) - key_builder.update_for_pymbolic_expression(key_hash, val) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - # }}} @@ -1308,19 +1280,29 @@ class NoOpInstruction(_DataObliviousInstruction): class BarrierInstruction(_DataObliviousInstruction): """An instruction that requires synchronization with all - concurrent work items of :attr:`kind`. + concurrent work items of :attr:`synchronization_kind`. - .. 
attribute:: kind + .. attribute:: synchronization_kind A string, ``"global"`` or ``"local"``. + .. attribute:: mem_kind + + A string, ``"global"`` or ``"local"``. Chooses which memory type to + sychronize, for targets that require this (e.g. OpenCL) + The textual syntax in a :mod:`loopy` kernel is:: ... gbarrier ... lbarrier + + Note that the memory type :attr:`mem_kind` can be specified for local barriers:: + + ... lbarrier {mem_kind=global} """ - fields = _DataObliviousInstruction.fields | set(["kind"]) + fields = _DataObliviousInstruction.fields | set(["synchronization_kind", + "mem_kind"]) def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, @@ -1328,7 +1310,8 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, - predicates=None, tags=None, kind="global"): + predicates=None, tags=None, synchronization_kind="global", + mem_kind="local"): if predicates: raise LoopyError("conditional barriers are not supported") @@ -1346,20 +1329,32 @@ class BarrierInstruction(_DataObliviousInstruction): boostable=boostable, boostable_into=boostable_into, predicates=predicates, - tags=tags, + tags=tags ) - self.kind = kind + self.synchronization_kind = synchronization_kind + self.mem_kind = mem_kind def __str__(self): - first_line = "%s: ... %sbarrier" % (self.id, self.kind[0]) + first_line = "%s: ... 
%sbarrier" % (self.id, self.synchronization_kind[0]) options = self.get_str_options() + if self.synchronization_kind == "local": + # add the memory kind + options += ['mem_kind={}'.format(self.mem_kind)] if options: first_line += " {%s}" % (": ".join(options)) return first_line + @property + def kind(self): + from warnings import warn + warn("BarrierInstruction.kind is deprecated, use synchronization_kind " + "instead", DeprecationWarning, stacklevel=2) + return self.synchronization_kind + # }}} + # vim: foldmethod=marker diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..a65e7fb4ceefd28a909dcb6cee24ea437f15a60e 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -35,7 +35,7 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg - +from loopy.tools import natsorted import logging logger = logging.getLogger(__name__) @@ -620,11 +620,11 @@ class DomainParameterFinder(object): if dep.name in param_names: from pymbolic.algorithm import solve_affine_equations_for try: - # friggin' overkill :) + # overkill :) param_expr = solve_affine_equations_for( [dep.name], [(shape_i, var("shape_i"))] )[dep.name] - except: + except Exception: # went wrong? oh well pass else: @@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name): if n_axes == 1: # Leave shape undetermined--we can live with that for 1D. 
- shape = (None,) + shape = None else: raise LoopyError("cannot determine access range for '%s': " "undetermined index in subscript(s) '%s'" @@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name): kernel.cache_manager.dim_max( armap.access_range, i) + 1, constants_only=False))) - except: + except Exception: print("While trying to find shape axis %d of " "variable '%s', the following " "exception occurred:" % (i, var_name), @@ -1371,7 +1371,170 @@ def draw_dependencies_as_unicode_arrows( conform_to_uniform_length(extender)) for row, extender in rows] - return rows + return uniform_length, rows + +# }}} + + +# {{{ stringify_instruction_list + +def stringify_instruction_list(kernel): + # {{{ topological sort + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + # }}} + + import loopy as lp + + Fore = kernel.options._fore # noqa + Style = kernel.options._style # noqa + + uniform_arrow_length, arrows_and_extenders = \ + draw_dependencies_as_unicode_arrows( + printed_insn_order, fore=Fore, style=Style) + + leader = " " * uniform_arrow_length + lines = [] + current_inames = [set()] + + if uniform_arrow_length: + indent_level = [1] + else: + indent_level = [0] + + indent_increment = 2 + + iname_order = kernel._get_iname_order_for_printing() + + def add_pre_line(s): + lines.append(leader + " " * indent_level[0] + s) + + def add_main_line(s): + lines.append(arrows + " " * indent_level[0] + s) + + def add_post_line(s): + lines.append(extender + " " * indent_level[0] + s) + + def adapt_to_new_inames_list(new_inames): + added = [] + removed = [] + + # FIXME: Doesn't respect strict nesting + for iname in iname_order: + is_in_current = iname in 
current_inames[0] + is_in_new = iname in new_inames + + if is_in_new == is_in_current: + pass + elif is_in_new and not is_in_current: + added.append(iname) + elif not is_in_new and is_in_current: + removed.append(iname) + else: + assert False + + if removed: + indent_level[0] -= indent_increment * len(removed) + add_pre_line("end " + ", ".join(removed)) + if added: + add_pre_line("for " + ", ".join(added)) + indent_level[0] += indent_increment * len(added) + + current_inames[0] = new_inames + + for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders): + if isinstance(insn, lp.MultiAssignmentBase): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = str(insn.expression) + trailing = [] + elif isinstance(insn, lp.CInstruction): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = "CODE(%s|%s)" % ( + ", ".join(str(x) for x in insn.read_variables), + ", ".join("%s=%s" % (name, expr) + for name, expr in insn.iname_exprs)) + + trailing = [l for l in insn.code.split("\n")] + elif isinstance(insn, lp.BarrierInstruction): + lhs = "" + rhs = "... %sbarrier" % insn.synchronization_kind[0] + trailing = [] + + elif isinstance(insn, lp.NoOpInstruction): + lhs = "" + rhs = "... 
nop" + trailing = [] + + else: + raise LoopyError("unexpected instruction type: %s" + % type(insn).__name__) + + adapt_to_new_inames_list(kernel.insn_inames(insn)) + + options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL] + if insn.priority: + options.append("priority=%d" % insn.priority) + if insn.tags: + options.append("tags=%s" % ":".join(insn.tags)) + if isinstance(insn, lp.Assignment) and insn.atomicity: + options.append("atomic=%s" % ":".join( + str(a) for a in insn.atomicity)) + if insn.groups: + options.append("groups=%s" % ":".join(insn.groups)) + if insn.conflicts_with_groups: + options.append( + "conflicts=%s" % ":".join(insn.conflicts_with_groups)) + if insn.no_sync_with: + options.append("no_sync_with=%s" % ":".join( + "%s@%s" % entry for entry in sorted(insn.no_sync_with))) + if isinstance(insn, lp.BarrierInstruction) and \ + insn.synchronization_kind == 'local': + options.append('mem_kind=%s' % insn.mem_kind) + + if lhs: + core = "%s = %s" % ( + Fore.CYAN+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ) + else: + core = Fore.MAGENTA+rhs+Style.RESET_ALL + + options_str = " {%s}" % ", ".join(options) + + if insn.predicates: + # FIXME: precedence + add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates])) + indent_level[0] += indent_increment + + add_main_line(core + options_str) + + for t in trailing: + add_post_line(t) + + if insn.predicates: + indent_level[0] -= indent_increment + add_post_line("end") + + leader = extender + + adapt_to_new_inames_list([]) + + return lines # }}} @@ -1394,7 +1557,8 @@ def get_global_barrier_order(kernel): def is_barrier(my_insn_id): insn = kernel.id_to_insn[my_insn_id] from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and insn.kind == "global" + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" while unvisited: stack = [unvisited.pop()] @@ -1487,7 +1651,8 @@ def find_most_recent_global_barrier(kernel, 
insn_id): def is_barrier(my_insn_id): insn = kernel.id_to_insn[my_insn_id] from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and insn.kind == "global" + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" global_barrier_to_ordinal = dict( (b, i) for i, b in enumerate(global_barrier_order)) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ced1aaaa13ed8275c1e3a376d1c24895287b3239..ac7ac19887388649670154fcd36eba79ba3b4315 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,7 +30,7 @@ from loopy.diagnostic import ( import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames): from loopy.kernel.data import ( LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) + ConcurrentTag) for iname in inames: iname_tag = kernel.iname_to_tag.get(iname) @@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames): elif isinstance(iname_tag, LocalIndexTagBase): local_par.append(iname) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)): nonlocal_par.append(iname) else: @@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): if len(coeffs) == 0: try: scan_iname_aff.get_constant_val() - except: + except Exception: raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) # If this point is reached we're assuming the domain is of the form @@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=insn.within_inames | expr.inames, - within_inames_is_final=insn.within_inames_is_final) + 
within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) newly_generated_insn_id_set.add(get_args_insn_id) @@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return updated_inner_exprs def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, - within_inames_is_final): + within_inames_is_final, predicates): # FIXME: use make_temporaries from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=expr, depends_on=depends_on, within_inames=within_inames, - within_inames_is_final=within_inames_is_final) + within_inames_is_final=within_inames_is_final, + predicates=predicates) generated_insns.append(call_insn) @@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) reduction_insn_depends_on.add(get_args_insn_id) else: @@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr), depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates,) 
generated_insns.append(reduction_insn) @@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_insn) init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) @@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_neutral_insn) transfer_depends_on = set([init_neutral_id, init_id]) @@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) transfer_depends_on.add(get_args_insn_id) else: @@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")])) + no_sync_with=frozenset([(init_id, "any")]), + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) cur_size = 1 @@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(stage_insn) @@ -1398,7 +1414,9 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates, + ) generated_insns.append(init_insn) @@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) generated_insns.append(scan_insn) @@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on) + depends_on=init_insn_depends_on, + predicates=insn.predicates, + ) generated_insns.append(init_insn) transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on @@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(transfer_insn_depends_on), - no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) @@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) if cur_size == 1: # Performance 
hack: don't add a barrier here with transfer_insn. @@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(write_stage_insn) @@ -1928,7 +1955,7 @@ def find_idempotence(kernel): for insn in kernel.instructions) from collections import defaultdict - dep_graph = defaultdict(lambda: set()) + dep_graph = defaultdict(set) for insn in kernel.instructions: dep_graph[insn.id] = set(writer_id @@ -2020,7 +2047,8 @@ def limit_boostability(kernel): # }}} -preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None): # }}} if CACHING_ENABLED: - preprocess_cache[input_kernel] = kernel + preprocess_cache.store_if_not_present(input_kernel, kernel) return kernel diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 4281e50bd006a3cddf5a3cae0ffffe3d78abcfac..850f0a61fcdc2878d43895bc0e024032532aa680 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -84,14 +84,18 @@ class Barrier(ScheduleItem): A plain-text comment explaining why the barrier was inserted. - .. attribute:: kind + .. attribute:: synchronization_kind + + ``"local"`` or ``"global"`` + + .. attribute:: mem_kind ``"local"`` or ``"global"`` .. 
attribute:: originating_insn_id """ - hash_fields = ["comment", "kind"] + hash_fields = ["comment", "synchronization_kind", "mem_kind"] __slots__ = hash_fields + ["originating_insn_id"] # }}} @@ -206,13 +210,13 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag all_nonpar_inames = set([ iname for iname in kernel.all_inames() if not isinstance(kernel.iname_to_tag.get(iname), - (ParallelTag, IlpBaseTag, VectorizeTag))]) + (ConcurrentTag, IlpBaseTag, VectorizeTag))]) iname_to_insns = kernel.iname_to_insns() @@ -274,10 +278,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag): continue iname_dep = result.setdefault(iname, set()) @@ -308,7 +312,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue tag = kernel.iname_to_tag.get(dep_insn_iname) - if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)): + if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)): # Parallel tags don't really nest, so we'll disregard # them here. 
continue @@ -431,14 +435,19 @@ def format_insn(kernel, insn_id): from loopy.kernel.instruction import ( MultiAssignmentBase, NoOpInstruction, BarrierInstruction) if isinstance(insn, MultiAssignmentBase): - return "[%s] %s%s%s <- %s%s%s" % ( - format_insn_id(kernel, insn_id), + return "%s%s%s = %s%s%s {id=%s}" % ( Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, - Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, + format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): - return "[%s] %s... %sbarrier%s" % ( + mem_kind = '' + if insn.synchronization_kind == 'local': + mem_kind = '{mem_kind=%s}' % insn.mem_kind + + return "[%s] %s... %sbarrier%s%s" % ( format_insn_id(kernel, insn_id), - Fore.MAGENTA, insn.kind[0], Style.RESET_ALL) + Fore.MAGENTA, insn.synchronization_kind[0], mem_kind, + Style.RESET_ALL) elif isinstance(insn, NoOpInstruction): return "[%s] %s... nop%s" % ( format_insn_id(kernel, insn_id), @@ -456,11 +465,11 @@ def dump_schedule(kernel, schedule): from loopy.kernel.data import MultiAssignmentBase for sched_item in schedule: if isinstance(sched_item, EnterLoop): - lines.append(indent + "FOR %s" % sched_item.iname) + lines.append(indent + "for %s" % sched_item.iname) indent += " " elif isinstance(sched_item, LeaveLoop): indent = indent[:-4] - lines.append(indent + "END %s" % sched_item.iname) + lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( @@ -479,7 +488,8 @@ def dump_schedule(kernel, schedule): insn_str = sched_item.insn_id lines.append(indent + insn_str) elif isinstance(sched_item, Barrier): - lines.append(indent + "---BARRIER:%s---" % sched_item.kind) + lines.append(indent + "... 
%sbarrier" % + sched_item.synchronization_kind[0]) else: assert False @@ -833,7 +843,8 @@ def generate_loop_schedules_internal( # {{{ check if scheduler state allows insn scheduling from loopy.kernel.instruction import BarrierInstruction - if isinstance(insn, BarrierInstruction) and insn.kind == "global": + if isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global": if not sched_state.may_schedule_global_barriers: if debug_mode: print("can't schedule '%s' because global barriers are " @@ -1318,7 +1329,8 @@ def convert_barrier_instructions_to_barriers(kernel, schedule): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, BarrierInstruction): result.append(Barrier( - kind=insn.kind, + synchronization_kind=insn.synchronization_kind, + mem_kind=insn.mem_kind, originating_insn_id=insn.id, comment="Barrier inserted due to %s" % insn.id)) continue @@ -1415,8 +1427,8 @@ class DependencyTracker(object): raise ValueError("unknown 'var_kind': %s" % var_kind) from collections import defaultdict - self.writer_map = defaultdict(lambda: set()) - self.reader_map = defaultdict(lambda: set()) + self.writer_map = defaultdict(set) + self.reader_map = defaultdict(set) self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map() def map_to_base_storage(self, var_names): @@ -1577,7 +1589,8 @@ def _insn_ids_reaching_end(schedule, kind, reverse): # end # barrier() # end - if barrier_kind_more_or_equally_global(sched_item.kind, kind): + if barrier_kind_more_or_equally_global( + sched_item.synchronization_kind, kind): insn_ids_alive_at_scope[-1].clear() else: insn_ids_alive_at_scope[-1] |= set( @@ -1607,15 +1620,17 @@ def append_barrier_or_raise_error(schedule, dep, verify_only): tgt=dep.target.id, src=dep.source.id)) schedule.append(Barrier( comment=comment, - kind=dep.var_kind, + synchronization_kind=dep.var_kind, + mem_kind=dep.var_kind, originating_insn_id=None)) -def insert_barriers(kernel, schedule, kind, verify_only, level=0): +def 
insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0): """ - :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted. - Generally, this function will be called once for each kind of barrier - at the top level, where more global barriers should be inserted first. + :arg synchronization_kind: "local" or "global". + The :attr:`Barrier.synchronization_kind` to be inserted. Generally, this + function will be called once for each kind of barrier at the top level, where + more global barriers should be inserted first. :arg verify_only: do not insert barriers, only complain if they are missing. :arg level: the current level of loop nesting, 0 for outermost. @@ -1624,14 +1639,15 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # {{{ insert barriers at outermost scheduling level def insert_barriers_at_outer_level(schedule, reverse=False): - dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse) + dep_tracker = DependencyTracker(kernel, var_kind=synchronization_kind, + reverse=reverse) if reverse: # Populate the dependency tracker with sources from the tail end of # the schedule block. for insn_id in ( insn_ids_reaching_end_without_intervening_barrier( - schedule, kind)): + schedule, synchronization_kind)): dep_tracker.add_source(insn_id) result = [] @@ -1645,11 +1661,11 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): loop_head = ( insn_ids_reachable_from_start_without_intervening_barrier( - subloop, kind)) + subloop, synchronization_kind)) loop_tail = ( insn_ids_reaching_end_without_intervening_barrier( - subloop, kind)) + subloop, synchronization_kind)) # Checks if a barrier is needed before the loop. 
This handles # dependencies with targets that can be reached without an @@ -1688,7 +1704,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): elif isinstance(sched_item, Barrier): result.append(sched_item) - if barrier_kind_more_or_equally_global(sched_item.kind, kind): + if barrier_kind_more_or_equally_global( + sched_item.synchronization_kind, synchronization_kind): dep_tracker.discard_all_sources() i += 1 @@ -1724,7 +1741,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): if isinstance(sched_item, EnterLoop): subloop, new_i = gather_schedule_block(schedule, i) new_subloop = insert_barriers( - kernel, subloop[1:-1], kind, verify_only, level + 1) + kernel, subloop[1:-1], synchronization_kind, verify_only, + level + 1) result.append(subloop[0]) result.extend(new_subloop) result.append(subloop[-1]) @@ -1756,7 +1774,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): def generate_loop_schedules(kernel, debug_args={}): from pytools import MinRecursionLimit - with MinRecursionLimit(len(kernel.instructions) * 2): + with MinRecursionLimit(max(len(kernel.instructions) * 2, + len(kernel.all_inames()) * 4)): for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): yield sched @@ -1786,7 +1805,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag + from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag ilp_inames = set( iname for iname in kernel.all_inames() @@ -1797,7 +1816,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag)) parallel_inames = set( iname for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) loop_nest_with_map = find_loop_nest_with_map(kernel) 
loop_nest_around_map = find_loop_nest_around_map(kernel) @@ -1889,11 +1908,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if not kernel.options.disable_global_barriers: logger.debug("%s: barrier insertion: global" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, - kind="global", verify_only=True) + synchronization_kind="global", verify_only=True) logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, kind="local", - verify_only=False) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="local", verify_only=False) logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( @@ -1939,7 +1958,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): # }}} -schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, +schedule_cache = WriteOncePersistentDict( + "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -1970,7 +1990,7 @@ def get_one_scheduled_kernel(kernel): kernel.name, time()-start_time)) if CACHING_ENABLED and not from_cache: - schedule_cache[sched_cache_key] = result + schedule_cache.store_if_not_present(sched_cache_key, result) return result diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 1a0789c2f61e21e4a0371e2a73195c9771245527..5c41f03997e5193333f5be213f2f87d38147b6df 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -106,7 +106,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): [end_item]) elif isinstance(sched_item, Barrier): - if sched_item.kind == "global": + if sched_item.synchronization_kind == "global": # Wrap the current chunk into a kernel call. 
schedule_required_splitting = True if current_chunk: diff --git a/loopy/statistics.py b/loopy/statistics.py index 9b15ec471fb681698b85c1dd2f92376fbc731f00..72d0c6c7d7a634cd96379d17b7a91f6a638e0ab9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,7 +38,6 @@ __doc__ = """ .. currentmodule:: loopy -.. autoclass:: GuardedPwQPolynomial .. autoclass:: ToCountMap .. autoclass:: Op .. autoclass:: MemAccess @@ -50,6 +49,11 @@ __doc__ = """ .. autofunction:: gather_access_footprints .. autofunction:: gather_access_footprint_bytes +.. currentmodule:: loopy.statistics + +.. autoclass:: GuardedPwQPolynomial + +.. currentmodule:: loopy """ @@ -996,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): try: + if space is not None: + set = set.align_params(space) + return add_assumptions_guard(kernel, set.card()) except AttributeError: pass @@ -1410,7 +1417,8 @@ def get_synchronization_map(knl): iname_list.pop() elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % sched_item.kind: + result = result + ToCountMap({"barrier_%s" % + sched_item.synchronization_kind: get_count_poly(iname_list)}) elif isinstance(sched_item, CallKernel): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f1a494f30d469511817d204c0476ff79abe00e3b..2d31c63ef13774599de27ae871be64bc5acb7514 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -104,7 +104,9 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return TypeAnnotation(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child)) + + map_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript @@ -147,6 +149,11 @@ class WalkMapper(WalkMapperBase): self.rec(expr.expr, *args) + def map_type_cast(self, expr, *args): + if not self.visit(expr): + return + self.rec(expr.child, *args) + map_tagged_variable = WalkMapperBase.map_variable def 
map_loopy_function_identifier(self, expr, *args): @@ -219,6 +226,10 @@ class StringifyMapper(StringifyMapperBase): def map_rule_argument(self, expr, enclosing_prec): return "" % expr.index + def map_type_cast(self, expr, enclosing_prec): + from pymbolic.mapper.stringifier import PREC_NONE + return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -273,6 +284,9 @@ class DependencyMapper(DependencyMapperBase): map_linear_subscript = DependencyMapperBase.map_subscript + def map_type_cast(self, expr): + return self.rec(expr.child) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -398,6 +412,10 @@ class TypedCSE(p.CommonSubexpression): class TypeAnnotation(p.Expression): + """Undocumented for now. Currently only used internally around LHSs of + assignments that create temporaries. + """ + def __init__(self, type, child): super(TypeAnnotation, self).__init__() self.type = type @@ -406,9 +424,55 @@ class TypeAnnotation(p.Expression): def __getinitargs__(self): return (self.type, self.child) + def stringifier(self): + return StringifyMapper + mapper_method = intern("map_type_annotation") +class TypeCast(p.Expression): + """Only defined for numerical types with semantics matching + :meth:`numpy.ndarray.astype`. + + .. attribute:: child + + The expression to be cast. + """ + + def __init__(self, type, child): + super(TypeCast, self).__init__() + + from loopy.types import to_loopy_type, NumpyType + type = to_loopy_type(type) + + if (not isinstance(type, NumpyType) + or not issubclass(type.dtype.type, np.number)): + from loopy.diagnostic import LoopyError + raise LoopyError("TypeCast only supports numerical numpy types, " + "not '%s'" % type) + + # We're storing the type as a name for now to avoid + # numpy pickling bug madness. 
(see loopy.types) + self._type_name = type.dtype.name + self.child = child + + @property + def type(self): + from loopy.types import NumpyType + return NumpyType(np.dtype(self._type_name)) + + # init_arg_names is a misnomer--they're attribute names used for pickling. + init_arg_names = ("_type_name", "child") + + def __getinitargs__(self): + return (self._type_name, self.child) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_type_cast") + + class TaggedVariable(p.Variable): """This is an identifier with a tag, such as 'matrix$one', where 'one' identifies this specific use of the identifier. This mechanism @@ -1232,6 +1296,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): super(PwAffEvaluationMapper, self).__init__(context) def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + return self.pw_zero + expr def map_min(self, expr): @@ -1559,6 +1626,9 @@ class BatchedAccessRangeMapper(WalkMapper): def map_reduction(self, expr, inames): return WalkMapper.map_reduction(self, expr, inames | set(expr.inames)) + def map_type_cast(self, expr, inames): + return self.rec(expr.child, inames) + class AccessRangeMapper(object): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae322fc59c989cafd85122c8ca619c422..aac528087cf812a91553d416f166be898a1cd132 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -55,6 +55,7 @@ class TargetBase(object): comparison_fields = () def update_persistent_hash(self, key_hash, key_builder): + key_hash.update(type(self).__name__.encode()) for field_name in self.hash_fields: key_builder.rec(key_hash, getattr(self, field_name)) @@ -188,9 +189,10 @@ class ASTBuilderBase(object): def add_vector_access(self, access_expr, index): raise NotImplementedError() - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ - :arg kind: ``"local"`` or ``"global"`` + :arg 
synchronization_kind: ``"local"`` or ``"global"`` + :arg mem_kind: ``"local"`` or ``"global"`` """ raise NotImplementedError() @@ -210,6 +212,10 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + @property + def can_implement_conditionals(self): + return False + def emit_if(self, condition_str, ast): raise NotImplementedError() @@ -274,28 +280,6 @@ class DummyHostASTBuilder(ASTBuilderBase): def ast_block_scope_class(self): return _DummyASTBlock - def emit_assignment(self, codegen_state, insn): - return None - - def emit_multiple_assignment(self, codegen_state, insn): - return None - - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): - return None - - def emit_if(self, condition_str, ast): - return None - - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): - return None - - def emit_blank_line(self): - return None - - def emit_comment(self, s): - return None - # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 0e1f0ff86ca5eaa1932f766b3f8b79f5167ce6f4..423311cdb259c77e77070f5fc27a542dd2c89fc9 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,12 +27,14 @@ THE SOFTWARE. 
import six import numpy as np # noqa +from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper +from loopy.types import NumpyType import pymbolic.primitives as p from pytools import memoize_method @@ -315,9 +317,75 @@ class _ConstRestrictPointer(Pointer): return sub_tp, ("*const __restrict__ %s" % sub_decl) +class _ConstPointer(Pointer): + def get_decl_pait(self): + sub_tp, sub_decl = self.subdecl.get_decl_pair() + return sub_tp, ("*const %s" % sub_decl) + + +# {{{ symbol mangler + +def c_symbol_mangler(kernel, name): + # float NAN as defined in C99 standard + if name == "NAN": + return NumpyType(np.dtype(np.float32)), name + return None + +# }}} + + +# {{{ function mangler + +def c_function_mangler(target, name, arg_dtypes): + # convert abs(), min(), max() to fabs(), fmin(), fmax() to comply with + # C99 standard + if not isinstance(name, str): + return None + + if (name == "abs" + and len(arg_dtypes) == 1 + and arg_dtypes[0].numpy_dtype.kind == "f"): + return CallMangleInfo( + target_name="fabs", + result_dtypes=arg_dtypes, + arg_dtypes=arg_dtypes) + + if name in ["max", "min"] and len(arg_dtypes) == 2: + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_dtypes]) + + if dtype.kind == "c": + raise RuntimeError("min/max do not support complex numbers") + + if dtype.kind == "f": + name = "f" + name + + result_dtype = NumpyType(dtype) + return CallMangleInfo( + target_name=name, + result_dtypes=(result_dtype,), + arg_dtypes=2*(result_dtype,)) + + return None + +# }}} + + class CASTBuilder(ASTBuilderBase): # {{{ library + def function_manglers(self): + return ( + super(CASTBuilder, self).function_manglers() + [ + c_function_mangler + ]) + + def 
symbol_manglers(self): + return ( + super(CASTBuilder, self).symbol_manglers() + [ + c_symbol_mangler + ]) + def preamble_generators(self): return ( super(CASTBuilder, self).preamble_generators() + [ @@ -344,7 +412,16 @@ class CASTBuilder(ASTBuilderBase): result = [] from loopy.kernel.data import temp_var_scope - + from loopy.schedule import CallKernel + # We only need to write declarations for global variables with + # the first device program. `is_first_dev_prog` determines + # whether this is the first device program in the schedule. + is_first_dev_prog = True + for i in range(schedule_index): + if isinstance(kernel.schedule[i], CallKernel): + is_first_dev_prog = False + break + if is_first_dev_prog: for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): @@ -421,6 +498,15 @@ class CASTBuilder(ASTBuilderBase): base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line + # Getting the temporary variables that are needed for the current + # sub-kernel. 
+ from loopy.schedule.tools import ( + temporaries_read_in_subkernel, + temporaries_written_in_subkernel) + subkernel = kernel.schedule[schedule_index].kernel_name + sub_knl_temps = ( + temporaries_read_in_subkernel(kernel, subkernel) | + temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( six.itervalues(kernel.temporary_variables), @@ -430,7 +516,8 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations - if tv.scope != temp_var_scope.GLOBAL: + if tv.scope != temp_var_scope.GLOBAL and ( + tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), @@ -470,13 +557,17 @@ class CASTBuilder(ASTBuilderBase): temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.scope) + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? 
+ ptrtype = _ConstRestrictPointer - cast_decl = _ConstRestrictPointer(cast_decl) - temp_var_decl = _ConstRestrictPointer(temp_var_decl) + cast_decl = ptrtype(cast_decl) + temp_var_decl = ptrtype(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( @@ -797,6 +888,10 @@ class CASTBuilder(ASTBuilderBase): from cgen import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from cgen import If return If(condition_str, ast) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8f924d3aee3b9f2982006fdb7b558cccac6785e3..caee73eb1c3320f03ceac66e55e8f5c0bfadbbc2 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -29,8 +29,10 @@ import numpy as np from pymbolic.mapper import RecursiveMapper, IdentityMapper from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, - PREC_POWER, - PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND) + PREC_POWER, PREC_SHIFT, + PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND, + PREC_BITWISE_AND, PREC_BITWISE_OR) + import islpy as isl import pymbolic.primitives as p from pymbolic import var @@ -338,6 +340,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): expr.operator, self.rec(expr.right, inner_type_context)) + def map_type_cast(self, expr, type_context): + registry = self.codegen_state.ast_builder.target.get_dtype_registry() + cast = var("(%s)" % registry.dtype_to_ctype(expr.type)) + return cast(self.rec(expr.child, type_context)) + def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): try: @@ -782,6 +789,16 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_literal(self, expr, enclosing_prec): return expr.s + def map_left_shift(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" << ", (expr.shiftee, expr.shift), PREC_SHIFT), + enclosing_prec, 
PREC_SHIFT) + + def map_right_shift(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" >> ", (expr.shiftee, expr.shift), PREC_SHIFT), + enclosing_prec, PREC_SHIFT) + def map_logical_not(self, expr, enclosing_prec): return self.parenthesize_if_needed( "!" + self.rec(expr.child, PREC_UNARY), @@ -807,6 +824,21 @@ class CExpressionToCodeMapper(RecursiveMapper): result = "(%s)" % result return result + def map_bitwise_not(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + "~" + self.rec(expr.child, PREC_UNARY), + enclosing_prec, PREC_UNARY) + + def map_bitwise_and(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" & ", expr.children, PREC_BITWISE_AND), + enclosing_prec, PREC_BITWISE_AND) + + def map_bitwise_or(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" | ", expr.children, PREC_BITWISE_OR), + enclosing_prec, PREC_BITWISE_OR) + def map_sum(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_SUM diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f..027f27838bf68511905bd34cf75d0b361c749629 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -291,18 +291,19 @@ class CUDACASTBuilder(CASTBuilder): def add_vector_access(self, access_expr, index): return access_expr.a(self._VEC_AXES[index]) - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ :arg kind: ``"local"`` or ``"global"`` + :arg memkind: unused :return: a :class:`loopy.codegen.GeneratedInstruction`. 
""" - if kind == "local": + if synchronization_kind == "local": if comment: comment = " /* %s */" % comment from cgen import Statement return Statement("__syncthreads()%s" % comment) - elif kind == "global": + elif synchronization_kind == "global": raise LoopyError("CUDA does not have global barriers") else: raise LoopyError("unknown barrier kind") diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 61788df2dd9d32978a550990fb7c84501f76e856..2909f16f56315b136f4f2677348bfe0c3e5553b4 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -30,6 +30,13 @@ from loopy.diagnostic import LoopyError from pytools.py_codegen import ( Indentation, PythonFunctionGenerator) +import logging +logger = logging.getLogger(__name__) + +from pytools.persistent_dict import WriteOncePersistentDict +from loopy.tools import LoopyKeyBuilder +from loopy.version import DATA_MODEL_VERSION + # {{{ object array argument packing @@ -419,7 +426,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ allocate written arrays, if needed if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and arg.shape is not None: + and arg.shape is not None \ + and all(si is not None for si in arg.shape): if not isinstance(arg.dtype, NumpyType): raise LoopyError("do not know how to pass arg of type '%s'" @@ -653,6 +661,11 @@ class _Kernels(object): pass +typed_and_scheduled_cache = WriteOncePersistentDict( + "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + # {{{ kernel executor class KernelExecutorBase(object): @@ -716,6 +729,31 @@ class KernelExecutorBase(object): return kernel + @memoize_method + def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + from loopy import CACHING_ENABLED + + from loopy.preprocess import prepare_for_caching + # prepare_for_caching() gets run by preprocess, but the kernel at this + # stage is not guaranteed to be preprocessed. 
+ cacheable_kernel = prepare_for_caching(self.kernel) + cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + + if CACHING_ENABLED: + try: + return typed_and_scheduled_cache[cache_key] + except KeyError: + pass + + logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + + kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + + if CACHING_ENABLED: + typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) + + return kernel + def arg_to_dtype_set(self, kwargs): if not self.has_runtime_typed_args: return None diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 35dade90494906b61aad9eb66e7271f2c5d1e180..45a59847ba9f175df5ca1be46aa78566b2aab03b 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -288,15 +288,15 @@ class ISPCASTBuilder(CASTBuilder): def add_vector_access(self, access_expr, index): return access_expr[index] - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): from cgen import Comment, Statement assert comment - if kind == "local": + if synchronization_kind == "local": return Comment("local barrier: %s" % comment) - elif kind == "global": + elif synchronization_kind == "global": return Statement("sync; /* %s */" % comment) else: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a5f7562c41c3ec8eca673904550e078d2a992241..2763caace891570a1b7f8b13f225001a03d3aa65 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -167,30 +167,6 @@ def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None - if (name == "abs" - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): - return CallMangleInfo( - target_name="fabs", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise 
RuntimeError("min/max do not support complex numbers") - - if dtype.kind == "f": - name = "f" + name - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - if name == "dot": scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] return CallMangleInfo( @@ -450,18 +426,20 @@ class OpenCLCASTBuilder(CASTBuilder): # The 'int' avoids an 'L' suffix for long ints. return access_expr.attr("s%s" % hex(int(index))[2:]) - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ :arg kind: ``"local"`` or ``"global"`` :return: a :class:`loopy.codegen.GeneratedInstruction`. """ - if kind == "local": + if synchronization_kind == "local": if comment: comment = " /* %s */" % comment + mem_kind = mem_kind.upper() + from cgen import Statement - return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment) - elif kind == "global": + return Statement("barrier(CLK_%s_MEM_FENCE)%s" % (mem_kind, comment)) + elif synchronization_kind == "global": raise LoopyError("OpenCL does not have global barriers") else: raise LoopyError("unknown barrier kind") diff --git a/loopy/target/python.py b/loopy/target/python.py index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from genpy import If return If(condition_str, ast) diff --git a/loopy/tools.py b/loopy/tools.py index 56b673b597fc3bf43a6b03f87607ea8d3db0866a..d6952d54782f113685299641c828907fb7f32a46 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import collections import numpy as np from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from loopy.symbolic import WalkMapper as LoopyWalkMapper @@ -50,7 +51,12 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): See also :meth:`LoopyKeyBuilder.update_for_pymbolic_expression`. """ - # + def map_reduction(self, expr, *args): + if not self.visit(expr): + return + + self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) + self.rec(expr.expr, *args) class LoopyKeyBuilder(KeyBuilderBase): @@ -113,6 +119,53 @@ class PymbolicExpressionHashWrapper(object): # }}} +# {{{ eq key builder + +class LoopyEqKeyBuilder(object): + """Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in + equality comparison, such that `key(a) == key(b)` if and only if `a == b`. + The types of objects being compared should satisfy structural equality. + + The output is suitable for use with :class:`loopy.tools.LoopyKeyBuilder` + provided all fields are persistent hashable. + + As an optimization, top-level pymbolic expression fields are stringified for + faster comparisons / hash calculations. + + Usage:: + + kb = LoopyEqKeyBuilder() + kb.update_for_class(insn.__class__) + kb.update_for_field("field", insn.field) + ... + key = kb.key() + + """ + + def __init__(self): + self.field_dict = {} + + def update_for_class(self, class_): + self.class_ = class_ + + def update_for_field(self, field_name, value): + self.field_dict[field_name] = value + + def update_for_pymbolic_field(self, field_name, value): + self.field_dict[field_name] = str(value).encode("utf-8") + + def key(self): + return (self.class_.__name__.encode("utf-8"), self.field_dict) + + def hash_key(self): + """Similar to key(), but excludes field names for faster hashing. 
+ """ + return (self.class_.__name__.encode("utf-8"),) + tuple( + self.field_dict[k] for k in sorted(self.field_dict.keys())) + +# }}} + + # {{{ remove common indentation def remove_common_indentation(code, require_leading_newline=True, @@ -340,23 +393,19 @@ def compute_sccs(graph): # }}} -# {{{ lazily unpickling dictionary - +# {{{ pickled container value -class _PickledObjectWrapper(object): - """ - A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`). +class _PickledObject(object): + """A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and + :class:`LazilyUnpicklingList`). """ - @classmethod - def from_object(cls, obj): - if isinstance(obj, cls): - return obj - from pickle import dumps - return cls(dumps(obj)) - - def __init__(self, objstring): - self.objstring = objstring + def __init__(self, obj): + if isinstance(obj, _PickledObject): + self.objstring = obj.objstring + else: + from pickle import dumps + self.objstring = dumps(obj) def unpickle(self): from pickle import loads @@ -366,12 +415,35 @@ class _PickledObjectWrapper(object): return {"objstring": self.objstring} -import collections +class _PickledObjectWithEqAndPersistentHashKeys(_PickledObject): + """Like :class:`_PickledObject`, with two additional attributes: + * `eq_key` + * `persistent_hash_key` -class LazilyUnpicklingDictionary(collections.MutableMapping): + This allows for comparison and for persistent hashing without unpickling. """ - A dictionary-like object which lazily unpickles its values. 
+ + def __init__(self, obj, eq_key, persistent_hash_key): + _PickledObject.__init__(self, obj) + self.eq_key = eq_key + self.persistent_hash_key = persistent_hash_key + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.persistent_hash_key) + + def __getstate__(self): + return {"objstring": self.objstring, + "eq_key": self.eq_key, + "persistent_hash_key": self.persistent_hash_key} + +# }}} + + +# {{{ lazily unpickling dictionary + +class LazilyUnpicklingDict(collections.MutableMapping): + """A dictionary-like object which lazily unpickles its values. """ def __init__(self, *args, **kwargs): @@ -379,7 +451,7 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): def __getitem__(self, key): value = self._map[key] - if isinstance(value, _PickledObjectWrapper): + if isinstance(value, _PickledObject): value = self._map[key] = value.unpickle() return value @@ -397,12 +469,105 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): def __getstate__(self): return {"_map": dict( - (key, _PickledObjectWrapper.from_object(val)) + (key, _PickledObject(val)) for key, val in six.iteritems(self._map))} # }}} +# {{{ lazily unpickling list + +class LazilyUnpicklingList(collections.MutableSequence): + """A list which lazily unpickles its values.""" + + def __init__(self, *args, **kwargs): + self._list = list(*args, **kwargs) + + def __getitem__(self, key): + item = self._list[key] + if isinstance(item, _PickledObject): + item = self._list[key] = item.unpickle() + return item + + def __setitem__(self, key, value): + self._list[key] = value + + def __delitem__(self, key): + del self._list[key] + + def __len__(self): + return len(self._list) + + def insert(self, key, value): + self._list.insert(key, value) + + def __getstate__(self): + return {"_list": [_PickledObject(val) for val in self._list]} + + +class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList): + """A list which lazily unpickles its values, and 
supports equality comparison + and persistent hashing without unpickling. + + Persistent hashing only works in conjunction with :class:`LoopyKeyBuilder`. + + Equality comparison and persistent hashing are implemented by supplying + functions `eq_key_getter` and `persistent_hash_key_getter` to the + constructor. These functions should return keys that can be used in place of + the original object for the respective purposes of equality comparison and + persistent hashing. + """ + + def __init__(self, *args, **kwargs): + self.eq_key_getter = kwargs.pop("eq_key_getter") + self.persistent_hash_key_getter = kwargs.pop("persistent_hash_key_getter") + LazilyUnpicklingList.__init__(self, *args, **kwargs) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_list(key_hash, self._list) + + def _get_eq_key(self, obj): + if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys): + return obj.eq_key + return self.eq_key_getter(obj) + + def _get_persistent_hash_key(self, obj): + if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys): + return obj.persistent_hash_key + return self.persistent_hash_key_getter(obj) + + def __eq__(self, other): + if not isinstance(other, (list, LazilyUnpicklingList)): + return NotImplemented + + if isinstance(other, LazilyUnpicklingList): + other = other._list + + if len(self) != len(other): + return False + + for a, b in zip(self._list, other): + if self._get_eq_key(a) != self._get_eq_key(b): + return False + + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __getstate__(self): + return {"_list": [ + _PickledObjectWithEqAndPersistentHashKeys( + val, + self._get_eq_key(val), + self._get_persistent_hash_key(val)) + for val in self._list], + "eq_key_getter": self.eq_key_getter, + "persistent_hash_key_getter": self.persistent_hash_key_getter} + +# }}} + + def is_interned(s): return s is None or intern(s) is s @@ -411,4 +576,19 @@ def intern_frozenset_of_ids(fs): return 
frozenset(intern(s) for s in fs) +def natorder(key): + # Return natural ordering for strings, as opposed to dictionary order. + # E.g. will result in + # 'abc1' < 'abc9' < 'abc10' + # rather than + # 'abc1' < 'abc10' < 'abc9' + # Based on + # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 + import re + return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] + + +def natsorted(seq, key=lambda x: x): + return sorted(seq, key=lambda y: natorder(key(y))) + # vim: foldmethod=marker diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbbd56e906c5e622debcd82bd5368aa3b1fb34c --- /dev/null +++ b/loopy/transform/add_barrier.py @@ -0,0 +1,87 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from loopy.kernel.instruction import BarrierInstruction +from loopy.match import parse_match +from loopy.transform.instruction import add_dependency + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: add_barrier +""" + + +# {{{ add_barrier + +def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, + tags=None, synchronization_kind="global", mem_kind=None): + """Takes in a kernel that needs to be added a barrier and returns a kernel + which has a barrier inserted into it. It takes input of 2 instructions and + then adds a barrier in between those 2 instructions. The expressions can + be any inputs that are understood by :func:`loopy.match.parse_match`. + + :arg insn_before: String expression that specifies the instruction(s) + before the barrier which is to be added + :arg insn_after: String expression that specifies the instruction(s) after + the barrier which is to be added + :arg id: String on which the id of the barrier would be based on. + :arg tags: The tag of the group to which the barrier must be added + :arg synchronization_kind: Kind of barrier to be added. May be "global" or + "local" + :arg kind: Type of memory to be synchronied. May be "global" or "local". Ignored + for "global" bariers. 
If not supplied, defaults to :arg:`synchronization_kind` + """ + + if mem_kind is None: + mem_kind = synchronization_kind + + if id_based_on is None: + id = knl.make_unique_instruction_id( + based_on=synchronization_kind[0]+"_barrier") + else: + id = knl.make_unique_instruction_id(based_on=id_based_on) + + match = parse_match(insn_before) + insn_before_list = [insn.id for insn in knl.instructions if match(knl, + insn)] + + barrier_to_add = BarrierInstruction(depends_on=frozenset(insn_before_list), + depends_on_is_final=True, + id=id, + tags=tags, + synchronization_kind=synchronization_kind, + mem_kind=mem_kind) + + new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) + new_knl = add_dependency(kernel=new_knl, + insn_match=insn_after, + depends_on="id:"+id) + + return new_knl + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -239,14 +239,14 @@ class ArrayToBufferMap(object): non1_storage_axis_flags = [] non1_storage_shape = [] - for saxis, bi, l in zip( + for saxis, bi, saxis_len in zip( storage_axis_names, storage_base_indices, storage_shape): - has_length_non1 = l != 1 + has_length_non1 = saxis_len != 1 non1_storage_axis_flags.append(has_length_non1) if has_length_non1: - non1_storage_shape.append(l) + non1_storage_shape.append(saxis_len) # }}} diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, SubstitutionRuleMappingContext, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools.persistent_dict import 
PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError @@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): # }}} -buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION, +buffer_array_cache = WriteOncePersistentDict( + "loopy-buffer-array-cache-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching - buffer_array_cache[cache_key] = prepare_for_caching(kernel) + buffer_array_cache.store_if_not_present( + cache_key, prepare_for_caching(kernel)) return kernel diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644 --- a/loopy/transform/ilp.py +++ b/loopy/transform/ilp.py @@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper class ExtraInameIndexInserter(IdentityMapper): def __init__(self, var_to_new_inames): self.var_to_new_inames = var_to_new_inames + self.seen_ilp_inames = set() def map_subscript(self, expr): try: @@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper): index = (index,) index = tuple(self.rec(i) for i in index) + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.aggregate.index(index + new_idx) def map_variable(self, expr): @@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper): except KeyError: return expr else: + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.index(new_idx) @@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None): # }}} from pymbolic import var - eiii = ExtraInameIndexInserter( - dict((var_name, tuple(var(iname) for iname in inames)) - for var_name, inames in 
six.iteritems(var_to_new_ilp_inames))) - - new_insns = [ - insn.with_transformed_expressions(eiii) - for insn in kernel.instructions] + var_to_extra_iname = dict( + (var_name, tuple(var(iname) for iname in inames)) + for var_name, inames in six.iteritems(var_to_new_ilp_inames)) + + new_insns = [] + + for insn in kernel.instructions: + eiii = ExtraInameIndexInserter(var_to_extra_iname) + new_insn = insn.with_transformed_expressions(eiii) + if not eiii.seen_ilp_inames <= insn.within_inames: + + from loopy.diagnostic import warn_with_kernel + warn_with_kernel( + kernel, + "implicit_ilp_iname", + "Instruction '%s': touched variable that (for ILP) " + "required iname(s) '%s', but that the instruction was not " + "previously within the iname(s). Previously, this would " + "implicitly promote the instruction, but that behavior is " + "deprecated and will stop working in 2018.1." + % (insn.id, ", ".join( + eiii.seen_ilp_inames - insn.within_inames))) + + new_insns.append(new_insn) return kernel.copy( temporary_variables=new_temp_vars, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag] - from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase, + from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase, ForceSequentialTag) # {{{ globbing @@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): if iname not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % iname) - if isinstance(new_tag, ParallelTag) \ + if isinstance(new_tag, ConcurrentTag) \ and isinstance(old_tag, ForceSequentialTag): raise ValueError("cannot tag '%s' as parallel--" "iname requires 
sequential execution" % iname) if isinstance(new_tag, ForceSequentialTag) \ - and isinstance(old_tag, ParallelTag): + and isinstance(old_tag, ConcurrentTag): raise ValueError("'%s' is already tagged as parallel, " "but is now prohibited from being parallel " "(likely because of participation in a precompute or " @@ -972,9 +972,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_deps): # Check whether this iname has a parallel tag and discard it if so - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag if (iname in knl.iname_to_tag - and isinstance(knl.iname_to_tag[iname], ParallelTag)): + and isinstance(knl.iname_to_tag[iname], ConcurrentTag)): continue # If we find a duplication option and fo not use boostable_into diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 2be78f8e5c25a3b48c195f52715f9d6453100e3b..37c5d85a1ade5c8f7fadb2c6a785cf7cea3dde40 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -301,4 +301,39 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): # }}} +# {{{ uniquify_instruction_ids + +def uniquify_instruction_ids(kernel): + """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique + strings. + + This function does *not* deduplicate existing instruction ids. 
+ """ + + from loopy.kernel.creation import UniqueName + + insn_ids = set( + insn.id for insn in kernel.instructions + if insn.id is not None and not isinstance(insn.id, UniqueName)) + + from pytools import UniqueNameGenerator + insn_id_gen = UniqueNameGenerator(insn_ids) + + new_instructions = [] + + for insn in kernel.instructions: + if insn.id is None: + new_instructions.append( + insn.copy(id=insn_id_gen("insn"))) + elif isinstance(insn.id, UniqueName): + new_instructions.append( + insn.copy(id=insn_id_gen(insn.id.name))) + else: + new_instructions.append(insn) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 6077332c4fc4322ac7ffb02ade4a0e24c7066245..4755ca1774a15480a2c6b255380dd724e47f9042 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -811,7 +811,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, barrier_insn = BarrierInstruction( id=barrier_insn_id, depends_on=frozenset([compute_insn_id]), - kind="global") + synchronization_kind="global", + mem_kind="global") compute_dep_id = barrier_insn_id added_compute_insns.append(barrier_insn) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..b53488b486c6750742b269f47cfd4f08b8f8fab9 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -351,7 +351,8 @@ class TemporarySaver(object): self.subkernel_to_slice_indices[subkernel]) def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" + return isinstance(item, Barrier) and \ + item.synchronization_kind == "global" try: pre_barrier = next(item for item in @@ -402,13 +403,13 @@ class TemporarySaver(object): continue from loopy.kernel.data import ( - GroupIndexTag, LocalIndexTag, ParallelTag) + GroupIndexTag, LocalIndexTag, ConcurrentTag) if isinstance(tag, GroupIndexTag): 
my_group_tags.append(tag) elif isinstance(tag, LocalIndexTag): my_local_tags.append(tag) - elif isinstance(tag, ParallelTag): + elif isinstance(tag, ConcurrentTag): raise LoopyError( "iname '%s' is tagged with '%s' - only " "group and local tags are supported for " diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six from loopy.symbolic import ( get_dependencies, SubstitutionMapper, @@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper = CallbackMapper(gather_exprs, WalkMapper()) for insn in kernel.instructions: + dfmapper(insn.assignees) dfmapper(insn.expression) for sr in six.itervalues(kernel.substitutions): @@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] for insn in kernel.instructions: - new_expr = cbmapper(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) + new_insns.append(insn.with_transformed_expressions(cbmapper)) from loopy.kernel.data import SubstitutionRule new_substs = { diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 409cbbc5ebd5feb13b04eeba1671f639663bfcf1..6ffc1dff5220ab48c6c87ec29fec6e44d57ba133 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -237,6 +237,12 @@ class TypeInferenceMapper(CombineMapper): else: raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) + def map_type_cast(self, expr): + subtype, = self.rec(expr.child) + if not issubclass(subtype.dtype.type, np.number): + raise 
LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) + return [expr.type] + def map_subscript(self, expr): return self.rec(expr.aggregate) diff --git a/loopy/version.py b/loopy/version.py index 02244f55d0dbf207a4641c3ebf6cc33b536f0421..e142162729d5a374082fa853dcc763665f7dfe33 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ -VERSION = (2016, 2) +VERSION = (2017, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v69-islpy%s" % _islpy_version diff --git a/requirements.txt b/requirements.txt index 3ff69a123d10cc7bc6799ebfb8913bfd0eed839e..1a23022821116aea068b76eab72f9a5596694eea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ git+https://github.com/inducer/pytools.git git+https://github.com/inducer/islpy.git git+https://github.com/inducer/cgen.git -git+https://github.com/pyopencl/pyopencl.git +git+https://github.com/inducer/pyopencl.git git+https://github.com/inducer/pymbolic.git git+https://github.com/inducer/genpy.git git+https://github.com/inducer/codepy.git diff --git a/setup.py b/setup.py index b8bc17d888aae8409000c936b487afb94a5250d0..b8f36d12559f05a47ef57dd06efd4761e3b3ad9a 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2017.3", + "pytools>=2017.6", "pymbolic>=2016.2", "genpy>=2016.1.2", "cgen>=2016.1", diff --git a/test/test_fortran.py b/test/test_fortran.py index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames): logging.basicConfig(level=logging.INFO) fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 
a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + b(k,j)*a(i,k) end do end do @@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.split_iname(knl, "k", 32) knl = lp.assume(knl, "n mod 32 = 0") knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "l mod 16 = 0") + knl = lp.assume(knl, "ell mod 16 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames): init_expression="0", store_expression="base+buffer") ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels(): def test_precompute_some_exist(ctx_factory): fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + b(k,j)*a(i,k) end do end do @@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory): knl = lp.split_iname(knl, "k", 8) knl = lp.assume(knl, "n mod 8 = 0") knl = lp.assume(knl, "m mod 8 = 0") - knl = lp.assume(knl, "l mod 8 = 0") + knl = lp.assume(knl, "ell mod 8 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory): ref_knl = knl ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) if __name__ == "__main__": diff 
--git a/test/test_linalg.py b/test/test_linalg.py index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory): n = get_suitable_size(ctx) m = n+12 - l = m+12 + ell = m+12 knl = lp.make_kernel( - "{[i,k,j]: 0<=i gid = i/256 + start = gid*256 + for j + a[start + j] = a[start + j] + j + end + end + """, + seq_dependencies=True, + name="uniform_l", + target=PyOpenCLTarget(), + assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa + # }}} @@ -1064,6 +1115,28 @@ def test_literal_local_barrier(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_local_barrier_mem_kind(): + def __test_type(mtype, expected): + insn = '... lbarrier' + if mtype: + insn += '{mem_kind=%s}' % mtype + knl = lp.make_kernel( + "{ [i]: 0<=i {[i]: 0 <= i < n}", + """ + <>tmp[i] = i + tmp[0] = 0 + """, + fixed_parameters=dict(n=1)) + + knl(queue) + + +def test_parameter_inference(): + knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") + assert knl.all_params() == set(["n"]) + + def test_execution_backend_can_cache_dtypes(ctx_factory): # When the kernel is invoked, the execution backend uses it as a cache key # for the type inference and scheduling cache. 
This tests to make sure that @@ -2431,6 +2593,167 @@ test_execution_backend_can_cache_dtypes(ctx_factory): knl(queue) +def test_preamble_with_separate_temporaries(ctx_factory): + from loopy.kernel.data import temp_var_scope as scopes + # create a function mangler + + func_name = 'indirect' + func_arg_dtypes = (np.int32, np.int32, np.int32) + func_result_dtypes = (np.int32,) + + def __indirectmangler(kernel, name, arg_dtypes): + """ + A function that will return a :class:`loopy.kernel.data.CallMangleInfo` + to interface with the calling :class:`loopy.LoopKernel` + """ + if name != func_name: + return None + + from loopy.types import to_loopy_type + from loopy.kernel.data import CallMangleInfo + + def __compare(d1, d2): + # compare dtypes ignoring atomic + return to_loopy_type(d1, for_atomic=True) == \ + to_loopy_type(d2, for_atomic=True) + + # check types + if len(func_arg_dtypes) != len(arg_dtypes): + raise Exception('Unexpected number of arguments provided to mangler ' + '{}, expected {}, got {}'.format( + func_name, len(func_arg_dtypes), len(arg_dtypes))) + + for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)): + if not __compare(d1, d2): + raise Exception('Argument at index {} for mangler {} does not ' + 'match expected dtype. Expected {}, got {}'.
+ format(i, func_name, str(d1), str(d2))) + + # get target for creation + target = arg_dtypes[0].target + return CallMangleInfo( + target_name=func_name, + result_dtypes=tuple(to_loopy_type(x, target=target) for x in + func_result_dtypes), + arg_dtypes=arg_dtypes) + + # create the preamble generator + def create_preamble(arr): + def __indirectpreamble(preamble_info): + # find a function matching our name + func_match = next( + (x for x in preamble_info.seen_functions + if x.name == func_name), None) + desc = 'custom_funcs_indirect' + if func_match is not None: + from loopy.types import to_loopy_type + # check types + if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \ + func_match.arg_dtypes: + # if match, create our temporary + var = lp.TemporaryVariable( + 'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape, + scope=scopes.GLOBAL, read_only=True) + # and code + code = """ + int {name}(int start, int end, int match) + {{ + int result = start; + for (int i = start + 1; i < end; ++i) + {{ + if (lookup[i] == match) + result = i; + }} + return result; + }} + """.format(name=func_name) + + # generate temporary variable code + from cgen import Initializer + from loopy.target.c import generate_array_literal + codegen_state = preamble_info.codegen_state.copy( + is_generating_device_code=True) + kernel = preamble_info.kernel + ast_builder = codegen_state.ast_builder + target = kernel.target + decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype) + decl = ast_builder.wrap_global_constant( + ast_builder.get_temporary_decl( + codegen_state, None, var, + decl_info)) + if var.initializer is not None: + decl = Initializer(decl, generate_array_literal( + codegen_state, var, var.initializer)) + # return generated code + yield (desc, '\n'.join([str(decl), code])) + return __indirectpreamble + + # and finally create a test + n = 10 + # for each entry come up with a random number of data points + num_data = np.asarray(np.random.randint(2, 10, size=n), 
dtype=np.int32) + # turn into offsets + offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32) + # create lookup data + lookup = np.empty(0) + for i in num_data: + lookup = np.hstack((lookup, np.arange(i))) + lookup = np.asarray(lookup, dtype=np.int32) + # and create data array + data = np.random.rand(np.product(num_data)) + + # make kernel + kernel = lp.make_kernel('{[i]: 0 <= i < n}', + """ + for i + <>ind = indirect(offsets[i], offsets[i + 1], 1) + out[i] = data[ind] + end + """, + [lp.GlobalArg('out', shape=('n',)), + lp.TemporaryVariable( + 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL, + read_only=True), + lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + ) + # fixt params, and add manglers / preamble + kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)]) + kernel = lp.register_function_manglers(kernel, [__indirectmangler]) + + print(lp.generate_code(kernel)[0]) + # and call (functionality unimportant, more that it compiles) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + # check that it actually performs the lookup correctly + assert np.allclose(kernel( + queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + + +def test_add_prefetch_works_in_lhs_index(): + knl = lp.make_kernel( + "{ [n,k,l,k1,l1,k2,l2]: " + "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l] + a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1 + a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2] + end + """, + [ + lp.GlobalArg("a1,a1_out", None, "ndofs,2"), + lp.GlobalArg("a1_map", None, "nelements,3"), + "..." 
+ ]) + + knl = lp.add_prefetch(knl, "a1_map", "k") + + from loopy.symbolic import get_dependencies + for insn in knl.instructions: + assert "a1_map" not in get_dependencies(insn.assignees) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_misc.py b/test/test_misc.py index a22e424630255df4225586eeb9f0d62a03d5318f..0273948b38b28b85e42a600bffb65fbf86dcc554 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -92,26 +92,36 @@ def test_SetTrie(): s.add_or_update(set([1, 4])) -class PicklableItem(object): +class PickleDetector(object): + """Contains a class attribute which flags if any instance was unpickled. + """ - flags = {"unpickled": False} + @classmethod + def reset(cls): + cls.instance_unpickled = False def __getstate__(self): - return True + return {"state": self.state} def __setstate__(self, state): - PicklableItem.flags["unpickled"] = True + self.__class__.instance_unpickled = True + self.state = state["state"] -def test_LazilyUnpicklingDictionary(): - def is_unpickled(): - return PicklableItem.flags["unpickled"] +class PickleDetectorForLazilyUnpicklingDict(PickleDetector): + instance_unpickled = False - from loopy.tools import LazilyUnpicklingDictionary + def __init__(self): + self.state = None - mapping = LazilyUnpicklingDictionary({0: PicklableItem()}) - assert not is_unpickled() +def test_LazilyUnpicklingDict(): + from loopy.tools import LazilyUnpicklingDict + + cls = PickleDetectorForLazilyUnpicklingDict + mapping = LazilyUnpicklingDict({0: cls()}) + + assert not cls.instance_unpickled from pickle import loads, dumps @@ -120,30 +130,160 @@ def test_LazilyUnpicklingDictionary(): # {{{ test lazy loading mapping = loads(pickled_mapping) - assert not is_unpickled() + assert not cls.instance_unpickled list(mapping.keys()) - assert not is_unpickled() - assert isinstance(mapping[0], PicklableItem) - assert is_unpickled() + assert not cls.instance_unpickled + assert isinstance(mapping[0], cls) + assert 
cls.instance_unpickled + + # }}} + + # {{{ conversion + + cls.reset() + mapping = loads(pickled_mapping) + dict(mapping) + assert cls.instance_unpickled # }}} # {{{ test multi round trip mapping = loads(dumps(loads(pickled_mapping))) - assert isinstance(mapping[0], PicklableItem) + assert isinstance(mapping[0], cls) # }}} # {{{ test empty map - mapping = LazilyUnpicklingDictionary({}) + mapping = LazilyUnpicklingDict({}) mapping = loads(dumps(mapping)) assert len(mapping) == 0 # }}} +class PickleDetectorForLazilyUnpicklingList(PickleDetector): + instance_unpickled = False + + def __init__(self): + self.state = None + + +def test_LazilyUnpicklingList(): + from loopy.tools import LazilyUnpicklingList + + cls = PickleDetectorForLazilyUnpicklingList + lst = LazilyUnpicklingList([cls()]) + assert not cls.instance_unpickled + + from pickle import loads, dumps + pickled_lst = dumps(lst) + + # {{{ test lazy loading + + lst = loads(pickled_lst) + assert not cls.instance_unpickled + assert isinstance(lst[0], cls) + assert cls.instance_unpickled + + # }}} + + # {{{ conversion + + cls.reset() + lst = loads(pickled_lst) + list(lst) + assert cls.instance_unpickled + + # }}} + + # {{{ test multi round trip + + lst = loads(dumps(loads(dumps(lst)))) + assert isinstance(lst[0], cls) + + # }}} + + # {{{ test empty list + + lst = LazilyUnpicklingList([]) + lst = loads(dumps(lst)) + assert len(lst) == 0 + + # }}} + + +class PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing( + PickleDetector): + instance_unpickled = False + + def __init__(self, comparison_key): + self.state = comparison_key + + def __repr__(self): + return repr(self.state) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, repr(self)) + + +def test_LazilyUnpicklingListWithEqAndPersistentHashing(): + from loopy.tools import LazilyUnpicklingListWithEqAndPersistentHashing + + cls = PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing + from pickle import loads, 
dumps + + # {{{ test comparison of a pair of lazy lists + + lst0 = LazilyUnpicklingListWithEqAndPersistentHashing( + [cls(0), cls(1)], + eq_key_getter=repr, + persistent_hash_key_getter=repr) + lst1 = LazilyUnpicklingListWithEqAndPersistentHashing( + [cls(0), cls(1)], + eq_key_getter=repr, + persistent_hash_key_getter=repr) + + assert not cls.instance_unpickled + + assert lst0 == lst1 + assert not cls.instance_unpickled + + lst0 = loads(dumps(lst0)) + lst1 = loads(dumps(lst1)) + + assert lst0 == lst1 + assert not cls.instance_unpickled + + lst0.append(cls(3)) + lst1.append(cls(2)) + + assert lst0 != lst1 + + # }}} + + # {{{ comparison with plain lists + + lst = [cls(0), cls(1), cls(3)] + + assert lst == lst0 + assert lst0 == lst + assert not cls.instance_unpickled + + # }}} + + # {{{ persistent hashing + + from loopy.tools import LoopyKeyBuilder + kb = LoopyKeyBuilder() + + assert kb(lst0) == kb(lst) + assert not cls.instance_unpickled + + # }}} + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 0de08f5f616937604bc2c93581c5a8a1770164f4..eff3dbd0e07439bbec399479183a7e9ddb69b9ff 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -28,6 +28,7 @@ import pytest import loopy as lp import pyopencl as cl import sys +import os pytestmark = pytest.mark.importorskip("fparser") @@ -49,7 +50,7 @@ __all__ = [ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() - filename = "strongVolumeKernels.f90" + filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90") with open(filename, "r") as sourcef: source = sourcef.read() diff --git a/test/test_reduction.py b/test/test_reduction.py index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory): "{[j]: 0<=j 
sumlen = l[i]", + "<> sumlen = ell[i]", "a[i] = sum(j, j)", ], [ lp.ValueArg("n", np.int32), lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("l", np.int32, ("n",)), + lp.GlobalArg("ell", np.int32, ("n",)), ]) cknl = lp.CompiledKernel(ctx, knl) n = 330 - l = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + ell = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True) - tgt_result = (2*l-1)*2*l/2 + tgt_result = (2*ell-1)*2*ell/2 assert (a == tgt_result).all() @@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory): assert max_index == np.argmax(np.abs(a)) +def test_reduction_with_conditional(): + # Test whether realization of a reduction inherits predicates + # of the original instruction. Tested with the CTarget, because + # the PyOpenCL target will hoist the conditional into the host + # code in this minimal example. + knl = lp.make_kernel( + "{ [i] : 0<=i<42 }", + """ + if n > 0 + <>b = sum(i, a[i]) + end + """, + [lp.GlobalArg("a", dtype=np.float32, shape=(42,)), + lp.GlobalArg("n", dtype=np.float32, shape=())], + target=lp.CTarget()) + code = lp.generate_body(knl) + + # Check that the if appears before the loop that realizes the reduction. 
+ assert code.index("if") < code.index("for") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index a72b62af90050008f837e144f1f28d4a4de1c730..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -37,14 +37,14 @@ from pymbolic.primitives import Variable def test_op_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if( + not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) @@ -118,14 +121,14 @@ def test_op_counter_logic(): def test_op_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -169,16 +172,16 @@ def test_op_counter_bitwise(): op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) i64shift = 
op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) - assert i32add == n*m+n*m*l - assert i32bw == 2*n*m*l + assert i32add == n*m+n*m*ell + assert i32bw == 2*n*m*ell assert i64bw == 2*n*m assert i64add == i64mul == n*m assert i64shift == 2*n*m @@ -218,22 +221,22 @@ def test_op_counter_triangular_domain(): def test_mem_access_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if(not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -332,22 +337,22 @@ def test_mem_access_counter_logic(): def test_mem_access_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) @@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert i32 == 4*n*m+2*n*m*l + assert i32 == 4*n*m+2*n*m*ell i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') @@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise(): i32 += 
mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert i32 == n*m+n*m*l + assert i32 == n*m+n*m*ell def test_mem_access_counter_mixed(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", + "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ """ c[i,j,k] = 2*a[i,j,k] {id=first} @@ -620,8 +625,8 @@ def test_barrier_counter_barriers(): print(sync_map) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} barrier_count = sync_map["barrier_local"].eval_with_dict(params) assert barrier_count == 50*10*2 @@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( - "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_target.py b/test/test_target.py index ad0cb7439bfdd6200e020c0becadcd73072ceef4..aa6f004634f207a7b9733da4a3d7e06d13d7db7c 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -140,6 +140,32 @@ def test_generate_c_snippet(): print(lp.generate_body(knl)) +def test_c_min_max(): + # Test fmin() fmax() is generated for C backend instead of max() and min() + from loopy.target.c import CTarget + import pymbolic.primitives as p + i = p.Variable("i") + xi = p.Subscript(p.Variable("x"), i) + yi = p.Subscript(p.Variable("y"), i) + zi = p.Subscript(p.Variable("z"), i) + + n = 100 + domain = "{[i]: 0<=i<%d}" % n + data = [lp.GlobalArg("x", np.float64, shape=(n,)), + lp.GlobalArg("y", np.float64, shape=(n,)), + lp.GlobalArg("z", np.float64, shape=(n,))] + + inst = [lp.Assignment(xi, p.Variable("min")(yi, zi))] + knl = lp.make_kernel(domain, inst, data, target=CTarget()) + code = lp.generate_code_v2(knl).device_code() + assert "fmin" in code + + inst = [lp.Assignment(xi, p.Variable("max")(yi, zi))] + knl = 
lp.make_kernel(domain, inst, data, target=CTarget()) + code = lp.generate_code_v2(knl).device_code() + assert "fmax" in code + + @pytest.mark.parametrize("tp", ["f32", "f64"]) def test_random123(ctx_factory, tp): ctx = ctx_factory() @@ -240,6 +266,44 @@ def test_numba_cuda_target(): print(lp.generate_code_v2(knl).all_code()) +def test_sized_integer_c_codegen(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from pymbolic import var + knl = lp.make_kernel( + "{[i]: 0<=i ctr = make_uint2(0, 0)", + lp.Assignment("a[i]", lp.TypeCast(np.int64, var("ctr")) << var("i"))] + ) + + with pytest.raises(lp.LoopyError): + knl = lp.preprocess_kernel(knl) + + +def test_target_invalid_type_cast(): + dtype = np.dtype([('', ' 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e..e50605b46672f8e9c1817431f1577742b1f6fb4c 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -105,6 +105,27 @@ def test_to_batched(ctx_factory): bknl(queue, a=a, x=x) +def test_add_barrier(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + knl = lp.make_kernel( + "{[i, j, ii, jj]: 0<=i,j, ii, jj 1: exec(sys.argv[1])