diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL: - pocl except: - tags + retry: 2 Python 3.6 POCL: script: diff --git a/doc/index.rst b/doc/index.rst index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu (See the full example for how to print the generated code.) +.. _static-binary: + Want to try out loopy? ---------------------- There's no need to go through :ref:`installation` if you'd just like to get a feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. +`download a self-contained Linux binary `_. This is purposefully built on an ancient Linux distribution, so it should work on most versions of Linux that are currently out there. diff --git a/doc/misc.rst b/doc/misc.rst index 9db3b85a7d96c9ccf56592bcefb2b8639984f4f8..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -3,6 +3,18 @@ Installation ============ +Option 0: Static Binary +----------------------- + +If you would just like to experiment with :mod:`loopy`'s code transformation +abilities, the easiest way to get loopy is to download a statically-linked +Linux binary. + +See :ref:`static-binary` for details. 
+ +Option 1: From Source, no PyOpenCL integration +----------------------------------------------- + This command should install :mod:`loopy`:: pip install loo.py @@ -26,10 +38,59 @@ You may also clone its git repository:: git clone --recursive git://github.com/inducer/loopy git clone --recursive http://git.tiker.net/trees/loopy.git +Option 2: From Conda Forge, with PyOpenCL integration +----------------------------------------------------- + +This set of instructions is intended for 64-bit Linux and +MacOS support computers: + +#. Make sure your system has the basics to build software. + + On Debian derivatives (Ubuntu and many more), + installing ``build-essential`` should do the trick. + + Everywhere else, just making sure you have the ``g++`` package should be + enough. + +#. Install `miniconda `_. + (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.) + +#. ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3`` + + If you accepted the default location, this should work: + + ``export CONDA=$HOME/miniconda3`` + +#. ``$CONDA/bin/conda create -n dev`` + +#. ``source $CONDA/bin/activate dev`` + +#. ``conda config --add channels conda-forge`` + +#. ``conda install git pip pocl islpy pyopencl`` (Linux) + + or + + ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X) + +#. Type the following command:: + + pip install git+https://github.com/inducer/loopy + +Next time you want to use :mod:`loopy`, just run the following command:: + + source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev + +You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it. + +See the `PyOpenCL installation instructions +`_ for options +regarding OpenCL drivers. + User-visible Changes ==================== -Version 2016.2 +Version 2017.2 -------------- .. note:: @@ -57,7 +118,7 @@ Licensing Loopy is licensed to you under the MIT/X Consortium license: -Copyright (c) 2009-13 Andreas Klöckner and Contributors. 
+Copyright (c) 2009-17 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 85716fd93ff4768e8ec075c8afa7f0a9b0363999..3f01b0764f71e9ce2de86a66cc71f56473a7dc9f 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -130,6 +130,7 @@ Iname Implementation Tags Tag Meaning =============================== ==================================================== ``None`` | ``"for"`` Sequential loop +``"ord"`` Forced-order sequential loop ``"l.N"`` Local (intra-group) axis N ("local") ``"g.N"`` Group-number axis N ("group") ``"unr"`` Unroll diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 12c058fb741279db55521118f6711f197735dbd0..8b85387259228777f028fb70b1c0cf2efcc2d2ef 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it. i: None --------------------------------------------------------------------------- INSTRUCTIONS: - [i] out[i] <- 2*a[i] # insn + for i + out[i] = 2*a[i] {id=insn} + end i --------------------------------------------------------------------------- You'll likely have noticed that there's quite a bit more information here @@ -1212,11 +1214,11 @@ should call :func:`loopy.get_one_scheduled_kernel`: --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 - 3: ---BARRIER:global--- + 3: ... 
gbarrier 4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[]) - 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 5: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 6: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1250,13 +1252,13 @@ put those instructions into the schedule. --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} + 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} 3: RETURN FROM KERNEL rotate_v2 - 4: ---BARRIER:global--- + 4: ... gbarrier 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] - 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload} + 7: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 8: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- diff --git a/loopy/check.py b/loopy/check.py index a8ec1ad35e42410454b36fa38ef5f0a2fbefc0d6..6bac368381c708b72b2b7f235792df97d0bcd15e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -144,20 +144,20 @@ def check_for_inactive_iname_access(kernel): def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (temp_var_scope, - LocalIndexTagBase, GroupIndexTag, ParallelTag, auto) + LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) if tv.scope == temp_var_scope.PRIVATE: return ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, 
(LocalIndexTagBase, GroupIndexTag))) elif tv.scope == temp_var_scope.LOCAL: return ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) elif tv.scope == temp_var_scope.GLOBAL: - return isinstance(tag, ParallelTag) + return isinstance(tag, ConcurrentTag) elif tv.scope == auto: raise LoopyError("scope of temp var '%s' has not yet been" @@ -169,7 +169,7 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: @@ -190,7 +190,7 @@ def check_for_write_races(kernel): raceable_parallel_insn_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(iname_to_tag(iname), ParallelTag)) + if isinstance(iname_to_tag(iname), ConcurrentTag)) elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] @@ -230,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel): def check_for_data_dependent_parallel_bounds(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag for i, dom in enumerate(kernel.domains): dom_inames = set(dom.get_var_names(dim_type.set)) par_inames = set(iname for iname in dom_inames - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) if not par_inames: continue @@ -401,7 +401,7 @@ def pre_schedule_checks(kernel): logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: raise - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule check:") print(75*"=") @@ -659,7 +659,7 @@ def pre_codegen_checks(kernel): check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule 
check:") print(75*"=") @@ -708,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None): (insn_impl_domain & assumptions) .project_out_except(insn_inames, [dim_type.set])) + from loopy.kernel.instruction import BarrierInstruction + from loopy.kernel.data import LocalIndexTag + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + non_lid_inames = frozenset( + [iname for iname in insn_inames if not isinstance( + kernel.iname_to_tag.get(iname), LocalIndexTag)]) + insn_impl_domain = insn_impl_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_domain = kernel.get_inames_domain(insn_inames) insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param)) assumptions, insn_domain = align_two(assumption_non_param, insn_domain) @@ -715,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None): .project_out_except(insn_inames, [dim_type.set]) .project_out_except(insn_parameters, [dim_type.param])) + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + desired_domain = desired_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_impl_domain = (insn_impl_domain .project_out_except(insn_parameters, [dim_type.param])) insn_impl_domain, desired_domain = align_two( diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 07bcdc7c6c4a0c23d374a14bc21e4e161b73be03..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -357,8 +357,9 @@ class CodeGenerationState(object): # }}} -code_gen_cache = 
PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +code_gen_cache = WriteOncePersistentDict( + "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) class PreambleInfo(ImmutableRecord): @@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord): .. attribute:: seen_dtypes .. attribute:: seen_functions .. attribute:: seen_atomic_dtypes + .. attribute:: codegen_state """ @@ -495,7 +497,9 @@ def generate_code_v2(kernel): seen_dtypes=seen_dtypes, seen_functions=seen_functions, # a set of LoopyTypes (!) - seen_atomic_dtypes=seen_atomic_dtypes) + seen_atomic_dtypes=seen_atomic_dtypes, + codegen_state=codegen_state + ) preamble_generators = (kernel.preamble_generators + kernel.target.get_device_ast_builder().preamble_generators()) @@ -515,7 +519,7 @@ def generate_code_v2(kernel): logger.info("%s: generate code: done" % kernel.name) if CACHING_ENABLED: - code_gen_cache[input_kernel] = codegen_result + code_gen_cache.store_if_not_present(input_kernel, codegen_result) return codegen_result diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) - from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag + from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): # at the innermost level of nesting. 
if ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 789c00d33b7bb41816e6901e24046d4b0eefb27d..5240042337163f0aefcbc7fdb8f3151ac280053f 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): kernel = codegen_state.kernel - from loopy.kernel.data import LocalIndexTag, HardwareParallelTag + from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag from loopy.schedule import find_active_inames_at, has_barrier_within result = find_active_inames_at(kernel, sched_index) @@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): has_barrier = has_barrier_within(kernel, sched_index) for iname, tag in six.iteritems(kernel.iname_to_tag): - if (isinstance(tag, HardwareParallelTag) + if (isinstance(tag, HardwareConcurrentTag) and codegen_state.is_generating_device_code): if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) @@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index): generate_sequential_loop_dim_code) from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, - LoopedIlpTag, VectorizeTag) + LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag) if isinstance(tag, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop elif isinstance(tag, VectorizeTag): func = generate_vectorize_loop - elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)): + elif tag is None or isinstance(tag, ( + LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop " @@ -240,6 +241,15 @@ def build_loop_nest(codegen_state, schedule_index): kernel = 
codegen_state.kernel + # If the AST builder does not implement conditionals, we can save us + # some work about hoisting conditionals and directly go into recursion. + if not codegen_state.ast_builder.can_implement_conditionals: + result = [] + inner = generate_code_for_sched_index(codegen_state, schedule_index) + if inner is not None: + result.append(inner) + return merge_codegen_results(codegen_state, result) + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 1a132049731cd094ba5665857f1afa4f9b04684a..1db7b0445efd2a2e27e761164fa919647df37a07 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, kernel = codegen_state.kernel from loopy.kernel.data import ( - UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) + UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left = [iname for iname in all_inames_by_insns - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] + if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)] if not hw_inames_left: return next_func(codegen_state) diff --git a/loopy/execution.py b/loopy/execution.py index 07e28f06d33e5884ac57c9505593c9ee916c3171..a1228f8f3bb3493e83936ee0b3998bbd5b8cdcc2 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -31,7 +31,7 @@ from loopy.diagnostic import LoopyError import logging logger = logging.getLogger(__name__) -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from 
loopy.version import DATA_MODEL_VERSION @@ -120,7 +120,7 @@ class SeparateArrayPackingController(object): # {{{ KernelExecutorBase -typed_and_scheduled_cache = PersistentDict( +typed_and_scheduled_cache = WriteOncePersistentDict( "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -204,7 +204,7 @@ class KernelExecutorBase(object): kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) if CACHING_ENABLED: - typed_and_scheduled_cache[cache_key] = kernel + typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) return kernel diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index f7ce5d9fc983c2ab946b5d959f283ef9328b7f29..49ab3fd68303e18a6bec371fc54db4e63f57346d 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set): from loopy.symbolic import aff_from_expr try: aff = aff_from_expr(space, -expr-1) - except: + except Exception: return None expr_neg_set = isl.BasicSet.universe(space).add_constraint( isl.Constraint.inequality_from_aff(aff)) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 084c37b45cc4af25689ae3e121f170382c4e8d16..cad11fc78075342a1c270f68486900ead65a95fd 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -40,6 +40,7 @@ from loopy.library.function import ( single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import natsorted # {{{ unique var names @@ -701,12 +702,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): tag_key_uses = {} - from loopy.kernel.data import HardwareParallelTag + from loopy.kernel.data import HardwareConcurrentTag for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag): + if isinstance(tag, HardwareConcurrentTag): tag_key_uses.setdefault(tag.key, []).append(iname) multi_use_keys = set( @@ -716,7 +717,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): 
multi_use_inames = set() for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: + if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys: multi_use_inames.add(iname) return frozenset(cond_inames - multi_use_inames) @@ -958,7 +959,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) + constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) except ValueError: pass @@ -1128,20 +1130,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): else: sep = [] - def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - if "name" in what: lines.extend(sep) lines.append("KERNEL: " + kernel.name) @@ -1187,113 +1175,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): lines.extend(sep) if show_labels: lines.append("INSTRUCTIONS:") - loop_list_width = 35 - - # {{{ topological sort - - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - # }}} - - import loopy as lp - - Fore = self.options._fore # noqa - Style = self.options._style # noqa - - from loopy.kernel.tools import draw_dependencies_as_unicode_arrows - for insn, (arrows, extender) in 
zip( - printed_insn_order, - draw_dependencies_as_unicode_arrows( - printed_insn_order, fore=Fore, style=Style)): - - if isinstance(insn, lp.MultiAssignmentBase): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = str(insn.expression) - trailing = [] - elif isinstance(insn, lp.CInstruction): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = "CODE(%s|%s)" % ( - ", ".join(str(x) for x in insn.read_variables), - ", ".join("%s=%s" % (name, expr) - for name, expr in insn.iname_exprs)) - - trailing = [" "+l for l in insn.code.split("\n")] - elif isinstance(insn, lp.BarrierInstruction): - lhs = "" - rhs = "... %sbarrier" % insn.kind[0] - trailing = [] - - elif isinstance(insn, lp.NoOpInstruction): - lhs = "" - rhs = "... nop" - trailing = [] - else: - raise LoopyError("unexpected instruction type: %s" - % type(insn).__name__) - - order = self._get_iname_order_for_printing() - loop_list = ",".join( - sorted(kernel.insn_inames(insn), key=lambda iname: order[iname])) - - options = [Fore.GREEN+insn.id+Style.RESET_ALL] - if insn.priority: - options.append("priority=%d" % insn.priority) - if insn.tags: - options.append("tags=%s" % ":".join(insn.tags)) - if isinstance(insn, lp.Assignment) and insn.atomicity: - options.append("atomic=%s" % ":".join( - str(a) for a in insn.atomicity)) - if insn.groups: - options.append("groups=%s" % ":".join(insn.groups)) - if insn.conflicts_with_groups: - options.append( - "conflicts=%s" % ":".join(insn.conflicts_with_groups)) - if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join( - "%s@%s" % entry for entry in sorted(insn.no_sync_with))) - - if lhs: - core = "%s <- %s" % ( - Fore.CYAN+lhs+Style.RESET_ALL, - Fore.MAGENTA+rhs+Style.RESET_ALL, - ) - else: - core = Fore.MAGENTA+rhs+Style.RESET_ALL - - if len(loop_list) > loop_list_width: - lines.append("%s [%s]" % (arrows, loop_list)) - lines.append("%s %s%s # %s" % ( - extender, - (loop_list_width+2)*" ", - core, - ", ".join(options))) - else: - lines.append("%s 
[%s]%s%s # %s" % ( - arrows, - loop_list, " "*(loop_list_width-len(loop_list)), - core, - ",".join(options))) - - lines.extend(trailing) - - if insn.predicates: - lines.append(10*" " + "if (%s)" % " && ".join( - [str(x) for x in insn.predicates])) + from loopy.kernel.tools import stringify_instruction_list + lines.extend(stringify_instruction_list(kernel)) dep_lines = [] for insn in kernel.instructions: @@ -1474,6 +1358,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return hash(key_hash.digest()) def __eq__(self, other): + if self is other: + return True + if not isinstance(other, LoopKernel): return False @@ -1487,7 +1374,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return False elif field_name == "assumptions": - if not self.assumptions.plain_is_equal(other.assumptions): + if not ( + self.assumptions.plain_is_equal(other.assumptions) + or self.assumptions.is_equal(other.assumptions)): return False elif getattr(self, field_name) != getattr(other, field_name): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e4cb17657632c53120e56aacf29b20bc0778d73f..dcac16479e368908f50f5dff1ef0f4c0edcc3e7b 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -439,7 +439,7 @@ def parse_insn(groups, insn_options): if "lhs" in groups: try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise @@ -448,7 +448,7 @@ def parse_insn(groups, insn_options): try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -522,14 +522,14 @@ def parse_subst_rule(groups): from loopy.symbolic import parse try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While 
parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -901,7 +901,8 @@ def parse_instructions(instructions, defines): obj = insn_options_stack.pop() #if this object is the end of an if statement if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\ - if_predicates_stack[-1]["insn_predicates"]: + if_predicates_stack[-1]["insn_predicates"] and\ + obj['within_inames'] == if_predicates_stack[-1]['within_inames']: if_predicates_stack.pop() continue @@ -996,7 +997,7 @@ def parse_domains(domains, defines): try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) - except: + except Exception: print("failed to parse domain '%s'" % dom) raise else: diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord): return type(self).__name__ -class ParallelTag(IndexTag): +class ConcurrentTag(IndexTag): pass -class HardwareParallelTag(ParallelTag): +class HardwareConcurrentTag(ConcurrentTag): pass +# deprecated aliases +ParallelTag = ConcurrentTag +HardwareParallelTag = HardwareConcurrentTag + + class UniqueTag(IndexTag): pass @@ -105,11 +110,11 @@ class AxisTag(UniqueTag): self.print_name, self.axis) -class GroupIndexTag(HardwareParallelTag, AxisTag): +class GroupIndexTag(HardwareConcurrentTag, AxisTag): print_name = "g" -class LocalIndexTagBase(HardwareParallelTag): +class LocalIndexTagBase(HardwareConcurrentTag): pass @@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase): # {{{ ilp-like -class IlpBaseTag(ParallelTag): +class IlpBaseTag(ConcurrentTag): pass @@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag): return "forceseq" +class InOrderSequentialSequentialTag(IndexTag): + def __str__(self): + return "ord" + + def parse_tag(tag): if tag is None: return tag @@ -173,6 +183,8 @@ def parse_tag(tag): if tag 
== "for": return None + elif tag == "ord": + return InOrderSequentialSequentialTag() elif tag in ["unr"]: return UnrollTag() elif tag in ["vec"]: @@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase): A :class:`bool` indicating whether the variable may be written during its lifetime. If *True*, *initializer* must be given. + + .. attribute:: _base_storage_access_may_be_aliasing + + Whether the temporary is used to alias the underlying base storage. + Defaults to *False*. If *False*, C-based code generators will declare + the temporary as a ``restrict`` const pointer to the base storage + memory location. If *True*, the restrict part is omitted on this + declaration. """ min_target_axes = 0 @@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase): "base_storage", "initializer", "read_only", + "_base_storage_access_may_be_aliasing", ] def __init__(self, name, dtype=None, shape=(), scope=auto, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, - base_storage=None, initializer=None, read_only=False, **kwargs): + base_storage=None, initializer=None, read_only=False, + _base_storage_access_may_be_aliasing=False, **kwargs): """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase): "mutually exclusive" % name) + if base_storage is None and _base_storage_access_may_be_aliasing: + raise LoopyError( + "temporary variable '%s': " + "_base_storage_access_may_be_aliasing option, but no " + "base_storage given!" 
+ % name) + ArrayBase.__init__(self, name=intern(name), dtype=dtype, shape=shape, dim_tags=dim_tags, offset=offset, dim_names=dim_names, @@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase): base_storage=base_storage, initializer=initializer, read_only=read_only, + _base_storage_access_may_be_aliasing=( + _base_storage_access_may_be_aliasing), **kwargs) @property @@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase): and ( (self.initializer is None and other.initializer is None) or np.array_equal(self.initializer, other.initializer)) - and self.read_only == other.read_only) + and self.read_only == other.read_only + and (self._base_storage_access_may_be_aliasing + == other._base_storage_access_may_be_aliasing) + ) def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase): self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) + key_builder.rec(key_hash, self.scope) + key_builder.rec(key_hash, self.base_storage) initializer = self.initializer if initializer is not None: @@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase): key_builder.rec(key_hash, initializer) key_builder.rec(key_hash, self.read_only) + key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing) # }}} +def iname_tag_to_temp_var_scope(iname_tag): + iname_tag = parse_tag(iname_tag) + + if isinstance(iname_tag, GroupIndexTag): + return temp_var_scope.GLOBAL + elif isinstance(iname_tag, LocalIndexTag): + return temp_var_scope.LOCAL + else: + return temp_var_scope.PRIVATE + + # {{{ substitution rule class SubstitutionRule(ImmutableRecord): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..02df0f2b4fd27dcb0f8b847411aa3dea7f3f9169 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -35,7 +35,7 @@ import islpy as isl from islpy import dim_type from 
loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg - +from loopy.tools import natsorted import logging logger = logging.getLogger(__name__) @@ -620,11 +620,11 @@ class DomainParameterFinder(object): if dep.name in param_names: from pymbolic.algorithm import solve_affine_equations_for try: - # friggin' overkill :) + # overkill :) param_expr = solve_affine_equations_for( [dep.name], [(shape_i, var("shape_i"))] )[dep.name] - except: + except Exception: # went wrong? oh well pass else: @@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name): if n_axes == 1: # Leave shape undetermined--we can live with that for 1D. - shape = (None,) + shape = None else: raise LoopyError("cannot determine access range for '%s': " "undetermined index in subscript(s) '%s'" @@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name): kernel.cache_manager.dim_max( armap.access_range, i) + 1, constants_only=False))) - except: + except Exception: print("While trying to find shape axis %d of " "variable '%s', the following " "exception occurred:" % (i, var_name), @@ -1371,7 +1371,167 @@ def draw_dependencies_as_unicode_arrows( conform_to_uniform_length(extender)) for row, extender in rows] - return rows + return uniform_length, rows + +# }}} + + +# {{{ stringify_instruction_list + +def stringify_instruction_list(kernel): + # {{{ topological sort + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + # }}} + + import loopy as lp + + Fore = kernel.options._fore # noqa + Style = kernel.options._style # noqa + + uniform_arrow_length, arrows_and_extenders = \ + draw_dependencies_as_unicode_arrows( + printed_insn_order, fore=Fore, 
style=Style) + + leader = " " * uniform_arrow_length + lines = [] + current_inames = [set()] + + if uniform_arrow_length: + indent_level = [1] + else: + indent_level = [0] + + indent_increment = 2 + + iname_order = kernel._get_iname_order_for_printing() + + def add_pre_line(s): + lines.append(leader + " " * indent_level[0] + s) + + def add_main_line(s): + lines.append(arrows + " " * indent_level[0] + s) + + def add_post_line(s): + lines.append(extender + " " * indent_level[0] + s) + + def adapt_to_new_inames_list(new_inames): + added = [] + removed = [] + + # FIXME: Doesn't respect strict nesting + for iname in iname_order: + is_in_current = iname in current_inames[0] + is_in_new = iname in new_inames + + if is_in_new == is_in_current: + pass + elif is_in_new and not is_in_current: + added.append(iname) + elif not is_in_new and is_in_current: + removed.append(iname) + else: + assert False + + if removed: + indent_level[0] -= indent_increment * len(removed) + add_pre_line("end " + ", ".join(removed)) + if added: + add_pre_line("for " + ", ".join(added)) + indent_level[0] += indent_increment * len(added) + + current_inames[0] = new_inames + + for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders): + if isinstance(insn, lp.MultiAssignmentBase): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = str(insn.expression) + trailing = [] + elif isinstance(insn, lp.CInstruction): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = "CODE(%s|%s)" % ( + ", ".join(str(x) for x in insn.read_variables), + ", ".join("%s=%s" % (name, expr) + for name, expr in insn.iname_exprs)) + + trailing = [l for l in insn.code.split("\n")] + elif isinstance(insn, lp.BarrierInstruction): + lhs = "" + rhs = "... %sbarrier" % insn.kind[0] + trailing = [] + + elif isinstance(insn, lp.NoOpInstruction): + lhs = "" + rhs = "... 
nop" + trailing = [] + + else: + raise LoopyError("unexpected instruction type: %s" + % type(insn).__name__) + + adapt_to_new_inames_list(kernel.insn_inames(insn)) + + options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL] + if insn.priority: + options.append("priority=%d" % insn.priority) + if insn.tags: + options.append("tags=%s" % ":".join(insn.tags)) + if isinstance(insn, lp.Assignment) and insn.atomicity: + options.append("atomic=%s" % ":".join( + str(a) for a in insn.atomicity)) + if insn.groups: + options.append("groups=%s" % ":".join(insn.groups)) + if insn.conflicts_with_groups: + options.append( + "conflicts=%s" % ":".join(insn.conflicts_with_groups)) + if insn.no_sync_with: + options.append("no_sync_with=%s" % ":".join( + "%s@%s" % entry for entry in sorted(insn.no_sync_with))) + + if lhs: + core = "%s = %s" % ( + Fore.CYAN+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ) + else: + core = Fore.MAGENTA+rhs+Style.RESET_ALL + + options_str = " {%s}" % ", ".join(options) + + if insn.predicates: + # FIXME: precedence + add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates])) + indent_level[0] += indent_increment + + add_main_line(core + options_str) + + for t in trailing: + add_post_line(t) + + if insn.predicates: + indent_level[0] -= indent_increment + add_post_line("end") + + leader = extender + + adapt_to_new_inames_list([]) + + return lines # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 541b44f58c5b02e9beba15211cb861fd09f14096..ac7ac19887388649670154fcd36eba79ba3b4315 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,7 +30,7 @@ from loopy.diagnostic import ( import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames): from loopy.kernel.data import ( LocalIndexTagBase, 
UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) + ConcurrentTag) for iname in inames: iname_tag = kernel.iname_to_tag.get(iname) @@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames): elif isinstance(iname_tag, LocalIndexTagBase): local_par.append(iname) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)): nonlocal_par.append(iname) else: @@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): if len(coeffs) == 0: try: scan_iname_aff.get_constant_val() - except: + except Exception: raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) # If this point is reached we're assuming the domain is of the form @@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=insn.within_inames | expr.inames, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) newly_generated_insn_id_set.add(get_args_insn_id) @@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return updated_inner_exprs def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, - within_inames_is_final): + within_inames_is_final, predicates): # FIXME: use make_temporaries from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=expr, depends_on=depends_on, within_inames=within_inames, - within_inames_is_final=within_inames_is_final) + within_inames_is_final=within_inames_is_final, + predicates=predicates) generated_insns.append(call_insn) @@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), 
within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) reduction_insn_depends_on.add(get_args_insn_id) else: @@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr), depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates,) generated_insns.append(reduction_insn) @@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_insn) init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) @@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_neutral_insn) transfer_depends_on = set([init_neutral_id, init_id]) @@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( 
(outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) transfer_depends_on.add(get_args_insn_id) else: @@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")])) + no_sync_with=frozenset([(init_id, "any")]), + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) cur_size = 1 @@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(stage_insn) @@ -1398,7 +1414,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates, + ) generated_insns.append(init_insn) @@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) generated_insns.append(scan_insn) @@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - 
depends_on=init_insn_depends_on) + depends_on=init_insn_depends_on, + predicates=insn.predicates, + ) generated_insns.append(init_insn) transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on @@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(transfer_insn_depends_on), - no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) @@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) if cur_size == 1: # Performance hack: don't add a barrier here with transfer_insn. 
@@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(write_stage_insn) @@ -2020,7 +2047,8 @@ def limit_boostability(kernel): # }}} -preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None): # }}} if CACHING_ENABLED: - preprocess_cache[input_kernel] = kernel + preprocess_cache.store_if_not_present(input_kernel, kernel) return kernel diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index d28e7b1b3def2b988b4624aed9caf8f65c70b2c5..abf4d799fbdb14f86fa29dde26e6654130fc66de 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -206,13 +206,13 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag all_nonpar_inames = set([ iname for iname in kernel.all_inames() if not isinstance(kernel.iname_to_tag.get(iname), - (ParallelTag, IlpBaseTag, VectorizeTag))]) + (ConcurrentTag, IlpBaseTag, VectorizeTag))]) iname_to_insns = kernel.iname_to_insns() @@ -274,10 +274,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data 
import ConcurrentTag, IlpBaseTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag): continue iname_dep = result.setdefault(iname, set()) @@ -308,7 +308,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue tag = kernel.iname_to_tag.get(dep_insn_iname) - if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)): + if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -431,10 +431,10 @@ def format_insn(kernel, insn_id): from loopy.kernel.instruction import ( MultiAssignmentBase, NoOpInstruction, BarrierInstruction) if isinstance(insn, MultiAssignmentBase): - return "[%s] %s%s%s <- %s%s%s" % ( - format_insn_id(kernel, insn_id), + return "%s%s%s = %s%s%s {id=%s}" % ( Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, - Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, + format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): return "[%s] %s... 
%sbarrier%s" % ( format_insn_id(kernel, insn_id), @@ -456,11 +456,11 @@ def dump_schedule(kernel, schedule): from loopy.kernel.data import MultiAssignmentBase for sched_item in schedule: if isinstance(sched_item, EnterLoop): - lines.append(indent + "FOR %s" % sched_item.iname) + lines.append(indent + "for %s" % sched_item.iname) indent += " " elif isinstance(sched_item, LeaveLoop): indent = indent[:-4] - lines.append(indent + "END %s" % sched_item.iname) + lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( @@ -479,7 +479,7 @@ def dump_schedule(kernel, schedule): insn_str = sched_item.insn_id lines.append(indent + insn_str) elif isinstance(sched_item, Barrier): - lines.append(indent + "---BARRIER:%s---" % sched_item.kind) + lines.append(indent + "... %sbarrier" % sched_item.kind[0]) else: assert False @@ -1787,7 +1787,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag + from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag ilp_inames = set( iname for iname in kernel.all_inames() @@ -1798,7 +1798,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag)) parallel_inames = set( iname for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) @@ -1940,7 +1940,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): # }}} -schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, +schedule_cache = WriteOncePersistentDict( + "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, 
key_builder=LoopyKeyBuilder()) @@ -1971,7 +1972,7 @@ def get_one_scheduled_kernel(kernel): kernel.name, time()-start_time)) if CACHING_ENABLED and not from_cache: - schedule_cache[sched_cache_key] = result + schedule_cache.store_if_not_present(sched_cache_key, result) return result diff --git a/loopy/statistics.py b/loopy/statistics.py index 233cfe5e881ef594ebabc536ab8c7b3d18d5cf17..88d7ec328345fd4c97d75b449385316f99c2509d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1000,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): try: + if space is not None: + set = set.align_params(space) + return add_assumptions_guard(kernel, set.card()) except AttributeError: pass diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f1a494f30d469511817d204c0476ff79abe00e3b..543c2743bb98b09b706c2fdbf9188ed0a85d97f2 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1232,6 +1232,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): super(PwAffEvaluationMapper, self).__init__(context) def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + return self.pw_zero + expr def map_min(self, expr): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 7e307ef8bdd4d89e24b26dbacf39733ab3350307..5800a0236e8ae5f81a63942c31a74822bc2fab96 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -211,6 +211,10 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + @property + def can_implement_conditionals(self): + return False + def emit_if(self, condition_str, ast): raise NotImplementedError() @@ -275,28 +279,6 @@ class DummyHostASTBuilder(ASTBuilderBase): def ast_block_scope_class(self): return _DummyASTBlock - def emit_assignment(self, codegen_state, insn): - return None - - def emit_multiple_assignment(self, codegen_state, insn): - return None - - def emit_sequential_loop(self, codegen_state, 
iname, iname_dtype, - static_lbound, static_ubound, inner): - return None - - def emit_if(self, condition_str, ast): - return None - - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): - return None - - def emit_blank_line(self): - return None - - def emit_comment(self, s): - return None - # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a2ad682505bbdb7ed5977a28e201ebc6655c7784..e54ac0f693c4704c13b8c435e4bc7acaac1b1a47 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -307,6 +307,12 @@ class _ConstRestrictPointer(Pointer): return sub_tp, ("*const __restrict__ %s" % sub_decl) +class _ConstPointer(Pointer): + def get_decl_pair(self): + sub_tp, sub_decl = self.subdecl.get_decl_pair() + return sub_tp, ("*const %s" % sub_decl) + + class CASTBuilder(ASTBuilderBase): # {{{ library @@ -462,13 +468,17 @@ class CASTBuilder(ASTBuilderBase): temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.scope) - # The 'restrict' part of this is a complete lie--of course - # all these temporaries are aliased. But we're promising to - # not use them to shovel data from one representation to the - # other. That counts, right? + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? 
+ ptrtype = _ConstRestrictPointer - cast_decl = _ConstRestrictPointer(cast_decl) - temp_var_decl = _ConstRestrictPointer(temp_var_decl) + cast_decl = ptrtype(cast_decl) + temp_var_decl = ptrtype(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( @@ -789,6 +799,10 @@ class CASTBuilder(ASTBuilderBase): from cgen import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from cgen import If return If(condition_str, ast) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 2da25ba39ceef38a4af105913973226bd3773729..975c691a74d0d17bdca39243f515c5d04284893d 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -328,7 +328,8 @@ def generate_arg_setup(gen, kernel, implemented_data_info, options): # {{{ allocate written arrays, if needed if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and arg.shape is not None: + and arg.shape is not None \ + and all(si is not None for si in arg.shape): if not isinstance(arg.dtype, NumpyType): raise LoopyError("do not know how to pass arg of type '%s'" diff --git a/loopy/target/python.py b/loopy/target/python.py index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from genpy import If return If(condition_str, ast) diff --git a/loopy/tools.py b/loopy/tools.py index 1ebbe5c8a4fd2b68e3bfcf5ed493384599dac2c5..d6952d54782f113685299641c828907fb7f32a46 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -576,4 +576,19 @@ def intern_frozenset_of_ids(fs): return frozenset(intern(s) for s in fs) +def natorder(key): + # Return 
natural ordering for strings, as opposed to dictionary order. + # E.g. will result in + # 'abc1' < 'abc9' < 'abc10' + # rather than + # 'abc1' < 'abc10' < 'abc9' + # Based on + # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 + import re + return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] + + +def natsorted(seq, key=lambda x: x): + return sorted(seq, key=lambda y: natorder(key(y))) + # vim: foldmethod=marker diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -239,14 +239,14 @@ class ArrayToBufferMap(object): non1_storage_axis_flags = [] non1_storage_shape = [] - for saxis, bi, l in zip( + for saxis, bi, saxis_len in zip( storage_axis_names, storage_base_indices, storage_shape): - has_length_non1 = l != 1 + has_length_non1 = saxis_len != 1 non1_storage_axis_flags.append(has_length_non1) if has_length_non1: - non1_storage_shape.append(l) + non1_storage_shape.append(saxis_len) # }}} diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, SubstitutionRuleMappingContext, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError @@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): # }}} -buffer_array_cache = 
PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION, +buffer_array_cache = WriteOncePersistentDict( + "loopy-buffer-array-cache-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching - buffer_array_cache[cache_key] = prepare_for_caching(kernel) + buffer_array_cache.store_if_not_present( + cache_key, prepare_for_caching(kernel)) return kernel diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644 --- a/loopy/transform/ilp.py +++ b/loopy/transform/ilp.py @@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper class ExtraInameIndexInserter(IdentityMapper): def __init__(self, var_to_new_inames): self.var_to_new_inames = var_to_new_inames + self.seen_ilp_inames = set() def map_subscript(self, expr): try: @@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper): index = (index,) index = tuple(self.rec(i) for i in index) + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.aggregate.index(index + new_idx) def map_variable(self, expr): @@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper): except KeyError: return expr else: + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.index(new_idx) @@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None): # }}} from pymbolic import var - eiii = ExtraInameIndexInserter( - dict((var_name, tuple(var(iname) for iname in inames)) - for var_name, inames in six.iteritems(var_to_new_ilp_inames))) - - new_insns = [ - insn.with_transformed_expressions(eiii) - for insn in kernel.instructions] + var_to_extra_iname = dict( + (var_name, tuple(var(iname) for iname in inames)) + for var_name, inames in six.iteritems(var_to_new_ilp_inames)) + + new_insns = [] + + for insn in kernel.instructions: + eiii = 
ExtraInameIndexInserter(var_to_extra_iname) + new_insn = insn.with_transformed_expressions(eiii) + if not eiii.seen_ilp_inames <= insn.within_inames: + + from loopy.diagnostic import warn_with_kernel + warn_with_kernel( + kernel, + "implicit_ilp_iname", + "Instruction '%s': touched variable that (for ILP) " + "required iname(s) '%s', but that the instruction was not " + "previously within the iname(s). Previously, this would " + "implicitly promote the instruction, but that behavior is " + "deprecated and will stop working in 2018.1." + % (insn.id, ", ".join( + eiii.seen_ilp_inames - insn.within_inames))) + + new_insns.append(new_insn) return kernel.copy( temporary_variables=new_temp_vars, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag] - from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase, + from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase, ForceSequentialTag) # {{{ globbing @@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): if iname not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % iname) - if isinstance(new_tag, ParallelTag) \ + if isinstance(new_tag, ConcurrentTag) \ and isinstance(old_tag, ForceSequentialTag): raise ValueError("cannot tag '%s' as parallel--" "iname requires sequential execution" % iname) if isinstance(new_tag, ForceSequentialTag) \ - and isinstance(old_tag, ParallelTag): + and isinstance(old_tag, ConcurrentTag): raise ValueError("'%s' is already tagged as parallel, " "but is now prohibited from being parallel " "(likely because of participation in a precompute or " @@ -972,9 +972,9 @@ def 
get_iname_duplication_options(knl, use_boostable_into=False): # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_deps): # Check whether this iname has a parallel tag and discard it if so - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag if (iname in knl.iname_to_tag - and isinstance(knl.iname_to_tag[iname], ParallelTag)): + and isinstance(knl.iname_to_tag[iname], ConcurrentTag)): continue # If we find a duplication option and fo not use boostable_into diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..2ba2338b0af541274cc0362c9f71cec9c2887ffc 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -402,13 +402,13 @@ class TemporarySaver(object): continue from loopy.kernel.data import ( - GroupIndexTag, LocalIndexTag, ParallelTag) + GroupIndexTag, LocalIndexTag, ConcurrentTag) if isinstance(tag, GroupIndexTag): my_group_tags.append(tag) elif isinstance(tag, LocalIndexTag): my_local_tags.append(tag) - elif isinstance(tag, ParallelTag): + elif isinstance(tag, ConcurrentTag): raise LoopyError( "iname '%s' is tagged with '%s' - only " "group and local tags are supported for " diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six from loopy.symbolic import ( get_dependencies, SubstitutionMapper, @@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper = CallbackMapper(gather_exprs, WalkMapper()) for insn in kernel.instructions: + dfmapper(insn.assignees) dfmapper(insn.expression) for sr in six.itervalues(kernel.substitutions): @@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] for insn in kernel.instructions: - new_expr = cbmapper(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) + new_insns.append(insn.with_transformed_expressions(cbmapper)) from loopy.kernel.data import SubstitutionRule new_substs = { diff --git a/loopy/version.py b/loopy/version.py index 3a9781748d00a0e453d4a56e374a25aa72ab4733..5e07e979f2d44684be00290328244496176337b3 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v66-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v68-islpy%s" % _islpy_version diff --git a/setup.py b/setup.py index 67d943af3be4446834bf7262a91b8596b601ca85..94843bf69e4e25677ccc0713e5f598e9dcfd55e2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2017.3", + "pytools>=2017.6", "pymbolic>=2016.2", "genpy>=2016.1.2", "cgen>=2016.1", diff --git a/test/test_fortran.py b/test/test_fortran.py index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames): logging.basicConfig(level=logging.INFO) fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + 
b(k,j)*a(i,k) end do end do @@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.split_iname(knl, "k", 32) knl = lp.assume(knl, "n mod 32 = 0") knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "l mod 16 = 0") + knl = lp.assume(knl, "ell mod 16 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames): init_expression="0", store_expression="base+buffer") ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels(): def test_precompute_some_exist(ctx_factory): fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + b(k,j)*a(i,k) end do end do @@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory): knl = lp.split_iname(knl, "k", 8) knl = lp.assume(knl, "n mod 8 = 0") knl = lp.assume(knl, "m mod 8 = 0") - knl = lp.assume(knl, "l mod 8 = 0") + knl = lp.assume(knl, "ell mod 8 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory): ref_knl = knl ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) if __name__ == "__main__": diff --git a/test/test_linalg.py b/test/test_linalg.py index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644 --- 
a/test/test_linalg.py +++ b/test/test_linalg.py @@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory): n = get_suitable_size(ctx) m = n+12 - l = m+12 + ell = m+12 knl = lp.make_kernel( - "{[i,k,j]: 0<=i gid = i/256 + start = gid*256 + for j + a[start + j] = a[start + j] + j + end + end + """, + seq_dependencies=True, + name="uniform_l", + target=PyOpenCLTarget(), + assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa + # }}} @@ -2008,6 +2034,37 @@ def test_if_else(ctx_factory): out_ref[4::6] = 11 out_ref[2::6] = 3 + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<50}", + """ + for i + if i < 25 + for j + if j % 2 == 0 + a[i, j] = 1 + else + a[i, j] = 0 + end + end + else + for j + if j % 2 == 0 + a[i, j] = 0 + else + a[i, j] = 1 + end + end + end + end + """ + ) + + evt, (out,) = knl(queue, out_host=True) + + out_ref = np.zeros((50, 50)) + out_ref[:25, 0::2] = 1 + out_ref[25:, 1::2] = 1 + assert np.array_equal(out_ref, out) @@ -2180,11 +2237,12 @@ def test_nosync_option_parsing(): """, options=lp.Options(allow_terminal_colors=False)) kernel_str = str(knl) - assert "# insn1,no_sync_with=insn1@any" in kernel_str - assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str - assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str - assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa - assert "# insn5,no_sync_with=insn1@any" in kernel_str + print(kernel_str) + assert "id=insn1, no_sync_with=insn1@any" in kernel_str + assert "id=insn2, no_sync_with=insn1@any:insn2@any" in kernel_str + assert "id=insn3, no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str + assert "id=insn4, no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa + assert "id=insn5, no_sync_with=insn1@any" in kernel_str def 
assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): @@ -2265,6 +2323,43 @@ def test_barrier_insertion_near_bottom_of_loop(): assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) +def test_barrier_in_overridden_get_grid_size_expanded_kernel(): + from loopy.kernel.data import temp_var_scope as scopes + + # make simple barrier'd kernel + knl = lp.make_kernel('{[i]: 0 <= i < 10}', + """ + for i + a[i] = i {id=a} + ... lbarrier {id=barrier} + b[i + 1] = a[i] {nosync=a} + end + """, + [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C', + scope=scopes.LOCAL), + lp.GlobalArg("b", np.float32, shape=(11,), order='C')], + seq_dependencies=True) + + # split into kernel w/ vesize larger than iname domain + vecsize = 16 + knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + + # artifically expand via overridden_get_grid_sizes_for_insn_ids + class GridOverride(object): + def __init__(self, clean, vecsize=vecsize): + self.clean = clean + self.vecsize = vecsize + + def __call__(self, insn_ids, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + return gsize, (self.vecsize,) + + knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( + knl.copy(), vecsize)) + # make sure we can generate the code + lp.generate_code_v2(knl) + + def test_multi_argument_reduction_type_inference(): from loopy.type_inference import TypeInferenceMapper from loopy.library.reduction import SegmentedSumReductionOperation @@ -2451,6 +2546,167 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): knl(queue) +def test_preamble_with_separate_temporaries(ctx_factory): + from loopy.kernel.data import temp_var_scope as scopes + # create a function mangler + + func_name = 'indirect' + func_arg_dtypes = (np.int32, np.int32, np.int32) + func_result_dtypes = (np.int32,) + + def __indirectmangler(kernel, name, arg_dtypes): + """ + A function that will return a 
:class:`loopy.kernel.data.CallMangleInfo` + to interface with the calling :class:`loopy.LoopKernel` + """ + if name != func_name: + return None + + from loopy.types import to_loopy_type + from loopy.kernel.data import CallMangleInfo + + def __compare(d1, d2): + # compare dtypes ignoring atomic + return to_loopy_type(d1, for_atomic=True) == \ + to_loopy_type(d2, for_atomic=True) + + # check types + if len(arg_dtypes) != len(arg_dtypes): + raise Exception('Unexpected number of arguments provided to mangler ' + '{}, expected {}, got {}'.format( + func_name, len(func_arg_dtypes), len(arg_dtypes))) + + for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)): + if not __compare(d1, d2): + raise Exception('Argument at index {} for mangler {} does not ' + 'match expected dtype. Expected {}, got {}'. + format(i, func_name, str(d1), str(d2))) + + # get target for creation + target = arg_dtypes[0].target + return CallMangleInfo( + target_name=func_name, + result_dtypes=tuple(to_loopy_type(x, target=target) for x in + func_result_dtypes), + arg_dtypes=arg_dtypes) + + # create the preamble generator + def create_preamble(arr): + def __indirectpreamble(preamble_info): + # find a function matching our name + func_match = next( + (x for x in preamble_info.seen_functions + if x.name == func_name), None) + desc = 'custom_funcs_indirect' + if func_match is not None: + from loopy.types import to_loopy_type + # check types + if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \ + func_match.arg_dtypes: + # if match, create our temporary + var = lp.TemporaryVariable( + 'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape, + scope=scopes.GLOBAL, read_only=True) + # and code + code = """ + int {name}(int start, int end, int match) + {{ + int result = start; + for (int i = start + 1; i < end; ++i) + {{ + if (lookup[i] == match) + result = i; + }} + return result; + }} + """.format(name=func_name) + + # generate temporary variable code + from cgen import Initializer + from 
loopy.target.c import generate_array_literal + codegen_state = preamble_info.codegen_state.copy( + is_generating_device_code=True) + kernel = preamble_info.kernel + ast_builder = codegen_state.ast_builder + target = kernel.target + decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype) + decl = ast_builder.wrap_global_constant( + ast_builder.get_temporary_decl( + codegen_state, None, var, + decl_info)) + if var.initializer is not None: + decl = Initializer(decl, generate_array_literal( + codegen_state, var, var.initializer)) + # return generated code + yield (desc, '\n'.join([str(decl), code])) + return __indirectpreamble + + # and finally create a test + n = 10 + # for each entry come up with a random number of data points + num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32) + # turn into offsets + offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32) + # create lookup data + lookup = np.empty(0) + for i in num_data: + lookup = np.hstack((lookup, np.arange(i))) + lookup = np.asarray(lookup, dtype=np.int32) + # and create data array + data = np.random.rand(np.product(num_data)) + + # make kernel + kernel = lp.make_kernel('{[i]: 0 <= i < n}', + """ + for i + <>ind = indirect(offsets[i], offsets[i + 1], 1) + out[i] = data[ind] + end + """, + [lp.GlobalArg('out', shape=('n',)), + lp.TemporaryVariable( + 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL, + read_only=True), + lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + ) + # fixt params, and add manglers / preamble + kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)]) + kernel = lp.register_function_manglers(kernel, [__indirectmangler]) + + print(lp.generate_code(kernel)[0]) + # and call (functionality unimportant, more that it compiles) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + # check that it actually performs the lookup 
correctly + assert np.allclose(kernel( + queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + + +def test_add_prefetch_works_in_lhs_index(): + knl = lp.make_kernel( + "{ [n,k,l,k1,l1,k2,l2]: " + "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l] + a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1 + a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2] + end + """, + [ + lp.GlobalArg("a1,a1_out", None, "ndofs,2"), + lp.GlobalArg("a1_map", None, "nelements,3"), + "..." + ]) + + knl = lp.add_prefetch(knl, "a1_map", "k") + + from loopy.symbolic import get_dependencies + for insn in knl.instructions: + assert "a1_map" not in get_dependencies(insn.assignees) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_reduction.py b/test/test_reduction.py index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory): "{[j]: 0<=j sumlen = l[i]", + "<> sumlen = ell[i]", "a[i] = sum(j, j)", ], [ lp.ValueArg("n", np.int32), lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("l", np.int32, ("n",)), + lp.GlobalArg("ell", np.int32, ("n",)), ]) cknl = lp.CompiledKernel(ctx, knl) n = 330 - l = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + ell = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True) - tgt_result = (2*l-1)*2*l/2 + tgt_result = (2*ell-1)*2*ell/2 assert (a == tgt_result).all() @@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory): assert max_index == np.argmax(np.abs(a)) +def test_reduction_with_conditional(): + # Test whether realization of a reduction inherits predicates + # of the original instruction. Tested with the CTarget, because + # the PyOpenCL target will hoist the conditional into the host + # code in this minimal example. 
+ knl = lp.make_kernel( + "{ [i] : 0<=i<42 }", + """ + if n > 0 + <>b = sum(i, a[i]) + end + """, + [lp.GlobalArg("a", dtype=np.float32, shape=(42,)), + lp.GlobalArg("n", dtype=np.float32, shape=())], + target=lp.CTarget()) + code = lp.generate_body(knl) + + # Check that the if appears before the loop that realizes the reduction. + assert code.index("if") < code.index("for") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index cf86539efec7be7e85fecfadc3b19d26fac7bb6d..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -37,14 +37,14 @@ from pymbolic.primitives import Variable def test_op_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if( + not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) @@ -118,14 +121,14 @@ def test_op_counter_logic(): def test_op_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -169,16 +172,16 @@ def test_op_counter_bitwise(): op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32add = op_map[lp.Op(np.int32, 
'add')].eval_with_dict(params) i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) - assert i32add == n*m+n*m*l - assert i32bw == 2*n*m*l + assert i32add == n*m+n*m*ell + assert i32bw == 2*n*m*ell assert i64bw == 2*n*m assert i64add == i64mul == n*m assert i64shift == 2*n*m @@ -218,22 +221,22 @@ def test_op_counter_triangular_domain(): def test_mem_access_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if(not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -332,22 +337,22 @@ def test_mem_access_counter_logic(): def test_mem_access_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) @@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', 
np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert i32 == 4*n*m+2*n*m*l + assert i32 == 4*n*m+2*n*m*ell i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') @@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert i32 == n*m+n*m*l + assert i32 == n*m+n*m*ell def test_mem_access_counter_mixed(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", + "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ """ c[i,j,k] = 2*a[i,j,k] {id=first} @@ -620,8 +625,8 @@ def test_barrier_counter_barriers(): print(sync_map) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} barrier_count = sync_map["barrier_local"].eval_with_dict(params) assert barrier_count == 50*10*2 @@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( - "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i