diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL:
- pocl
except:
- tags
+ retry: 2
Python 3.6 POCL:
script:
diff --git a/doc/index.rst b/doc/index.rst
index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.)
+.. _static-binary:
+
Want to try out loopy?
----------------------
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
-`download a self-contained Linux binary `_.
+`download a self-contained Linux binary `_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
diff --git a/doc/misc.rst b/doc/misc.rst
index 347b5d098c8dc0e37bb72659c0b0de5a8b4e3704..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -3,6 +3,18 @@
Installation
============
+Option 0: Static Binary
+-----------------------
+
+If you would just like to experiment with :mod:`loopy`'s code transformation
+abilities, the easiest way to get loopy is to download a statically-linked
+Linux binary.
+
+See :ref:`static-binary` for details.
+
+Option 1: From Source, no PyOpenCL integration
+-----------------------------------------------
+
This command should install :mod:`loopy`::
pip install loo.py
@@ -26,10 +38,59 @@ You may also clone its git repository::
git clone --recursive git://github.com/inducer/loopy
git clone --recursive http://git.tiker.net/trees/loopy.git
+Option 2: From Conda Forge, with PyOpenCL integration
+-----------------------------------------------------
+
+This set of instructions is intended for 64-bit Linux and
+macOS computers:
+
+#. Make sure your system has the basics to build software.
+
+ On Debian derivatives (Ubuntu and many more),
+ installing ``build-essential`` should do the trick.
+
+ Everywhere else, just making sure you have the ``g++`` package should be
+ enough.
+
+#. Install `miniconda `_.
+ (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.)
+
+#. ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3``
+
+ If you accepted the default location, this should work:
+
+ ``export CONDA=$HOME/miniconda3``
+
+#. ``$CONDA/bin/conda create -n dev``
+
+#. ``source $CONDA/bin/activate dev``
+
+#. ``conda config --add channels conda-forge``
+
+#. ``conda install git pip pocl islpy pyopencl`` (Linux)
+
+ or
+
+ ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
+
+#. Type the following command::
+
+ pip install git+https://github.com/inducer/loopy
+
+Next time you want to use :mod:`loopy`, just run the following command::
+
+ source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev
+
+You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
+
+See the `PyOpenCL installation instructions
+`_ for options
+regarding OpenCL drivers.
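
As a quick smoke test of the installation, a minimal sketch along the lines of
the tutorial's first example (assuming a working OpenCL driver) should run
without errors::

    import numpy as np
    import loopy as lp
    import pyopencl as cl
    import pyopencl.array

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # double every entry of "a"; loopy infers dtypes at the call site
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]")

    a = cl.array.arange(queue, 16, dtype=np.float32)
    evt, (out,) = knl(queue, a=a)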
+
User-visible Changes
====================
-Version 2016.2
+Version 2017.2
--------------
.. note::
@@ -57,7 +118,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license:
-Copyright (c) 2009-13 Andreas Klöckner and Contributors.
+Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
@@ -342,6 +403,11 @@ Here's a Bibtex entry for your convenience::
doi = "{10.1145/2627373.2627387}",
}
+Getting help
+============
+
+Email the friendly folks on the `loopy mailing list `_.
+
Acknowledgments
===============
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 9138d9a41d7b33db956fd8aba55c0b3b788db064..07b7836d82596892f1d94e336dfa81e1b5a7a881 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -130,6 +130,7 @@ Iname Implementation Tags
Tag Meaning
=============================== ====================================================
``None`` | ``"for"`` Sequential loop
+``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll
@@ -326,15 +327,25 @@ Expressions
Loopy's expressions are a slight superset of the expressions supported by
:mod:`pymbolic`.
-* ``if``
-* ``elif`` (following an ``if``)
-* ``else`` (following an ``if`` / ``elif``)
+* ``if(cond, then, else_)``
+
+* ``a[[ 8*i + j ]]``: Linear subscripts.
+ See :class:`loopy.symbolic.LinearSubscript`.
+
* ``reductions``
- * duplication of reduction inames
+ See :class:`loopy.symbolic.Reduction`.
+
* ``reduce`` vs ``simul_reduce``
+
* complex-valued arithmetic
+
* tagging of array access and substitution rule use ("$")
+ See :class:`loopy.symbolic.TaggedVariable`.
+
* ``indexof``, ``indexof_vec``
+* ``cast(type, value)``: No parse syntax currently.
+ See :class:`loopy.symbolic.TypeCast`.
+
TODO: Functions
TODO: Reductions
@@ -579,4 +590,15 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to
:members:
:undoc-members:
+Implementation Detail: The Base Array
+-------------------------------------
+
+All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and
+:class:`TemporaryVariable`) derive from a single, shared base array type,
+described next.
+
+.. currentmodule:: loopy.kernel.array
+
+.. autoclass:: ArrayBase
+
.. vim: tw=75:spell:fdm=marker
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index d293e3ebe998a632bd547f94a67e675ff0592bfb..8bdd17b6295e9328bbbb4acbadd2be7e14ae625b 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -74,6 +74,8 @@ Manipulating Instructions
.. autofunction:: add_nosync
+.. autofunction:: add_barrier
+
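
A hedged usage sketch of the new transform (the argument names here are
assumptions; consult the generated documentation above for the exact
signature)::

    import loopy as lp

    # insert a global barrier between two existing instructions, matched by id
    knl = lp.add_barrier(knl, insn_before="id:first", insn_after="id:second")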
Registering Library Routines
----------------------------
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 92ec799f7045cf63dc75d1386d8a51fd7d42954c..69f89548618e86b408a31af240bee84678c859c1 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it.
i: None
---------------------------------------------------------------------------
INSTRUCTIONS:
- [i] out[i] <- 2*a[i] # insn
+ for i
+ out[i] = 2*a[i] {id=insn}
+ end i
---------------------------------------------------------------------------
You'll likely have noticed that there's quite a bit more information here
@@ -1105,11 +1107,12 @@ work item:
:mod:`loopy` supports two kinds of barriers:
-* *Local barriers* ensure consistency of local memory accesses to items within
+* *Local barriers* ensure consistency of memory accesses to items within
*the same* work group. This synchronizes with all instructions in the work
- group.
+ group. The type of memory (local or global) may be specified via
+ :attr:`loopy.kernel.instruction.BarrierInstruction.mem_kind` (see the
+ example after this list).
-* *Global barriers* ensure consistency of global memory accesses
+* *Global barriers* ensure consistency of memory accesses
across *all* work groups, i.e. it synchronizes with every work item
executing the kernel. Note that there is no exact equivalent for
this kind of barrier in OpenCL. [#global-barrier-note]_
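
For example, a local barrier that also flushes global memory is written with
the ``mem_kind`` instruction option (this mirrors the syntax documented on the
barrier instruction class)::

    ... lbarrier {mem_kind=global}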
@@ -1118,14 +1121,17 @@ Once a work item has reached a barrier, it waits for everyone that it
synchronizes with to reach the barrier before continuing. This means that unless
all work items reach the same barrier, the kernel will hang during execution.
+Barrier insertion
+~~~~~~~~~~~~~~~~~
+
By default, :mod:`loopy` inserts local barriers between two instructions when it
detects that a dependency involving local memory may occur across work items. To
see this in action, take a look at the section on :ref:`local_temporaries`.
-In contrast, :mod:`loopy` will *not* insert global barriers automatically.
-Global barriers require manual intervention along with some special
-post-processing which we describe below. Consider the following kernel, which
-attempts to rotate its input to the right by 1 in parallel:
+In contrast, :mod:`loopy` will *not* insert global barriers automatically and
+instead will report an error if it detects the need for a global barrier. As an
+example, consider the following kernel, which attempts to rotate its input to
+the right by 1 in parallel:
.. doctest::
@@ -1153,8 +1159,22 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted:
...
MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed)
-The syntax for a global barrier instruction is ``... gbarrier``. This needs to
-be added between the pair of offending instructions.
+The syntax for inserting a global barrier instruction is
+``... gbarrier``. :mod:`loopy` also supports manually inserting local
+barriers. The syntax for a local barrier instruction is ``... lbarrier``.
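
For illustration, a minimal sketch of a global barrier written inline in
kernel source (instruction ids and dependencies are illustrative, following
the rotation example below)::

    knl = lp.make_kernel(
        "[n] -> {[i] : 0<=i<n}",
        """
        for i
            tmp = arr[i]            {id=maketmp}
            ... gbarrier            {id=barrier, dep=maketmp}
            arr[(i + 1) % n] = tmp  {id=rotate, dep=barrier}
        end
        """)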
+
+Saving temporaries across global barriers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For some platforms (currently only PyOpenCL), :mod:`loopy` implements global
+barriers by splitting the kernel into a host-side kernel and multiple
+device-side kernels. On such platforms, it is necessary to save non-global
+temporaries that are live across kernel calls. This section presents an example
+of how to use :func:`loopy.save_and_reload_temporaries`, which is helpful for
+that purpose.
+
+Let us start with an example. Consider the kernel from above, with a
+``... gbarrier`` instruction already inserted.
.. doctest::
@@ -1175,17 +1195,16 @@ be added between the pair of offending instructions.
... assumptions="n mod 16 = 0")
>>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
-When we try to generate code for this, it will still not work.
+Here is what happens when we try to generate code for the kernel:
>>> cgr = lp.generate_code_v2(knl)
Traceback (most recent call last):
...
MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
-To understand what is going on, you need to know that :mod:`loopy` implements
-global barriers by splitting the kernel into multiple device-side kernels. The
-splitting happens when the instruction schedule is generated. To see the
-schedule, we must first call :func:`loopy.get_one_scheduled_kernel`:
+This error is due to the kernel splitting done by :mod:`loopy`, which
+happens when the instruction schedule is generated. To see the schedule, we
+can call :func:`loopy.get_one_scheduled_kernel`:
>>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
>>> print(knl)
@@ -1196,11 +1215,11 @@ schedule, we must first call :func:`loopy.get_one_scheduled_kernel`:
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
- 1: [maketmp] tmp <- arr[i_inner + i_outer*16]
+ 1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
- 3: ---BARRIER:global---
+ 3: ... gbarrier
4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
- 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+ 5: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate}
6: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
@@ -1234,13 +1253,13 @@ put those instructions into the schedule.
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
- 1: [maketmp] tmp <- arr[i_inner + i_outer*16]
- 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp
+ 1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
+ 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
3: RETURN FROM KERNEL rotate_v2
- 4: ---BARRIER:global---
+ 4: ... gbarrier
5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
- 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]
- 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+ 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload}
+ 7: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate}
8: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
@@ -1280,7 +1299,7 @@ The kernel translates into two OpenCL kernels.
arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp;
}
-Executing the kernel does what we expect.
+Now we can execute the kernel.
>>> arr = cl.array.arange(queue, 16, dtype=np.int32)
>>> print(arr)
diff --git a/loopy/__init__.py b/loopy/__init__.py
index aa1d43172a4bd6472f5974c292c4256946fcf542..7a853d11570226a7a3fe35539f590e7f78ea3f44 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -27,7 +27,7 @@ import six
from six.moves import range, zip
from loopy.symbolic import (
- TaggedVariable, Reduction, LinearSubscript, )
+ TaggedVariable, Reduction, LinearSubscript, TypeCast)
from loopy.diagnostic import LoopyError, LoopyWarning
@@ -112,7 +112,7 @@ from loopy.transform.ilp import realize_ilp
from loopy.transform.batch import to_batched
from loopy.transform.parameter import assume, fix_parameters
from loopy.transform.save import save_and_reload_temporaries
-
+from loopy.transform.add_barrier import add_barrier
# }}}
from loopy.type_inference import infer_unknown_types
@@ -145,7 +145,7 @@ from loopy.target.numba import NumbaTarget, NumbaCudaTarget
__all__ = [
- "TaggedVariable", "Reduction", "LinearSubscript",
+ "TaggedVariable", "Reduction", "LinearSubscript", "TypeCast",
"auto",
@@ -215,6 +215,8 @@ __all__ = [
"save_and_reload_temporaries",
+ "add_barrier",
+
# }}}
"get_dot_dependency_graph",
diff --git a/loopy/check.py b/loopy/check.py
index 741195ae6ac87d01de3a4ac620ce510fd62ff470..7e661b566b15c47ec99e03ffdeb035057602da76 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -60,6 +60,12 @@ def check_identifiers_in_subst_rules(knl):
# {{{ sanity checks run pre-scheduling
+
+# FIXME: Replace with an enum. See
+# https://gitlab.tiker.net/inducer/loopy/issues/85
+VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"])
+
+
def check_insn_attributes(kernel):
all_insn_ids = set(insn.id for insn in kernel.instructions)
@@ -76,6 +82,30 @@ def check_insn_attributes(kernel):
% (insn.id, ", ".join(
insn.depends_on - all_insn_ids)))
+ no_sync_with_insn_ids = set(id for id, scope in insn.no_sync_with)
+ if not no_sync_with_insn_ids <= all_insn_ids:
+ raise LoopyError("insn '%s' has nosync directive with unknown "
+ "instruction ids: %s"
+ % (insn.id,
+ ", ".join(no_sync_with_insn_ids - all_insn_ids)))
+
+ no_sync_with_scopes = set(scope for id, scope in insn.no_sync_with)
+ if not no_sync_with_scopes <= VALID_NOSYNC_SCOPES:
+ raise LoopyError("insn '%s' has invalid nosync scopes: %s"
+ % (insn.id,
+ ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES)))
+
+
+def check_for_duplicate_insn_ids(knl):
+ insn_ids = set()
+
+ for insn in knl.instructions:
+ if not isinstance(insn.id, str):
+ raise LoopyError("instruction id %r is not a string" % insn.id)
+ if insn.id in insn_ids:
+ raise LoopyError("duplicate instruction id: '%s'" % insn.id)
+ insn_ids.add(insn.id)
+
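
For reference, a hedged sketch of the scope-qualified ``no_sync_with`` syntax
that these checks validate in kernel source (instruction ids are illustrative;
the printed form is ``insn_id@scope``)::

    c[i] = a[i]  {id=first}
    d[i] = b[i]  {id=second, no_sync_with=first@local}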
def check_loop_priority_inames_known(kernel):
for prio in kernel.loop_priority:
@@ -114,20 +144,20 @@ def check_for_inactive_iname_access(kernel):
def _is_racing_iname_tag(tv, tag):
from loopy.kernel.data import (temp_var_scope,
- LocalIndexTagBase, GroupIndexTag, ParallelTag, auto)
+ LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)
if tv.scope == temp_var_scope.PRIVATE:
return (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))
elif tv.scope == temp_var_scope.LOCAL:
return (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not isinstance(tag, GroupIndexTag))
elif tv.scope == temp_var_scope.GLOBAL:
- return isinstance(tag, ParallelTag)
+ return isinstance(tag, ConcurrentTag)
elif tv.scope == auto:
raise LoopyError("scope of temp var '%s' has not yet been"
@@ -139,7 +169,7 @@ def _is_racing_iname_tag(tv, tag):
def check_for_write_races(kernel):
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
iname_to_tag = kernel.iname_to_tag.get
for insn in kernel.instructions:
@@ -160,7 +190,7 @@ def check_for_write_races(kernel):
raceable_parallel_insn_inames = set(
iname
for iname in kernel.insn_inames(insn)
- if isinstance(iname_to_tag(iname), ParallelTag))
+ if isinstance(iname_to_tag(iname), ConcurrentTag))
elif assignee_name in kernel.temporary_variables:
temp_var = kernel.temporary_variables[assignee_name]
@@ -200,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel):
def check_for_data_dependent_parallel_bounds(kernel):
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
for i, dom in enumerate(kernel.domains):
dom_inames = set(dom.get_var_names(dim_type.set))
par_inames = set(iname
for iname in dom_inames
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
if not par_inames:
continue
@@ -356,6 +386,7 @@ def pre_schedule_checks(kernel):
try:
logger.debug("%s: pre-schedule check: start" % kernel.name)
+ check_for_duplicate_insn_ids(kernel)
check_for_orphaned_user_hardware_axes(kernel)
check_for_double_use_of_hw_axes(kernel)
check_insn_attributes(kernel)
@@ -370,7 +401,7 @@ def pre_schedule_checks(kernel):
logger.debug("%s: pre-schedule check: done" % kernel.name)
except KeyboardInterrupt:
raise
- except:
+ except Exception:
print(75*"=")
print("failing kernel during pre-schedule check:")
print(75*"=")
@@ -628,7 +659,7 @@ def pre_codegen_checks(kernel):
check_that_shapes_and_strides_are_arguments(kernel)
logger.debug("pre-codegen check %s: done" % kernel.name)
- except:
+ except Exception:
print(75*"=")
print("failing kernel during pre-schedule check:")
print(75*"=")
@@ -677,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
(insn_impl_domain & assumptions)
.project_out_except(insn_inames, [dim_type.set]))
+ from loopy.kernel.instruction import BarrierInstruction
+ from loopy.kernel.data import LocalIndexTag
+ if isinstance(insn, BarrierInstruction):
+ # project out local-id-mapped inames, solves #94 on gitlab
+ non_lid_inames = frozenset(
+ [iname for iname in insn_inames if not isinstance(
+ kernel.iname_to_tag.get(iname), LocalIndexTag)])
+ insn_impl_domain = insn_impl_domain.project_out_except(
+ non_lid_inames, [dim_type.set])
+
insn_domain = kernel.get_inames_domain(insn_inames)
insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
@@ -684,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
.project_out_except(insn_inames, [dim_type.set])
.project_out_except(insn_parameters, [dim_type.param]))
+ if isinstance(insn, BarrierInstruction):
+ # project out local-id-mapped inames, solves #94 on gitlab
+ desired_domain = desired_domain.project_out_except(
+ non_lid_inames, [dim_type.set])
+
insn_impl_domain = (insn_impl_domain
.project_out_except(insn_parameters, [dim_type.param]))
insn_impl_domain, desired_domain = align_two(
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 009dadc1a0d6236f092029dbc03ad0c035c7b8f8..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn
from pytools import ImmutableRecord
import islpy as isl
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -357,8 +357,9 @@ class CodeGenerationState(object):
# }}}
-code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
- key_builder=LoopyKeyBuilder())
+code_gen_cache = WriteOncePersistentDict(
+ "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
+ key_builder=LoopyKeyBuilder())
class PreambleInfo(ImmutableRecord):
@@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord):
.. attribute:: seen_dtypes
.. attribute:: seen_functions
.. attribute:: seen_atomic_dtypes
+ .. attribute:: codegen_state
"""
@@ -495,7 +497,9 @@ def generate_code_v2(kernel):
seen_dtypes=seen_dtypes,
seen_functions=seen_functions,
# a set of LoopyTypes (!)
- seen_atomic_dtypes=seen_atomic_dtypes)
+ seen_atomic_dtypes=seen_atomic_dtypes,
+ codegen_state=codegen_state
+ )
preamble_generators = (kernel.preamble_generators
+ kernel.target.get_device_ast_builder().preamble_generators())
@@ -507,15 +511,15 @@ def generate_code_v2(kernel):
# }}}
# For faster unpickling in the common case when implemented_domains isn't needed.
- from loopy.tools import LazilyUnpicklingDictionary
+ from loopy.tools import LazilyUnpicklingDict
codegen_result = codegen_result.copy(
- implemented_domains=LazilyUnpicklingDictionary(
+ implemented_domains=LazilyUnpicklingDict(
codegen_result.implemented_domains))
logger.info("%s: generate code: done" % kernel.name)
if CACHING_ENABLED:
- code_gen_cache[input_kernel] = codegen_result
+ code_gen_cache.store_if_not_present(input_kernel, codegen_result)
return codegen_result
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai
def get_usable_inames_for_conditional(kernel, sched_index):
from loopy.schedule import (
find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
- from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag
+ from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag
result = find_active_inames_at(kernel, sched_index)
crosses_barrier = has_barrier_within(kernel, sched_index)
@@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
# at the innermost level of nesting.
if (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier)
and not isinstance(tag, IlpBaseTag)
):
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 789c00d33b7bb41816e6901e24046d4b0eefb27d..e3e209726879741c31d686f2a6530e1b7ec67b97 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
kernel = codegen_state.kernel
- from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
+ from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag
from loopy.schedule import find_active_inames_at, has_barrier_within
result = find_active_inames_at(kernel, sched_index)
@@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
has_barrier = has_barrier_within(kernel, sched_index)
for iname, tag in six.iteritems(kernel.iname_to_tag):
- if (isinstance(tag, HardwareParallelTag)
+ if (isinstance(tag, HardwareConcurrentTag)
and codegen_state.is_generating_device_code):
if not has_barrier or not isinstance(tag, LocalIndexTag):
result.add(iname)
@@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index):
generate_sequential_loop_dim_code)
from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
- LoopedIlpTag, VectorizeTag)
+ LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag)
if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
func = generate_unroll_loop
elif isinstance(tag, VectorizeTag):
func = generate_vectorize_loop
- elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
+ elif tag is None or isinstance(tag, (
+ LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)):
func = generate_sequential_loop_dim_code
else:
raise RuntimeError("encountered (invalid) EnterLoop "
@@ -155,7 +156,8 @@ def generate_code_for_sched_index(codegen_state, sched_index):
if codegen_state.is_generating_device_code:
barrier_ast = codegen_state.ast_builder.emit_barrier(
- sched_item.kind, sched_item.comment)
+ sched_item.synchronization_kind, sched_item.mem_kind,
+ sched_item.comment)
if sched_item.originating_insn_id:
return CodeGenerationResult.new(
codegen_state,
@@ -166,7 +168,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
return barrier_ast
else:
# host code
- if sched_item.kind in ["global", "local"]:
+ if sched_item.synchronization_kind in ["global", "local"]:
# host code is assumed globally and locally synchronous
return CodeGenerationResult(
host_program=None,
@@ -175,8 +177,9 @@ def generate_code_for_sched_index(codegen_state, sched_index):
implemented_data_info=codegen_state.implemented_data_info)
else:
- raise LoopyError("do not know how to emit code for barrier kind '%s'"
- "in host code" % sched_item.kind)
+ raise LoopyError("do not know how to emit code for barrier "
+ "synchronization kind '%s'" "in host code"
+ % sched_item.synchronization_kind)
# }}}
@@ -240,6 +243,15 @@ def build_loop_nest(codegen_state, schedule_index):
kernel = codegen_state.kernel
+ # If the AST builder does not implement conditionals, we can save
+ # ourselves the work of hoisting conditionals and recurse directly.
+ if not codegen_state.ast_builder.can_implement_conditionals:
+ result = []
+ inner = generate_code_for_sched_index(codegen_state, schedule_index)
+ if inner is not None:
+ result.append(inner)
+ return merge_codegen_results(codegen_state, result)
+
# {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
# i.e. go up to the next LeaveLoop, and skip over inner loops.
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 0110a06095fa0bd690045f050136027d7bed3a28..1db7b0445efd2a2e27e761164fa919647df37a07 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -90,7 +90,7 @@ def get_slab_decomposition(kernel, iname):
iname_rel_aff(space,
iname, "<=", upper_bound_aff-upper_incr)))
else:
- lower_slab = None
+ upper_slab = None
slabs = []
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
kernel = codegen_state.kernel
from loopy.kernel.data import (
- UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)
+ UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag)
from loopy.schedule import get_insn_ids_for_block_at
insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
hw_inames_left = [iname
for iname in all_inames_by_insns
- if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
+ if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)]
if not hw_inames_left:
return next_func(codegen_state)
@@ -446,7 +446,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
from loopy.symbolic import pw_aff_to_expr
- if ubound.is_equal(lbound):
+ if impl_ubound.is_equal(impl_lbound):
# single-trip, generate just a variable assignment, not a loop
inner = merge_codegen_results(codegen_state, [
astb.emit_initializer(
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 5f0884fd44ed5064f3f195d103b164f2163d1d19..5a747d070a47ff89336c22c8237ff03e567d0a8a 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -203,7 +203,7 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context)
if len(pieces) == 1:
(_, result), = pieces
if constants_only and not result.is_cst():
- raise ValueError("a numeric %s was not found for PwAff '%s'"
+ raise StaticValueFindingError("a numeric %s was not found for PwAff '%s'"
% (what, pw_aff))
return result
@@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set):
from loopy.symbolic import aff_from_expr
try:
aff = aff_from_expr(space, -expr-1)
- except:
+ except Exception:
return None
expr_neg_set = isl.BasicSet.universe(space).add_constraint(
isl.Constraint.inequality_from_aff(aff))
@@ -616,10 +616,12 @@ def get_simple_strides(bset, key_by="name"):
# recognizes constraints of the form
# -i0 + 2*floor((i0)/2) == 0
- if aff.dim(dim_type.div) != 1:
+ divs_with_coeffs = _get_indices_and_coeffs(aff, [dim_type.div])
+ if len(divs_with_coeffs) != 1:
continue
- idiv = 0
+ (_, idiv, div_coeff), = divs_with_coeffs
+
div = aff.get_div(idiv)
# check for sub-divs
@@ -630,7 +632,7 @@ def get_simple_strides(bset, key_by="name"):
denom = div.get_denominator_val().to_python()
# if the coefficient in front of the div is not the same as the denominator
- if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one():
+ if not div_coeff.div(denom).is_one():
# not supported
continue
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 622f5e49be1e40b4156113d92907fe8b1d9fb859..88a5717642af6d9ebc1bd7770936ae44e8cbf44b 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -40,6 +40,8 @@ from loopy.library.function import (
single_arg_function_mangler)
from loopy.diagnostic import CannotBranchDomainTree, LoopyError
+from loopy.tools import natsorted
+from loopy.diagnostic import StaticValueFindingError
# {{{ unique var names
@@ -212,45 +214,17 @@ class LoopKernel(ImmutableRecordWithoutPickling):
state=kernel_state.INITIAL,
target=None,
- # When kernels get intersected in slab decomposition,
- # their grid sizes shouldn't change. This provides
- # a way to forward sub-kernel grid size requests.
overridden_get_grid_sizes_for_insn_ids=None):
+ """
+ :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get
+ intersected in slab decomposition, their grid sizes shouldn't
+ change. This provides a way to forward sub-kernel grid size requests.
+ """
if cache_manager is None:
from loopy.kernel.tools import SetOperationCacheManager
cache_manager = SetOperationCacheManager()
- # {{{ make instruction ids unique
-
- from loopy.kernel.creation import UniqueName
-
- insn_ids = set()
- for insn in instructions:
- if insn.id is not None and not isinstance(insn.id, UniqueName):
- if insn.id in insn_ids:
- raise RuntimeError("duplicate instruction id: %s" % insn.id)
- insn_ids.add(insn.id)
-
- insn_id_gen = UniqueNameGenerator(insn_ids)
-
- new_instructions = []
-
- for insn in instructions:
- if insn.id is None:
- new_instructions.append(
- insn.copy(id=insn_id_gen("insn")))
- elif isinstance(insn.id, UniqueName):
- new_instructions.append(
- insn.copy(id=insn_id_gen(insn.id.name)))
- else:
- new_instructions.append(insn)
-
- instructions = new_instructions
- del new_instructions
-
- # }}}
-
# {{{ process assumptions
if assumptions is None:
@@ -729,12 +703,12 @@ class LoopKernel(ImmutableRecordWithoutPickling):
tag_key_uses = {}
- from loopy.kernel.data import HardwareParallelTag
+ from loopy.kernel.data import HardwareConcurrentTag
for iname in cond_inames:
tag = self.iname_to_tag.get(iname)
- if isinstance(tag, HardwareParallelTag):
+ if isinstance(tag, HardwareConcurrentTag):
tag_key_uses.setdefault(tag.key, []).append(iname)
multi_use_keys = set(
@@ -744,7 +718,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
multi_use_inames = set()
for iname in cond_inames:
tag = self.iname_to_tag.get(iname)
- if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
+ if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys:
multi_use_inames.add(iname)
return frozenset(cond_inames - multi_use_inames)
@@ -986,8 +960,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
try:
# insist block size is constant
size = static_max_of_pw_aff(size,
- constants_only=isinstance(tag, LocalIndexTag))
- except ValueError:
+ constants_only=isinstance(tag, LocalIndexTag),
+ context=self.assumptions)
+ except StaticValueFindingError:
pass
tgt_dict[tag.axis] = size
@@ -1156,20 +1131,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
else:
sep = []
- def natorder(key):
- # Return natural ordering for strings, as opposed to dictionary order.
- # E.g. will result in
- # 'abc1' < 'abc9' < 'abc10'
- # rather than
- # 'abc1' < 'abc10' < 'abc9'
- # Based on
- # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
- import re
- return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
-
- def natsorted(seq, key=lambda x: x):
- return sorted(seq, key=lambda y: natorder(key(y)))
-
if "name" in what:
lines.extend(sep)
lines.append("KERNEL: " + kernel.name)
@@ -1207,7 +1168,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
if "rules" in what and kernel.substitutions:
lines.extend(sep)
if show_labels:
- lines.append("SUBSTIUTION RULES:")
+ lines.append("SUBSTITUTION RULES:")
for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
lines.append(str(kernel.substitutions[rule_name]))
@@ -1215,113 +1176,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
lines.extend(sep)
if show_labels:
lines.append("INSTRUCTIONS:")
- loop_list_width = 35
-
- # {{{ topological sort
- printed_insn_ids = set()
- printed_insn_order = []
-
- def insert_insn_into_order(insn):
- if insn.id in printed_insn_ids:
- return
- printed_insn_ids.add(insn.id)
-
- for dep_id in natsorted(insn.depends_on):
- insert_insn_into_order(kernel.id_to_insn[dep_id])
-
- printed_insn_order.append(insn)
-
- for insn in kernel.instructions:
- insert_insn_into_order(insn)
-
- # }}}
-
- import loopy as lp
-
- Fore = self.options._fore # noqa
- Style = self.options._style # noqa
-
- from loopy.kernel.tools import draw_dependencies_as_unicode_arrows
- for insn, (arrows, extender) in zip(
- printed_insn_order,
- draw_dependencies_as_unicode_arrows(
- printed_insn_order, fore=Fore, style=Style)):
-
- if isinstance(insn, lp.MultiAssignmentBase):
- lhs = ", ".join(str(a) for a in insn.assignees)
- rhs = str(insn.expression)
- trailing = []
- elif isinstance(insn, lp.CInstruction):
- lhs = ", ".join(str(a) for a in insn.assignees)
- rhs = "CODE(%s|%s)" % (
- ", ".join(str(x) for x in insn.read_variables),
- ", ".join("%s=%s" % (name, expr)
- for name, expr in insn.iname_exprs))
-
- trailing = [" "+l for l in insn.code.split("\n")]
- elif isinstance(insn, lp.BarrierInstruction):
- lhs = ""
- rhs = "... %sbarrier" % insn.kind[0]
- trailing = []
-
- elif isinstance(insn, lp.NoOpInstruction):
- lhs = ""
- rhs = "... nop"
- trailing = []
-
- else:
- raise LoopyError("unexpected instruction type: %s"
- % type(insn).__name__)
-
- order = self._get_iname_order_for_printing()
- loop_list = ",".join(
- sorted(kernel.insn_inames(insn), key=lambda iname: order[iname]))
-
- options = [Fore.GREEN+insn.id+Style.RESET_ALL]
- if insn.priority:
- options.append("priority=%d" % insn.priority)
- if insn.tags:
- options.append("tags=%s" % ":".join(insn.tags))
- if isinstance(insn, lp.Assignment) and insn.atomicity:
- options.append("atomic=%s" % ":".join(
- str(a) for a in insn.atomicity))
- if insn.groups:
- options.append("groups=%s" % ":".join(insn.groups))
- if insn.conflicts_with_groups:
- options.append(
- "conflicts=%s" % ":".join(insn.conflicts_with_groups))
- if insn.no_sync_with:
- options.append("no_sync_with=%s" % ":".join(
- "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
-
- if lhs:
- core = "%s <- %s" % (
- Fore.CYAN+lhs+Style.RESET_ALL,
- Fore.MAGENTA+rhs+Style.RESET_ALL,
- )
- else:
- core = Fore.MAGENTA+rhs+Style.RESET_ALL
-
- if len(loop_list) > loop_list_width:
- lines.append("%s [%s]" % (arrows, loop_list))
- lines.append("%s %s%s # %s" % (
- extender,
- (loop_list_width+2)*" ",
- core,
- ", ".join(options)))
- else:
- lines.append("%s [%s]%s%s # %s" % (
- arrows,
- loop_list, " "*(loop_list_width-len(loop_list)),
- core,
- ",".join(options)))
-
- lines.extend(trailing)
-
- if insn.predicates:
- lines.append(10*" " + "if (%s)" % " && ".join(
- [str(x) for x in insn.predicates]))
+ from loopy.kernel.tools import stringify_instruction_list
+ lines.extend(stringify_instruction_list(kernel))
dep_lines = []
for insn in kernel.instructions:
@@ -1502,6 +1359,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
return hash(key_hash.digest())
def __eq__(self, other):
+ if self is other:
+ return True
+
if not isinstance(other, LoopKernel):
return False
@@ -1515,7 +1375,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
return False
elif field_name == "assumptions":
- if not self.assumptions.plain_is_equal(other.assumptions):
+ if not (
+ self.assumptions.plain_is_equal(other.assumptions)
+ or self.assumptions.is_equal(other.assumptions)):
return False
elif getattr(self, field_name) != getattr(other, field_name):
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 531cc822e1bc76573ef6e0812970d16bd6df0b17..5d4240b9ab3e1ce2ad356a93b5e21b3bbf4d499e 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -567,6 +567,14 @@ class ArrayBase(ImmutableRecord):
informational/documentational purpose. On occasion, they are used
to generate more informative names than could be achieved by
axis numbers.
+
+ .. automethod:: __init__
+ .. automethod:: __eq__
+ .. automethod:: num_user_axes
+ .. automethod:: num_target_axes
+ .. automethod:: vector_size
+
+ (supports persistent hashing)
"""
# Note that order may also wind up in attributes, if the
@@ -579,7 +587,8 @@ class ArrayBase(ImmutableRecord):
target=None,
**kwargs):
"""
- All of the following are optional. Specify either strides or shape.
+ All of the following (except *name*) are optional.
+ Specify either strides or shape.
:arg name: May contain multiple names separated by
commas, in which case multiple arguments,
@@ -643,8 +652,9 @@ class ArrayBase(ImmutableRecord):
:arg offset: Offset from the beginning of the buffer to the point from
which the strides are counted. May be one of
- * 0
+ * 0 or None
* a string (that is interpreted as an argument name).
+ * a pymbolic expression
* :class:`loopy.auto`, in which case an offset argument
is added automatically, immediately following this argument.
:class:`loopy.CompiledKernel` is even smarter in its treatment of
@@ -877,6 +887,7 @@ class ArrayBase(ImmutableRecord):
:class:`pytools.persistent_dict.PersistentDict`.
"""
+ key_builder.rec(key_hash, type(self).__name__.encode("utf-8"))
key_builder.rec(key_hash, self.name)
key_builder.rec(key_hash, self.dtype)
self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape)
@@ -1039,7 +1050,9 @@ class ArrayBase(ImmutableRecord):
is_written=is_written)
- if self.offset:
+ import loopy as lp
+
+ if self.offset is lp.auto:
offset_name = full_name+"_offset"
yield ImplementedDataInfo(
target=target,
@@ -1205,12 +1218,16 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
return result
def apply_offset(sub):
- if ary.offset:
- offset_name = ary.offset
- if offset_name is lp.auto:
- offset_name = array_name+"_offset"
+ import loopy as lp
- return var(offset_name) + sub
+ if ary.offset:
+ if ary.offset is lp.auto:
+ return var(array_name+"_offset") + sub
+ elif isinstance(ary.offset, str):
+ return var(ary.offset) + sub
+ else:
+ # assume it's an expression
+ return ary.offset + sub
else:
return sub
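
To illustrate the offset forms this change supports, a hedged sketch of
argument declarations (names are illustrative)::

    import numpy as np
    import loopy as lp
    from pymbolic import var

    lp.GlobalArg("a", np.float64, shape=("n",), offset=lp.auto)     # automatic offset argument
    lp.GlobalArg("b", np.float64, shape=("n",), offset="b_ofs")     # named offset argument
    lp.GlobalArg("c", np.float64, shape=("n",), offset=4*var("k"))  # pymbolic expression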
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 89cb5f26a4940656cca1ab09841311148e113275..fb935476d54b3f9eb0a3bf858c883fe4c75eaa5a 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -51,9 +51,14 @@ logger = logging.getLogger(__name__)
_IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")
+# source: check_keywords() in isl_stream.c, ISL version 0.17
+_ISL_KEYWORDS = frozenset("""
+ exists and or implies not infty infinity NaN min max rat true false ceild
+ floord mod ceil floor""".split())
+
def _gather_isl_identifiers(s):
- return set(_IDENTIFIER_RE.findall(s)) - set(["and", "or", "exists"])
+ return set(_IDENTIFIER_RE.findall(s)) - _ISL_KEYWORDS
class UniqueName:
@@ -352,6 +357,14 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
% v)
del assignee_name
+ elif opt_key == "mem_kind":
+ opt_value = opt_value.lower().strip()
+ if opt_value not in ['local', 'global']:
+ raise LoopyError("Unknown memory synchronization type %s specified"
+ " expected, 'local' or 'global'."
+ % opt_value)
+ result["mem_kind"] = opt_value
+
else:
raise ValueError(
"unrecognized instruction option '%s' "
@@ -420,6 +433,17 @@ SUBST_RE = re.compile(
r"^\s*(?P.+?)\s*:=\s*(?P.+)\s*$")
+def check_illegal_options(insn_options, insn_type):
+ illegal_options = []
+ if insn_type not in ['gbarrier', 'lbarrier']:
+ illegal_options.append('mem_kind')
+
+ bad_options = [x for x in illegal_options if x in insn_options]
+ if bad_options:
+ raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" %
+ ', '.join(bad_options), insn_type)
+
+
def parse_insn(groups, insn_options):
"""
:return: a tuple ``(insn, inames_to_dup)``, where insn is a
@@ -434,7 +458,7 @@ def parse_insn(groups, insn_options):
if "lhs" in groups:
try:
lhs = parse(groups["lhs"])
- except:
+ except Exception:
print("While parsing left hand side '%s', "
"the following error occurred:" % groups["lhs"])
raise
@@ -443,7 +467,7 @@ def parse_insn(groups, insn_options):
try:
rhs = parse(groups["rhs"])
- except:
+ except Exception:
print("While parsing right hand side '%s', "
"the following error occurred:" % groups["rhs"])
raise
@@ -493,6 +517,9 @@ def parse_insn(groups, insn_options):
groups["options"],
assignee_names=assignee_names)
+ # check for bad options
+ check_illegal_options(insn_options, 'assignment')
+
insn_id = insn_options.pop("insn_id", None)
inames_to_dup = insn_options.pop("inames_to_dup", [])
@@ -517,14 +544,14 @@ def parse_subst_rule(groups):
from loopy.symbolic import parse
try:
lhs = parse(groups["lhs"])
- except:
+ except Exception:
print("While parsing left hand side '%s', "
"the following error occurred:" % groups["lhs"])
raise
try:
rhs = parse(groups["rhs"])
- except:
+ except Exception:
print("While parsing right hand side '%s', "
"the following error occurred:" % groups["rhs"])
raise
@@ -578,13 +605,15 @@ def parse_special_insn(groups, insn_options):
from loopy.kernel.instruction import NoOpInstruction, BarrierInstruction
special_insn_kind = groups["kind"]
+ # check for bad options
+ check_illegal_options(insn_options, special_insn_kind)
if special_insn_kind == "gbarrier":
cls = BarrierInstruction
- kwargs["kind"] = "global"
+ kwargs["synchronization_kind"] = "global"
elif special_insn_kind == "lbarrier":
cls = BarrierInstruction
- kwargs["kind"] = "local"
+ kwargs["synchronization_kind"] = "local"
elif special_insn_kind == "nop":
cls = NoOpInstruction
else:
@@ -792,6 +821,8 @@ def parse_instructions(instructions, defines):
parse_insn_options(
insn_options_stack[-1],
with_options_match.group("options")))
+ # check for bad options
+ check_illegal_options(insn_options_stack[-1], 'with-block')
continue
for_match = FOR_RE.match(insn)
@@ -896,7 +927,8 @@ def parse_instructions(instructions, defines):
obj = insn_options_stack.pop()
#if this object is the end of an if statement
if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
- if_predicates_stack[-1]["insn_predicates"]:
+ if_predicates_stack[-1]["insn_predicates"] and\
+ obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
if_predicates_stack.pop()
continue
@@ -991,7 +1023,7 @@ def parse_domains(domains, defines):
try:
dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom)
- except:
+ except Exception:
print("failed to parse domain '%s'" % dom)
raise
else:
@@ -1859,6 +1891,13 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
:arg seq_dependencies: If *True*, dependencies that sequentially
connect the given *instructions* will be added. Defaults to
*False*.
+ :arg fixed_parameters: A dictionary of *name*/*value* pairs, where *name*
+ will be fixed to *value*. *name* may refer to :ref:`domain-parameters`
+ or :ref:`arguments`. See also :func:`loopy.fix_parameters`.
+
+ .. versionchanged:: 2017.2
+
+ *fixed_parameters* added.
.. versionchanged:: 2016.3
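
A hedged sketch of the new argument in use (assuming ``import loopy as lp``)::

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        fixed_parameters=dict(n=128))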
@@ -1876,6 +1915,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
flags = kwargs.pop("flags", None)
target = kwargs.pop("target", None)
seq_dependencies = kwargs.pop("seq_dependencies", False)
+ fixed_parameters = kwargs.pop("fixed_parameters", {})
if defines:
from warnings import warn
@@ -1976,6 +2016,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
target=target,
**kwargs)
+ from loopy.transform.instruction import uniquify_instruction_ids
+ knl = uniquify_instruction_ids(knl)
+ from loopy.check import check_for_duplicate_insn_ids
+ check_for_duplicate_insn_ids(knl)
+
if seq_dependencies:
knl = add_sequential_dependencies(knl)
@@ -1996,11 +2041,14 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
# -------------------------------------------------------------------------
# Must create temporaries before inferring inames (because those temporaries
# mediate dependencies that are then used for iname propagation.)
+ # Must create temporaries before fixing parameters.
# -------------------------------------------------------------------------
knl = add_used_inames(knl)
# NOTE: add_inferred_inames will be phased out and throws warnings if it
# does something.
knl = add_inferred_inames(knl)
+ from loopy.transform.parameter import fix_parameters
+ knl = fix_parameters(knl, **fixed_parameters)
# -------------------------------------------------------------------------
# Ordering dependency:
# -------------------------------------------------------------------------
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord):
return type(self).__name__
-class ParallelTag(IndexTag):
+class ConcurrentTag(IndexTag):
pass
-class HardwareParallelTag(ParallelTag):
+class HardwareConcurrentTag(ConcurrentTag):
pass
+# deprecated aliases
+ParallelTag = ConcurrentTag
+HardwareParallelTag = HardwareConcurrentTag
+
+
class UniqueTag(IndexTag):
pass
@@ -105,11 +110,11 @@ class AxisTag(UniqueTag):
self.print_name, self.axis)
-class GroupIndexTag(HardwareParallelTag, AxisTag):
+class GroupIndexTag(HardwareConcurrentTag, AxisTag):
print_name = "g"
-class LocalIndexTagBase(HardwareParallelTag):
+class LocalIndexTagBase(HardwareConcurrentTag):
pass
@@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
# {{{ ilp-like
-class IlpBaseTag(ParallelTag):
+class IlpBaseTag(ConcurrentTag):
pass
@@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag):
return "forceseq"
+class InOrderSequentialSequentialTag(IndexTag):
+ def __str__(self):
+ return "ord"
+
+
def parse_tag(tag):
if tag is None:
return tag
@@ -173,6 +183,8 @@ def parse_tag(tag):
if tag == "for":
return None
+ elif tag == "ord":
+ return InOrderSequentialSequentialTag()
elif tag in ["unr"]:
return UnrollTag()
elif tag in ["vec"]:
@@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase):
A :class:`bool` indicating whether the variable may be written during
its lifetime. If *True*, *initializer* must be given.
+
+ .. attribute:: _base_storage_access_may_be_aliasing
+
+ Whether the temporary is used to alias the underlying base storage.
+ Defaults to *False*. If *False*, C-based code generators will declare
+ the temporary as a ``restrict`` const pointer to the base storage
+ memory location. If *True*, the restrict part is omitted on this
+ declaration.
"""
min_target_axes = 0
@@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase):
"base_storage",
"initializer",
"read_only",
+ "_base_storage_access_may_be_aliasing",
]
def __init__(self, name, dtype=None, shape=(), scope=auto,
dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
base_indices=None, storage_shape=None,
- base_storage=None, initializer=None, read_only=False, **kwargs):
+ base_storage=None, initializer=None, read_only=False,
+ _base_storage_access_may_be_aliasing=False, **kwargs):
"""
:arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
:arg shape: :class:`loopy.auto` or a shape tuple
@@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase):
"mutually exclusive"
% name)
+ if base_storage is None and _base_storage_access_may_be_aliasing:
+ raise LoopyError(
+ "temporary variable '%s': "
+ "_base_storage_access_may_be_aliasing option, but no "
+ "base_storage given!"
+ % name)
+
ArrayBase.__init__(self, name=intern(name),
dtype=dtype, shape=shape,
dim_tags=dim_tags, offset=offset, dim_names=dim_names,
@@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase):
base_storage=base_storage,
initializer=initializer,
read_only=read_only,
+ _base_storage_access_may_be_aliasing=(
+ _base_storage_access_may_be_aliasing),
**kwargs)
@property
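
A hedged construction sketch for the new flag (variable and base-storage names
are illustrative)::

    import numpy as np
    import loopy as lp

    tv = lp.TemporaryVariable(
        "t_alias", dtype=np.float32, shape=(16,),
        base_storage="base_buf",
        _base_storage_access_may_be_aliasing=True)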
@@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase):
and (
(self.initializer is None and other.initializer is None)
or np.array_equal(self.initializer, other.initializer))
- and self.read_only == other.read_only)
+ and self.read_only == other.read_only
+ and (self._base_storage_access_may_be_aliasing
+ == other._base_storage_access_may_be_aliasing)
+ )
def update_persistent_hash(self, key_hash, key_builder):
"""Custom hash computation function for use with
@@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase):
self.update_persistent_hash_for_shape(key_hash, key_builder,
self.storage_shape)
key_builder.rec(key_hash, self.base_indices)
+ key_builder.rec(key_hash, self.scope)
+ key_builder.rec(key_hash, self.base_storage)
initializer = self.initializer
if initializer is not None:
@@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase):
key_builder.rec(key_hash, initializer)
key_builder.rec(key_hash, self.read_only)
+ key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
# }}}
+def iname_tag_to_temp_var_scope(iname_tag):
+ iname_tag = parse_tag(iname_tag)
+
+ if isinstance(iname_tag, GroupIndexTag):
+ return temp_var_scope.GLOBAL
+ elif isinstance(iname_tag, LocalIndexTag):
+ return temp_var_scope.LOCAL
+ else:
+ return temp_var_scope.PRIVATE
+
+
# {{{ substitution rule
class SubstitutionRule(ImmutableRecord):
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index d5c388af60a39987c09092fc93325f067a8f4cf7..dbd99e85016b00b3df4827ad7999e7b57e58af24 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -152,6 +152,12 @@ class InstructionBase(ImmutableRecord):
"within_inames_is_final within_inames "
"priority boostable boostable_into".split())
+ # Names of fields that are pymbolic expressions. Needed for key building
+ pymbolic_fields = set("")
+
+ # Names of fields that are sets of pymbolic expressions. Needed for key building
+ pymbolic_set_fields = set(["predicates"])
+
def __init__(self, id, depends_on, depends_on_is_final,
groups, conflicts_with_groups,
no_sync_with,
@@ -407,7 +413,27 @@ class InstructionBase(ImmutableRecord):
return result
- # {{{ comparison, hashing
+ # {{{ hashing and key building
+
+ @property
+ @memoize_method
+ def _key_builder(self):
+ from loopy.tools import LoopyEqKeyBuilder
+ key_builder = LoopyEqKeyBuilder()
+ key_builder.update_for_class(self.__class__)
+
+ for field_name in self.fields:
+ field_value = getattr(self, field_name)
+ if field_name in self.pymbolic_fields:
+ key_builder.update_for_pymbolic_field(field_name, field_value)
+ elif field_name in self.pymbolic_set_fields:
+ # First sort the fields, as a canonical form
+ items = tuple(sorted(field_value, key=str))
+ key_builder.update_for_pymbolic_field(field_name, items)
+ else:
+ key_builder.update_for_field(field_name, field_value)
+
+ return key_builder
def update_persistent_hash(self, key_hash, key_builder):
"""Custom hash computation function for use with
@@ -416,9 +442,7 @@ class InstructionBase(ImmutableRecord):
Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
"""
- # Order matters for hash forming--sort the field names
- for field_name in sorted(self.fields):
- key_builder.rec(key_hash, getattr(self, field_name))
+ key_builder.rec(key_hash, self._key_builder.hash_key())
# }}}
@@ -648,6 +672,7 @@ class MultiAssignmentBase(InstructionBase):
"""An assignment instruction with an expression as a right-hand side."""
fields = InstructionBase.fields | set(["expression"])
+ pymbolic_fields = InstructionBase.pymbolic_fields | set(["expression"])
@memoize_method
def read_dependency_names(self):
@@ -734,6 +759,7 @@ class Assignment(MultiAssignmentBase):
fields = MultiAssignmentBase.fields | \
set("assignee temp_var_type atomicity".split())
+ pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"])
def __init__(self,
assignee, expression,
@@ -818,26 +844,6 @@ class Assignment(MultiAssignmentBase):
result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
return result
- def update_persistent_hash(self, key_hash, key_builder):
- """Custom hash computation function for use with
- :class:`pytools.persistent_dict.PersistentDict`.
-
- Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
- """
-
- # Order matters for hash forming--sort the fields.
- for field_name in sorted(self.fields):
- if field_name in ["assignee", "expression"]:
- key_builder.update_for_pymbolic_expression(
- key_hash, getattr(self, field_name))
- elif field_name == "predicates":
- preds = sorted(self.predicates, key=str)
- for pred in preds:
- key_builder.update_for_pymbolic_expression(
- key_hash, pred)
- else:
- key_builder.rec(key_hash, getattr(self, field_name))
-
# {{{ for interface uniformity with CallInstruction
@property
@@ -886,6 +892,7 @@ class CallInstruction(MultiAssignmentBase):
fields = MultiAssignmentBase.fields | \
set("assignees temp_var_types".split())
+ pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignees"])
def __init__(self,
assignees, expression,
@@ -987,26 +994,6 @@ class CallInstruction(MultiAssignmentBase):
result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
return result
- def update_persistent_hash(self, key_hash, key_builder):
- """Custom hash computation function for use with
- :class:`pytools.persistent_dict.PersistentDict`.
-
- Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
- """
-
- # Order matters for hash forming--sort the fields.
- for field_name in sorted(self.fields):
- if field_name in ["assignees", "expression"]:
- key_builder.update_for_pymbolic_expression(
- key_hash, getattr(self, field_name))
- elif field_name == "predicates":
- preds = sorted(self.predicates, key=str)
- for pred in preds:
- key_builder.update_for_pymbolic_expression(
- key_hash, pred)
- else:
- key_builder.rec(key_hash, getattr(self, field_name))
-
@property
def atomicity(self):
# Function calls can impossibly be atomic, and even the result assignment
@@ -1086,6 +1073,10 @@ class CInstruction(InstructionBase):
fields = InstructionBase.fields | \
set("iname_exprs code read_variables assignees".split())
+ pymbolic_fields = InstructionBase.pymbolic_fields | \
+ set("iname_exprs assignees".split())
+ pymbolic_set_fields = InstructionBase.pymbolic_set_fields | \
+ set(["read_variables"])
def __init__(self,
iname_exprs, code,
@@ -1210,25 +1201,6 @@ class CInstruction(InstructionBase):
return first_line + "\n " + "\n ".join(
self.code.split("\n"))
- def update_persistent_hash(self, key_hash, key_builder):
- """Custom hash computation function for use with
- :class:`pytools.persistent_dict.PersistentDict`.
-
- Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
- """
-
- # Order matters for hash forming--sort the fields.
- for field_name in sorted(self.fields):
- if field_name == "assignees":
- for a in self.assignees:
- key_builder.update_for_pymbolic_expression(key_hash, a)
- elif field_name == "iname_exprs":
- for name, val in self.iname_exprs:
- key_builder.rec(key_hash, name)
- key_builder.update_for_pymbolic_expression(key_hash, val)
- else:
- key_builder.rec(key_hash, getattr(self, field_name))
-
# }}}
@@ -1308,19 +1280,29 @@ class NoOpInstruction(_DataObliviousInstruction):
class BarrierInstruction(_DataObliviousInstruction):
"""An instruction that requires synchronization with all
- concurrent work items of :attr:`kind`.
+ concurrent work items of :attr:`synchronization_kind`.
- .. attribute:: kind
+ .. attribute:: synchronization_kind
A string, ``"global"`` or ``"local"``.
+ .. attribute:: mem_kind
+
+ A string, ``"global"`` or ``"local"``. Chooses which memory type to
+       synchronize, for targets that require this (e.g. OpenCL).
+
The textual syntax in a :mod:`loopy` kernel is::
... gbarrier
... lbarrier
+
+ Note that the memory type :attr:`mem_kind` can be specified for local barriers::
+
+ ... lbarrier {mem_kind=global}
"""
- fields = _DataObliviousInstruction.fields | set(["kind"])
+ fields = _DataObliviousInstruction.fields | set(["synchronization_kind",
+ "mem_kind"])
def __init__(self, id, depends_on=None, depends_on_is_final=None,
groups=None, conflicts_with_groups=None,
@@ -1328,7 +1310,8 @@ class BarrierInstruction(_DataObliviousInstruction):
within_inames_is_final=None, within_inames=None,
priority=None,
boostable=None, boostable_into=None,
- predicates=None, tags=None, kind="global"):
+ predicates=None, tags=None, synchronization_kind="global",
+ mem_kind="local"):
if predicates:
raise LoopyError("conditional barriers are not supported")
@@ -1346,20 +1329,32 @@ class BarrierInstruction(_DataObliviousInstruction):
boostable=boostable,
boostable_into=boostable_into,
predicates=predicates,
- tags=tags,
+ tags=tags
)
- self.kind = kind
+ self.synchronization_kind = synchronization_kind
+ self.mem_kind = mem_kind
def __str__(self):
- first_line = "%s: ... %sbarrier" % (self.id, self.kind[0])
+ first_line = "%s: ... %sbarrier" % (self.id, self.synchronization_kind[0])
options = self.get_str_options()
+ if self.synchronization_kind == "local":
+ # add the memory kind
+ options += ['mem_kind={}'.format(self.mem_kind)]
if options:
first_line += " {%s}" % (": ".join(options))
return first_line
+ @property
+ def kind(self):
+ from warnings import warn
+ warn("BarrierInstruction.kind is deprecated, use synchronization_kind "
+ "instead", DeprecationWarning, stacklevel=2)
+ return self.synchronization_kind
+
# }}}
+
# vim: foldmethod=marker
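
The ``kind``-to-``synchronization_kind`` rename above keeps a deprecation
shim. A minimal self-contained sketch (a toy class, not the loopy one) of how
that shim behaves::

    import warnings

    class BarrierInstruction:
        def __init__(self, synchronization_kind="global", mem_kind="local"):
            self.synchronization_kind = synchronization_kind
            self.mem_kind = mem_kind

        @property
        def kind(self):
            warnings.warn(
                "BarrierInstruction.kind is deprecated, use "
                "synchronization_kind instead", DeprecationWarning,
                stacklevel=2)
            return self.synchronization_kind

    b = BarrierInstruction(synchronization_kind="local", mem_kind="global")
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        assert b.kind == "local"  # old spelling still answers...
        assert issubclass(w[-1].category, DeprecationWarning)  # ...but warns
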
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..a65e7fb4ceefd28a909dcb6cee24ea437f15a60e 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -35,7 +35,7 @@ import islpy as isl
from islpy import dim_type
from loopy.diagnostic import LoopyError, warn_with_kernel
from pytools import memoize_on_first_arg
-
+from loopy.tools import natsorted
import logging
logger = logging.getLogger(__name__)
@@ -620,11 +620,11 @@ class DomainParameterFinder(object):
if dep.name in param_names:
from pymbolic.algorithm import solve_affine_equations_for
try:
- # friggin' overkill :)
+ # overkill :)
param_expr = solve_affine_equations_for(
[dep.name], [(shape_i, var("shape_i"))]
)[dep.name]
- except:
+ except Exception:
# went wrong? oh well
pass
else:
@@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name):
if n_axes == 1:
# Leave shape undetermined--we can live with that for 1D.
- shape = (None,)
+ shape = None
else:
raise LoopyError("cannot determine access range for '%s': "
"undetermined index in subscript(s) '%s'"
@@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name):
kernel.cache_manager.dim_max(
armap.access_range, i) + 1,
constants_only=False)))
- except:
+ except Exception:
print("While trying to find shape axis %d of "
"variable '%s', the following "
"exception occurred:" % (i, var_name),
@@ -1371,7 +1371,170 @@ def draw_dependencies_as_unicode_arrows(
conform_to_uniform_length(extender))
for row, extender in rows]
- return rows
+ return uniform_length, rows
+
+# }}}
+
+
+# {{{ stringify_instruction_list
+
+def stringify_instruction_list(kernel):
+ # {{{ topological sort
+
+ printed_insn_ids = set()
+ printed_insn_order = []
+
+ def insert_insn_into_order(insn):
+ if insn.id in printed_insn_ids:
+ return
+ printed_insn_ids.add(insn.id)
+
+ for dep_id in natsorted(insn.depends_on):
+ insert_insn_into_order(kernel.id_to_insn[dep_id])
+
+ printed_insn_order.append(insn)
+
+ for insn in kernel.instructions:
+ insert_insn_into_order(insn)
+
+ # }}}
+
+ import loopy as lp
+
+ Fore = kernel.options._fore # noqa
+ Style = kernel.options._style # noqa
+
+ uniform_arrow_length, arrows_and_extenders = \
+ draw_dependencies_as_unicode_arrows(
+ printed_insn_order, fore=Fore, style=Style)
+
+ leader = " " * uniform_arrow_length
+ lines = []
+ current_inames = [set()]
+
+ if uniform_arrow_length:
+ indent_level = [1]
+ else:
+ indent_level = [0]
+
+ indent_increment = 2
+
+ iname_order = kernel._get_iname_order_for_printing()
+
+ def add_pre_line(s):
+ lines.append(leader + " " * indent_level[0] + s)
+
+ def add_main_line(s):
+ lines.append(arrows + " " * indent_level[0] + s)
+
+ def add_post_line(s):
+ lines.append(extender + " " * indent_level[0] + s)
+
+ def adapt_to_new_inames_list(new_inames):
+ added = []
+ removed = []
+
+ # FIXME: Doesn't respect strict nesting
+ for iname in iname_order:
+ is_in_current = iname in current_inames[0]
+ is_in_new = iname in new_inames
+
+ if is_in_new == is_in_current:
+ pass
+ elif is_in_new and not is_in_current:
+ added.append(iname)
+ elif not is_in_new and is_in_current:
+ removed.append(iname)
+ else:
+ assert False
+
+ if removed:
+ indent_level[0] -= indent_increment * len(removed)
+ add_pre_line("end " + ", ".join(removed))
+ if added:
+ add_pre_line("for " + ", ".join(added))
+ indent_level[0] += indent_increment * len(added)
+
+ current_inames[0] = new_inames
+
+ for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders):
+ if isinstance(insn, lp.MultiAssignmentBase):
+ lhs = ", ".join(str(a) for a in insn.assignees)
+ rhs = str(insn.expression)
+ trailing = []
+ elif isinstance(insn, lp.CInstruction):
+ lhs = ", ".join(str(a) for a in insn.assignees)
+ rhs = "CODE(%s|%s)" % (
+ ", ".join(str(x) for x in insn.read_variables),
+ ", ".join("%s=%s" % (name, expr)
+ for name, expr in insn.iname_exprs))
+
+            trailing = insn.code.split("\n")
+ elif isinstance(insn, lp.BarrierInstruction):
+ lhs = ""
+ rhs = "... %sbarrier" % insn.synchronization_kind[0]
+ trailing = []
+
+ elif isinstance(insn, lp.NoOpInstruction):
+ lhs = ""
+ rhs = "... nop"
+ trailing = []
+
+ else:
+ raise LoopyError("unexpected instruction type: %s"
+ % type(insn).__name__)
+
+ adapt_to_new_inames_list(kernel.insn_inames(insn))
+
+ options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL]
+ if insn.priority:
+ options.append("priority=%d" % insn.priority)
+ if insn.tags:
+ options.append("tags=%s" % ":".join(insn.tags))
+ if isinstance(insn, lp.Assignment) and insn.atomicity:
+ options.append("atomic=%s" % ":".join(
+ str(a) for a in insn.atomicity))
+ if insn.groups:
+ options.append("groups=%s" % ":".join(insn.groups))
+ if insn.conflicts_with_groups:
+ options.append(
+ "conflicts=%s" % ":".join(insn.conflicts_with_groups))
+ if insn.no_sync_with:
+ options.append("no_sync_with=%s" % ":".join(
+ "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
+ if isinstance(insn, lp.BarrierInstruction) and \
+ insn.synchronization_kind == 'local':
+ options.append('mem_kind=%s' % insn.mem_kind)
+
+ if lhs:
+ core = "%s = %s" % (
+ Fore.CYAN+lhs+Style.RESET_ALL,
+ Fore.MAGENTA+rhs+Style.RESET_ALL,
+ )
+ else:
+ core = Fore.MAGENTA+rhs+Style.RESET_ALL
+
+ options_str = " {%s}" % ", ".join(options)
+
+ if insn.predicates:
+ # FIXME: precedence
+ add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates]))
+ indent_level[0] += indent_increment
+
+ add_main_line(core + options_str)
+
+ for t in trailing:
+ add_post_line(t)
+
+ if insn.predicates:
+ indent_level[0] -= indent_increment
+ add_post_line("end")
+
+ leader = extender
+
+ adapt_to_new_inames_list([])
+
+ return lines
# }}}
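
The recursive ordering at the top of ``stringify_instruction_list`` prints
each instruction only after its dependencies. The same idea over a plain
``{id: deps}`` mapping, as a runnable sketch (the real code natural-sorts
dependency ids via ``natsorted``; plain ``sorted`` stands in here)::

    def print_order(deps):
        seen, order = set(), []

        def visit(insn_id):
            if insn_id in seen:
                return
            seen.add(insn_id)
            for dep in sorted(deps[insn_id]):
                visit(dep)  # dependencies are emitted first
            order.append(insn_id)

        for insn_id in deps:
            visit(insn_id)
        return order

    assert print_order({"c": {"b"}, "b": {"a"}, "a": set()}) == ["a", "b", "c"]
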
@@ -1394,7 +1557,8 @@ def get_global_barrier_order(kernel):
def is_barrier(my_insn_id):
insn = kernel.id_to_insn[my_insn_id]
from loopy.kernel.instruction import BarrierInstruction
- return isinstance(insn, BarrierInstruction) and insn.kind == "global"
+ return isinstance(insn, BarrierInstruction) and \
+ insn.synchronization_kind == "global"
while unvisited:
stack = [unvisited.pop()]
@@ -1487,7 +1651,8 @@ def find_most_recent_global_barrier(kernel, insn_id):
def is_barrier(my_insn_id):
insn = kernel.id_to_insn[my_insn_id]
from loopy.kernel.instruction import BarrierInstruction
- return isinstance(insn, BarrierInstruction) and insn.kind == "global"
+ return isinstance(insn, BarrierInstruction) and \
+ insn.synchronization_kind == "global"
global_barrier_to_ordinal = dict(
(b, i) for i, b in enumerate(global_barrier_order))
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ced1aaaa13ed8275c1e3a376d1c24895287b3239..ac7ac19887388649670154fcd36eba79ba3b4315 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -30,7 +30,7 @@ from loopy.diagnostic import (
import islpy as isl
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames):
from loopy.kernel.data import (
LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
- ParallelTag)
+ ConcurrentTag)
for iname in inames:
iname_tag = kernel.iname_to_tag.get(iname)
@@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames):
elif isinstance(iname_tag, LocalIndexTagBase):
local_par.append(iname)
- elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+ elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)):
nonlocal_par.append(iname)
else:
@@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
if len(coeffs) == 0:
try:
scan_iname_aff.get_constant_val()
- except:
+ except Exception:
raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
# If this point is reached we're assuming the domain is of the form
@@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
nresults=nresults,
depends_on=insn.depends_on,
within_inames=insn.within_inames | expr.inames,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
newly_generated_insn_id_set.add(get_args_insn_id)
@@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
return updated_inner_exprs
def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
- within_inames_is_final):
+ within_inames_is_final, predicates):
# FIXME: use make_temporaries
from pymbolic.primitives import Call
from loopy.symbolic import Reduction
@@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=expr,
depends_on=depends_on,
within_inames=within_inames,
- within_inames_is_final=within_inames_is_final)
+ within_inames_is_final=within_inames_is_final,
+ predicates=predicates)
generated_insns.append(call_insn)
@@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=outer_insn_inames - frozenset(expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=init_insn_depends_on,
- expression=expr.operation.neutral_element(*arg_dtypes))
+ expression=expr.operation.neutral_element(*arg_dtypes),
+ predicates=insn.predicates,)
generated_insns.append(init_insn)
@@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
nresults=nresults,
depends_on=insn.depends_on,
within_inames=update_insn_iname_deps,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
reduction_insn_depends_on.add(get_args_insn_id)
else:
@@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
reduction_expr),
depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
within_inames=update_insn_iname_deps,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,)
generated_insns.append(reduction_insn)
@@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset())
+ depends_on=frozenset(),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname))
@@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset())
+ depends_on=frozenset(),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_neutral_insn)
transfer_depends_on = set([init_neutral_id, init_id])
@@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=(
(outer_insn_inames - frozenset(expr.inames))
| frozenset([red_iname])),
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
transfer_depends_on.add(get_args_insn_id)
else:
@@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
| frozenset([red_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
- no_sync_with=frozenset([(init_id, "any")]))
+ no_sync_with=frozenset([(init_id, "any")]),
+ predicates=insn.predicates,
+ )
generated_insns.append(transfer_insn)
cur_size = 1
@@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
)
generated_insns.append(stage_insn)
@@ -1398,7 +1414,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
(sweep_iname,) + expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=init_insn_depends_on,
- expression=expr.operation.neutral_element(*arg_dtypes))
+ expression=expr.operation.neutral_element(*arg_dtypes),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
@@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
depends_on=frozenset(update_insn_depends_on),
within_inames=update_insn_iname_deps,
no_sync_with=insn.no_sync_with,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
generated_insns.append(scan_insn)
@@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=init_insn_depends_on)
+ depends_on=init_insn_depends_on,
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
@@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=outer_insn_inames - frozenset(expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset(transfer_insn_depends_on),
- no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+ no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
+ predicates=insn.predicates,
+ )
generated_insns.append(transfer_insn)
@@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=(
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset([prev_id]))
+ depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
+ )
if cur_size == 1:
# Performance hack: don't add a barrier here with transfer_insn.
@@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
)
generated_insns.append(write_stage_insn)
@@ -1928,7 +1955,7 @@ def find_idempotence(kernel):
for insn in kernel.instructions)
from collections import defaultdict
- dep_graph = defaultdict(lambda: set())
+ dep_graph = defaultdict(set)
for insn in kernel.instructions:
dep_graph[insn.id] = set(writer_id
@@ -2020,7 +2047,8 @@ def limit_boostability(kernel):
# }}}
-preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
+preprocess_cache = WriteOncePersistentDict(
+ "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None):
# }}}
if CACHING_ENABLED:
- preprocess_cache[input_kernel] = kernel
+ preprocess_cache.store_if_not_present(input_kernel, kernel)
return kernel
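
The cache now follows the write-once pattern throughout: entries are stored
via ``store_if_not_present`` instead of plain assignment. A small usage
sketch, assuming :mod:`pytools` (a loopy dependency) is installed; the cache
name is made up for the demo::

    from pytools.persistent_dict import WriteOncePersistentDict

    cache = WriteOncePersistentDict("demo-cache-v1")
    cache.store_if_not_present("key", 42)  # first write wins
    cache.store_if_not_present("key", 43)  # already present: skipped
    assert cache["key"] == 42
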
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 4281e50bd006a3cddf5a3cae0ffffe3d78abcfac..850f0a61fcdc2878d43895bc0e024032532aa680 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -29,7 +29,7 @@ import sys
import islpy as isl
from loopy.diagnostic import warn_with_kernel, LoopyError # noqa
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -84,14 +84,18 @@ class Barrier(ScheduleItem):
A plain-text comment explaining why the barrier was inserted.
- .. attribute:: kind
+ .. attribute:: synchronization_kind
+
+ ``"local"`` or ``"global"``
+
+ .. attribute:: mem_kind
``"local"`` or ``"global"``
.. attribute:: originating_insn_id
"""
- hash_fields = ["comment", "kind"]
+ hash_fields = ["comment", "synchronization_kind", "mem_kind"]
__slots__ = hash_fields + ["originating_insn_id"]
# }}}
@@ -206,13 +210,13 @@ def find_loop_nest_with_map(kernel):
"""
result = {}
- from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
all_nonpar_inames = set([
iname
for iname in kernel.all_inames()
if not isinstance(kernel.iname_to_tag.get(iname),
- (ParallelTag, IlpBaseTag, VectorizeTag))])
+ (ConcurrentTag, IlpBaseTag, VectorizeTag))])
iname_to_insns = kernel.iname_to_insns()
@@ -274,10 +278,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
result = {}
- from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
for insn in kernel.instructions:
for iname in kernel.insn_inames(insn):
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag):
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag):
continue
iname_dep = result.setdefault(iname, set())
@@ -308,7 +312,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
continue
tag = kernel.iname_to_tag.get(dep_insn_iname)
- if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)):
+ if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)):
# Parallel tags don't really nest, so we'll disregard
# them here.
continue
@@ -431,14 +435,19 @@ def format_insn(kernel, insn_id):
from loopy.kernel.instruction import (
MultiAssignmentBase, NoOpInstruction, BarrierInstruction)
if isinstance(insn, MultiAssignmentBase):
- return "[%s] %s%s%s <- %s%s%s" % (
- format_insn_id(kernel, insn_id),
+ return "%s%s%s = %s%s%s {id=%s}" % (
Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL,
- Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
+ Fore.MAGENTA, str(insn.expression), Style.RESET_ALL,
+ format_insn_id(kernel, insn_id))
elif isinstance(insn, BarrierInstruction):
- return "[%s] %s... %sbarrier%s" % (
+ mem_kind = ''
+ if insn.synchronization_kind == 'local':
+ mem_kind = '{mem_kind=%s}' % insn.mem_kind
+
+ return "[%s] %s... %sbarrier%s%s" % (
format_insn_id(kernel, insn_id),
- Fore.MAGENTA, insn.kind[0], Style.RESET_ALL)
+ Fore.MAGENTA, insn.synchronization_kind[0], mem_kind,
+ Style.RESET_ALL)
elif isinstance(insn, NoOpInstruction):
return "[%s] %s... nop%s" % (
format_insn_id(kernel, insn_id),
@@ -456,11 +465,11 @@ def dump_schedule(kernel, schedule):
from loopy.kernel.data import MultiAssignmentBase
for sched_item in schedule:
if isinstance(sched_item, EnterLoop):
- lines.append(indent + "FOR %s" % sched_item.iname)
+ lines.append(indent + "for %s" % sched_item.iname)
indent += " "
elif isinstance(sched_item, LeaveLoop):
indent = indent[:-4]
- lines.append(indent + "END %s" % sched_item.iname)
+ lines.append(indent + "end %s" % sched_item.iname)
elif isinstance(sched_item, CallKernel):
lines.append(indent +
"CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % (
@@ -479,7 +488,8 @@ def dump_schedule(kernel, schedule):
insn_str = sched_item.insn_id
lines.append(indent + insn_str)
elif isinstance(sched_item, Barrier):
- lines.append(indent + "---BARRIER:%s---" % sched_item.kind)
+ lines.append(indent + "... %sbarrier" %
+ sched_item.synchronization_kind[0])
else:
assert False
@@ -833,7 +843,8 @@ def generate_loop_schedules_internal(
# {{{ check if scheduler state allows insn scheduling
from loopy.kernel.instruction import BarrierInstruction
- if isinstance(insn, BarrierInstruction) and insn.kind == "global":
+ if isinstance(insn, BarrierInstruction) and \
+ insn.synchronization_kind == "global":
if not sched_state.may_schedule_global_barriers:
if debug_mode:
print("can't schedule '%s' because global barriers are "
@@ -1318,7 +1329,8 @@ def convert_barrier_instructions_to_barriers(kernel, schedule):
insn = kernel.id_to_insn[sched_item.insn_id]
if isinstance(insn, BarrierInstruction):
result.append(Barrier(
- kind=insn.kind,
+ synchronization_kind=insn.synchronization_kind,
+ mem_kind=insn.mem_kind,
originating_insn_id=insn.id,
comment="Barrier inserted due to %s" % insn.id))
continue
@@ -1415,8 +1427,8 @@ class DependencyTracker(object):
raise ValueError("unknown 'var_kind': %s" % var_kind)
from collections import defaultdict
- self.writer_map = defaultdict(lambda: set())
- self.reader_map = defaultdict(lambda: set())
+ self.writer_map = defaultdict(set)
+ self.reader_map = defaultdict(set)
self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map()
def map_to_base_storage(self, var_names):
@@ -1577,7 +1589,8 @@ def _insn_ids_reaching_end(schedule, kind, reverse):
# end
# barrier()
# end
- if barrier_kind_more_or_equally_global(sched_item.kind, kind):
+ if barrier_kind_more_or_equally_global(
+ sched_item.synchronization_kind, kind):
insn_ids_alive_at_scope[-1].clear()
else:
insn_ids_alive_at_scope[-1] |= set(
@@ -1607,15 +1620,17 @@ def append_barrier_or_raise_error(schedule, dep, verify_only):
tgt=dep.target.id, src=dep.source.id))
schedule.append(Barrier(
comment=comment,
- kind=dep.var_kind,
+ synchronization_kind=dep.var_kind,
+ mem_kind=dep.var_kind,
originating_insn_id=None))
-def insert_barriers(kernel, schedule, kind, verify_only, level=0):
+def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0):
"""
- :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted.
- Generally, this function will be called once for each kind of barrier
- at the top level, where more global barriers should be inserted first.
+ :arg synchronization_kind: "local" or "global".
+ The :attr:`Barrier.synchronization_kind` to be inserted. Generally, this
+ function will be called once for each kind of barrier at the top level, where
+ more global barriers should be inserted first.
:arg verify_only: do not insert barriers, only complain if they are
missing.
:arg level: the current level of loop nesting, 0 for outermost.
@@ -1624,14 +1639,15 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
# {{{ insert barriers at outermost scheduling level
def insert_barriers_at_outer_level(schedule, reverse=False):
- dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse)
+ dep_tracker = DependencyTracker(kernel, var_kind=synchronization_kind,
+ reverse=reverse)
if reverse:
# Populate the dependency tracker with sources from the tail end of
# the schedule block.
for insn_id in (
insn_ids_reaching_end_without_intervening_barrier(
- schedule, kind)):
+ schedule, synchronization_kind)):
dep_tracker.add_source(insn_id)
result = []
@@ -1645,11 +1661,11 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
loop_head = (
insn_ids_reachable_from_start_without_intervening_barrier(
- subloop, kind))
+ subloop, synchronization_kind))
loop_tail = (
insn_ids_reaching_end_without_intervening_barrier(
- subloop, kind))
+ subloop, synchronization_kind))
# Checks if a barrier is needed before the loop. This handles
# dependencies with targets that can be reached without an
@@ -1688,7 +1704,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
elif isinstance(sched_item, Barrier):
result.append(sched_item)
- if barrier_kind_more_or_equally_global(sched_item.kind, kind):
+ if barrier_kind_more_or_equally_global(
+ sched_item.synchronization_kind, synchronization_kind):
dep_tracker.discard_all_sources()
i += 1
@@ -1724,7 +1741,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
if isinstance(sched_item, EnterLoop):
subloop, new_i = gather_schedule_block(schedule, i)
new_subloop = insert_barriers(
- kernel, subloop[1:-1], kind, verify_only, level + 1)
+ kernel, subloop[1:-1], synchronization_kind, verify_only,
+ level + 1)
result.append(subloop[0])
result.extend(new_subloop)
result.append(subloop[-1])
@@ -1756,7 +1774,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
def generate_loop_schedules(kernel, debug_args={}):
from pytools import MinRecursionLimit
- with MinRecursionLimit(len(kernel.instructions) * 2):
+ with MinRecursionLimit(max(len(kernel.instructions) * 2,
+ len(kernel.all_inames()) * 4)):
for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args):
yield sched
@@ -1786,7 +1805,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
for item in preschedule
for insn_id in sched_item_to_insn_id(item))
- from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag
+ from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag
ilp_inames = set(
iname
for iname in kernel.all_inames()
@@ -1797,7 +1816,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag))
parallel_inames = set(
iname for iname in kernel.all_inames()
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
loop_nest_with_map = find_loop_nest_with_map(kernel)
loop_nest_around_map = find_loop_nest_around_map(kernel)
@@ -1889,11 +1908,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
if not kernel.options.disable_global_barriers:
logger.debug("%s: barrier insertion: global" % kernel.name)
gen_sched = insert_barriers(kernel, gen_sched,
- kind="global", verify_only=True)
+ synchronization_kind="global", verify_only=True)
logger.debug("%s: barrier insertion: local" % kernel.name)
- gen_sched = insert_barriers(kernel, gen_sched, kind="local",
- verify_only=False)
+ gen_sched = insert_barriers(kernel, gen_sched,
+ synchronization_kind="local", verify_only=False)
logger.debug("%s: barrier insertion: done" % kernel.name)
new_kernel = kernel.copy(
@@ -1939,7 +1958,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
# }}}
-schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
+schedule_cache = WriteOncePersistentDict(
+ "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -1970,7 +1990,7 @@ def get_one_scheduled_kernel(kernel):
kernel.name, time()-start_time))
if CACHING_ENABLED and not from_cache:
- schedule_cache[sched_cache_key] = result
+ schedule_cache.store_if_not_present(sched_cache_key, result)
return result
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index 1a0789c2f61e21e4a0371e2a73195c9771245527..5c41f03997e5193333f5be213f2f87d38147b6df 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -106,7 +106,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen):
[end_item])
elif isinstance(sched_item, Barrier):
- if sched_item.kind == "global":
+ if sched_item.synchronization_kind == "global":
# Wrap the current chunk into a kernel call.
schedule_required_splitting = True
if current_chunk:
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9b15ec471fb681698b85c1dd2f92376fbc731f00..72d0c6c7d7a634cd96379d17b7a91f6a638e0ab9 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -38,7 +38,6 @@ __doc__ = """
.. currentmodule:: loopy
-.. autoclass:: GuardedPwQPolynomial
.. autoclass:: ToCountMap
.. autoclass:: Op
.. autoclass:: MemAccess
@@ -50,6 +49,11 @@ __doc__ = """
.. autofunction:: gather_access_footprints
.. autofunction:: gather_access_footprint_bytes
+.. currentmodule:: loopy.statistics
+
+.. autoclass:: GuardedPwQPolynomial
+
+.. currentmodule:: loopy
"""
@@ -996,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial):
def count(kernel, set, space=None):
try:
+ if space is not None:
+ set = set.align_params(space)
+
return add_assumptions_guard(kernel, set.card())
except AttributeError:
pass
@@ -1410,7 +1417,8 @@ def get_synchronization_map(knl):
iname_list.pop()
elif isinstance(sched_item, Barrier):
- result = result + ToCountMap({"barrier_%s" % sched_item.kind:
+ result = result + ToCountMap({"barrier_%s" %
+ sched_item.synchronization_kind:
get_count_poly(iname_list)})
elif isinstance(sched_item, CallKernel):
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index f1a494f30d469511817d204c0476ff79abe00e3b..2d31c63ef13774599de27ae871be64bc5acb7514 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -104,7 +104,9 @@ class IdentityMapperMixin(object):
return expr
def map_type_annotation(self, expr, *args):
- return TypeAnnotation(expr.type, self.rec(expr.child))
+ return type(expr)(expr.type, self.rec(expr.child))
+
+ map_type_cast = map_type_annotation
map_linear_subscript = IdentityMapperBase.map_subscript
@@ -147,6 +149,11 @@ class WalkMapper(WalkMapperBase):
self.rec(expr.expr, *args)
+ def map_type_cast(self, expr, *args):
+ if not self.visit(expr):
+ return
+ self.rec(expr.child, *args)
+
map_tagged_variable = WalkMapperBase.map_variable
def map_loopy_function_identifier(self, expr, *args):
@@ -219,6 +226,10 @@ class StringifyMapper(StringifyMapperBase):
def map_rule_argument(self, expr, enclosing_prec):
return "" % expr.index
+ def map_type_cast(self, expr, enclosing_prec):
+ from pymbolic.mapper.stringifier import PREC_NONE
+ return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE))
+
class UnidirectionalUnifier(UnidirectionalUnifierBase):
def map_reduction(self, expr, other, unis):
@@ -273,6 +284,9 @@ class DependencyMapper(DependencyMapperBase):
map_linear_subscript = DependencyMapperBase.map_subscript
+ def map_type_cast(self, expr):
+ return self.rec(expr.child)
+
class SubstitutionRuleExpander(IdentityMapper):
def __init__(self, rules):
@@ -398,6 +412,10 @@ class TypedCSE(p.CommonSubexpression):
class TypeAnnotation(p.Expression):
+ """Undocumented for now. Currently only used internally around LHSs of
+ assignments that create temporaries.
+ """
+
def __init__(self, type, child):
super(TypeAnnotation, self).__init__()
self.type = type
@@ -406,9 +424,55 @@ class TypeAnnotation(p.Expression):
def __getinitargs__(self):
return (self.type, self.child)
+ def stringifier(self):
+ return StringifyMapper
+
mapper_method = intern("map_type_annotation")
+class TypeCast(p.Expression):
+ """Only defined for numerical types with semantics matching
+ :meth:`numpy.ndarray.astype`.
+
+ .. attribute:: child
+
+ The expression to be cast.
+ """
+
+ def __init__(self, type, child):
+ super(TypeCast, self).__init__()
+
+ from loopy.types import to_loopy_type, NumpyType
+ type = to_loopy_type(type)
+
+ if (not isinstance(type, NumpyType)
+ or not issubclass(type.dtype.type, np.number)):
+ from loopy.diagnostic import LoopyError
+ raise LoopyError("TypeCast only supports numerical numpy types, "
+ "not '%s'" % type)
+
+ # We're storing the type as a name for now to avoid
+ # numpy pickling bug madness. (see loopy.types)
+ self._type_name = type.dtype.name
+ self.child = child
+
+ @property
+ def type(self):
+ from loopy.types import NumpyType
+ return NumpyType(np.dtype(self._type_name))
+
+ # init_arg_names is a misnomer--they're attribute names used for pickling.
+ init_arg_names = ("_type_name", "child")
+
+ def __getinitargs__(self):
+ return (self._type_name, self.child)
+
+ def stringifier(self):
+ return StringifyMapper
+
+ mapper_method = intern("map_type_cast")
+
+
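
A hypothetical usage sketch for the new ``TypeCast`` node, assuming a loopy
checkout with this patch applied; the ``map_type_cast`` stringifier hook
added above renders it as ``cast(<type>, <child>)``::

    import numpy as np
    from pymbolic import var
    from loopy.symbolic import TypeCast  # added by this patch

    tc = TypeCast(np.dtype(np.float32), var("x"))
    assert tc.child == var("x")
    print(tc)  # prints something like: cast(np:dtype('float32'), x)
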
class TaggedVariable(p.Variable):
"""This is an identifier with a tag, such as 'matrix$one', where
'one' identifies this specific use of the identifier. This mechanism
@@ -1232,6 +1296,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
super(PwAffEvaluationMapper, self).__init__(context)
def map_constant(self, expr):
+ if isinstance(expr, np.integer):
+ expr = int(expr)
+
return self.pw_zero + expr
def map_min(self, expr):
@@ -1559,6 +1626,9 @@ class BatchedAccessRangeMapper(WalkMapper):
def map_reduction(self, expr, inames):
return WalkMapper.map_reduction(self, expr, inames | set(expr.inames))
+ def map_type_cast(self, expr, inames):
+ return self.rec(expr.child, inames)
+
class AccessRangeMapper(object):
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 5d5743bae322fc59c989cafd85122c8ca619c422..aac528087cf812a91553d416f166be898a1cd132 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -55,6 +55,7 @@ class TargetBase(object):
comparison_fields = ()
def update_persistent_hash(self, key_hash, key_builder):
+ key_hash.update(type(self).__name__.encode())
for field_name in self.hash_fields:
key_builder.rec(key_hash, getattr(self, field_name))
@@ -188,9 +189,10 @@ class ASTBuilderBase(object):
def add_vector_access(self, access_expr, index):
raise NotImplementedError()
- def emit_barrier(self, kind, comment):
+ def emit_barrier(self, synchronization_kind, mem_kind, comment):
"""
- :arg kind: ``"local"`` or ``"global"``
+ :arg synchronization_kind: ``"local"`` or ``"global"``
+ :arg mem_kind: ``"local"`` or ``"global"``
"""
raise NotImplementedError()
@@ -210,6 +212,10 @@ class ASTBuilderBase(object):
static_lbound, static_ubound, inner):
raise NotImplementedError()
+ @property
+ def can_implement_conditionals(self):
+ return False
+
def emit_if(self, condition_str, ast):
raise NotImplementedError()
@@ -274,28 +280,6 @@ class DummyHostASTBuilder(ASTBuilderBase):
def ast_block_scope_class(self):
return _DummyASTBlock
- def emit_assignment(self, codegen_state, insn):
- return None
-
- def emit_multiple_assignment(self, codegen_state, insn):
- return None
-
- def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
- static_lbound, static_ubound, inner):
- return None
-
- def emit_if(self, condition_str, ast):
- return None
-
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
- return None
-
- def emit_blank_line(self):
- return None
-
- def emit_comment(self, s):
- return None
-
# }}}
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 0e1f0ff86ca5eaa1932f766b3f8b79f5167ce6f4..423311cdb259c77e77070f5fc27a542dd2c89fc9 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -27,12 +27,14 @@ THE SOFTWARE.
import six
import numpy as np # noqa
+from loopy.kernel.data import CallMangleInfo
from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
from loopy.diagnostic import LoopyError
from cgen import Pointer, NestedDeclarator, Block
from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
from pymbolic.mapper.stringifier import PREC_NONE
from loopy.symbolic import IdentityMapper
+from loopy.types import NumpyType
import pymbolic.primitives as p
from pytools import memoize_method
@@ -315,9 +317,75 @@ class _ConstRestrictPointer(Pointer):
return sub_tp, ("*const __restrict__ %s" % sub_decl)
+class _ConstPointer(Pointer):
+    def get_decl_pair(self):
+ sub_tp, sub_decl = self.subdecl.get_decl_pair()
+ return sub_tp, ("*const %s" % sub_decl)
+
+
+# {{{ symbol mangler
+
+def c_symbol_mangler(kernel, name):
+ # float NAN as defined in C99 standard
+ if name == "NAN":
+ return NumpyType(np.dtype(np.float32)), name
+ return None
+
+# }}}
+
+
+# {{{ function mangler
+
+def c_function_mangler(target, name, arg_dtypes):
+ # convert abs(), min(), max() to fabs(), fmin(), fmax() to comply with
+ # C99 standard
+ if not isinstance(name, str):
+ return None
+
+ if (name == "abs"
+ and len(arg_dtypes) == 1
+ and arg_dtypes[0].numpy_dtype.kind == "f"):
+ return CallMangleInfo(
+ target_name="fabs",
+ result_dtypes=arg_dtypes,
+ arg_dtypes=arg_dtypes)
+
+ if name in ["max", "min"] and len(arg_dtypes) == 2:
+ dtype = np.find_common_type(
+ [], [dtype.numpy_dtype for dtype in arg_dtypes])
+
+ if dtype.kind == "c":
+ raise RuntimeError("min/max do not support complex numbers")
+
+ if dtype.kind == "f":
+ name = "f" + name
+
+ result_dtype = NumpyType(dtype)
+ return CallMangleInfo(
+ target_name=name,
+ result_dtypes=(result_dtype,),
+ arg_dtypes=2*(result_dtype,))
+
+ return None
+
+# }}}
+
+
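
These manglers move the C99 ``abs``/``min``/``max`` handling from the OpenCL
target (removed further below) into the base C target. The dtype-kind logic,
re-sketched standalone over plain kind characters (``"f"`` float, ``"i"``
int, ``"c"`` complex)::

    def mangle_name(name, arg_kinds):
        # Floating-point variants get the C99 "f" prefix: fabs, fmin, fmax.
        if name == "abs" and arg_kinds == ["f"]:
            return "fabs"
        if name in ("max", "min") and len(arg_kinds) == 2:
            if "c" in arg_kinds:
                raise RuntimeError("min/max do not support complex numbers")
            return "f" + name if "f" in arg_kinds else name
        return None  # not handled by this mangler

    assert mangle_name("abs", ["f"]) == "fabs"
    assert mangle_name("max", ["f", "f"]) == "fmax"
    assert mangle_name("min", ["i", "i"]) == "min"
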
class CASTBuilder(ASTBuilderBase):
# {{{ library
+ def function_manglers(self):
+ return (
+ super(CASTBuilder, self).function_manglers() + [
+ c_function_mangler
+ ])
+
+ def symbol_manglers(self):
+ return (
+ super(CASTBuilder, self).symbol_manglers() + [
+ c_symbol_mangler
+ ])
+
def preamble_generators(self):
return (
super(CASTBuilder, self).preamble_generators() + [
@@ -344,7 +412,16 @@ class CASTBuilder(ASTBuilderBase):
result = []
from loopy.kernel.data import temp_var_scope
-
+ from loopy.schedule import CallKernel
+ # We only need to write declarations for global variables with
+ # the first device program. `is_first_dev_prog` determines
+ # whether this is the first device program in the schedule.
+ is_first_dev_prog = True
+ for i in range(schedule_index):
+ if isinstance(kernel.schedule[i], CallKernel):
+ is_first_dev_prog = False
+ break
+ if is_first_dev_prog:
for tv in sorted(
six.itervalues(kernel.temporary_variables),
key=lambda tv: tv.name):
@@ -421,6 +498,15 @@ class CASTBuilder(ASTBuilderBase):
base_storage_to_align_bytes = {}
from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
+ # Getting the temporary variables that are needed for the current
+ # sub-kernel.
+ from loopy.schedule.tools import (
+ temporaries_read_in_subkernel,
+ temporaries_written_in_subkernel)
+ subkernel = kernel.schedule[schedule_index].kernel_name
+ sub_knl_temps = (
+ temporaries_read_in_subkernel(kernel, subkernel) |
+ temporaries_written_in_subkernel(kernel, subkernel))
for tv in sorted(
six.itervalues(kernel.temporary_variables),
@@ -430,7 +516,8 @@ class CASTBuilder(ASTBuilderBase):
if not tv.base_storage:
for idi in decl_info:
# global temp vars are mapped to arguments or global declarations
- if tv.scope != temp_var_scope.GLOBAL:
+ if tv.scope != temp_var_scope.GLOBAL and (
+ tv.name in sub_knl_temps):
decl = self.wrap_temporary_decl(
self.get_temporary_decl(
codegen_state, schedule_index, tv, idi),
@@ -470,13 +557,17 @@ class CASTBuilder(ASTBuilderBase):
temp_var_decl = self.wrap_temporary_decl(
temp_var_decl, tv.scope)
+ if tv._base_storage_access_may_be_aliasing:
+ ptrtype = _ConstPointer
+ else:
# The 'restrict' part of this is a complete lie--of course
# all these temporaries are aliased. But we're promising to
# not use them to shovel data from one representation to the
# other. That counts, right?
+ ptrtype = _ConstRestrictPointer
- cast_decl = _ConstRestrictPointer(cast_decl)
- temp_var_decl = _ConstRestrictPointer(temp_var_decl)
+ cast_decl = ptrtype(cast_decl)
+ temp_var_decl = ptrtype(temp_var_decl)
cast_tp, cast_d = cast_decl.get_decl_pair()
temp_var_decl = Initializer(
@@ -797,6 +888,10 @@ class CASTBuilder(ASTBuilderBase):
from cgen import Comment
return Comment(s)
+ @property
+ def can_implement_conditionals(self):
+ return True
+
def emit_if(self, condition_str, ast):
from cgen import If
return If(condition_str, ast)
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 8f924d3aee3b9f2982006fdb7b558cccac6785e3..caee73eb1c3320f03ceac66e55e8f5c0bfadbbc2 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -29,8 +29,10 @@ import numpy as np
from pymbolic.mapper import RecursiveMapper, IdentityMapper
from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT,
- PREC_POWER,
- PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND)
+ PREC_POWER, PREC_SHIFT,
+ PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND,
+ PREC_BITWISE_AND, PREC_BITWISE_OR)
+
import islpy as isl
import pymbolic.primitives as p
from pymbolic import var
@@ -338,6 +340,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
expr.operator,
self.rec(expr.right, inner_type_context))
+ def map_type_cast(self, expr, type_context):
+ registry = self.codegen_state.ast_builder.target.get_dtype_registry()
+ cast = var("(%s)" % registry.dtype_to_ctype(expr.type))
+ return cast(self.rec(expr.child, type_context))
+
def map_constant(self, expr, type_context):
if isinstance(expr, (complex, np.complexfloating)):
try:
@@ -782,6 +789,16 @@ class CExpressionToCodeMapper(RecursiveMapper):
def map_literal(self, expr, enclosing_prec):
return expr.s
+ def map_left_shift(self, expr, enclosing_prec):
+ return self.parenthesize_if_needed(
+ self.join_rec(" << ", (expr.shiftee, expr.shift), PREC_SHIFT),
+ enclosing_prec, PREC_SHIFT)
+
+ def map_right_shift(self, expr, enclosing_prec):
+ return self.parenthesize_if_needed(
+ self.join_rec(" >> ", (expr.shiftee, expr.shift), PREC_SHIFT),
+ enclosing_prec, PREC_SHIFT)
+
def map_logical_not(self, expr, enclosing_prec):
return self.parenthesize_if_needed(
"!" + self.rec(expr.child, PREC_UNARY),
@@ -807,6 +824,21 @@ class CExpressionToCodeMapper(RecursiveMapper):
result = "(%s)" % result
return result
+ def map_bitwise_not(self, expr, enclosing_prec):
+ return self.parenthesize_if_needed(
+ "~" + self.rec(expr.child, PREC_UNARY),
+ enclosing_prec, PREC_UNARY)
+
+ def map_bitwise_and(self, expr, enclosing_prec):
+ return self.parenthesize_if_needed(
+ self.join_rec(" & ", expr.children, PREC_BITWISE_AND),
+ enclosing_prec, PREC_BITWISE_AND)
+
+ def map_bitwise_or(self, expr, enclosing_prec):
+ return self.parenthesize_if_needed(
+ self.join_rec(" | ", expr.children, PREC_BITWISE_OR),
+ enclosing_prec, PREC_BITWISE_OR)
+
def map_sum(self, expr, enclosing_prec):
from pymbolic.mapper.stringifier import PREC_SUM
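
The expressions handled by the new shift and bitwise mappers can be built
from :mod:`pymbolic` primitives (pymbolic is a loopy dependency); a quick
construction sketch::

    from pymbolic import var
    from pymbolic.primitives import (
        LeftShift, BitwiseAnd, BitwiseNot, BitwiseOr)

    x, mask, y = var("x"), var("mask"), var("y")
    expr = BitwiseOr((BitwiseNot(BitwiseAnd((x, mask))), LeftShift(y, 2)))
    print(expr)  # prints something like: ~(x & mask) | (y << 2)
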
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f..027f27838bf68511905bd34cf75d0b361c749629 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -291,18 +291,19 @@ class CUDACASTBuilder(CASTBuilder):
def add_vector_access(self, access_expr, index):
return access_expr.a(self._VEC_AXES[index])
- def emit_barrier(self, kind, comment):
+ def emit_barrier(self, synchronization_kind, mem_kind, comment):
"""
-        :arg kind: ``"local"`` or ``"global"``
+        :arg synchronization_kind: ``"local"`` or ``"global"``
+        :arg mem_kind: unused
:return: a :class:`loopy.codegen.GeneratedInstruction`.
"""
- if kind == "local":
+ if synchronization_kind == "local":
if comment:
comment = " /* %s */" % comment
from cgen import Statement
return Statement("__syncthreads()%s" % comment)
- elif kind == "global":
+ elif synchronization_kind == "global":
raise LoopyError("CUDA does not have global barriers")
else:
raise LoopyError("unknown barrier kind")
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 61788df2dd9d32978a550990fb7c84501f76e856..2909f16f56315b136f4f2677348bfe0c3e5553b4 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -30,6 +30,13 @@ from loopy.diagnostic import LoopyError
from pytools.py_codegen import (
Indentation, PythonFunctionGenerator)
+import logging
+logger = logging.getLogger(__name__)
+
+from pytools.persistent_dict import WriteOncePersistentDict
+from loopy.tools import LoopyKeyBuilder
+from loopy.version import DATA_MODEL_VERSION
+
# {{{ object array argument packing
@@ -419,7 +426,8 @@ class ExecutionWrapperGeneratorBase(object):
# {{{ allocate written arrays, if needed
if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
- and arg.shape is not None:
+ and arg.shape is not None \
+ and all(si is not None for si in arg.shape):
if not isinstance(arg.dtype, NumpyType):
raise LoopyError("do not know how to pass arg of type '%s'"
@@ -653,6 +661,11 @@ class _Kernels(object):
pass
+typed_and_scheduled_cache = WriteOncePersistentDict(
+ "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
+ key_builder=LoopyKeyBuilder())
+
+
# {{{ kernel executor
class KernelExecutorBase(object):
@@ -716,6 +729,31 @@ class KernelExecutorBase(object):
return kernel
+ @memoize_method
+ def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
+ from loopy import CACHING_ENABLED
+
+ from loopy.preprocess import prepare_for_caching
+ # prepare_for_caching() gets run by preprocess, but the kernel at this
+ # stage is not guaranteed to be preprocessed.
+ cacheable_kernel = prepare_for_caching(self.kernel)
+ cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set)
+
+ if CACHING_ENABLED:
+ try:
+ return typed_and_scheduled_cache[cache_key]
+ except KeyError:
+ pass
+
+ logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name)
+
+ kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
+
+ if CACHING_ENABLED:
+ typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
+
+ return kernel
+
def arg_to_dtype_set(self, kwargs):
if not self.has_runtime_typed_args:
return None
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 35dade90494906b61aad9eb66e7271f2c5d1e180..45a59847ba9f175df5ca1be46aa78566b2aab03b 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -288,15 +288,15 @@ class ISPCASTBuilder(CASTBuilder):
def add_vector_access(self, access_expr, index):
return access_expr[index]
- def emit_barrier(self, kind, comment):
+ def emit_barrier(self, synchronization_kind, mem_kind, comment):
from cgen import Comment, Statement
assert comment
- if kind == "local":
+ if synchronization_kind == "local":
return Comment("local barrier: %s" % comment)
- elif kind == "global":
+ elif synchronization_kind == "global":
return Statement("sync; /* %s */" % comment)
else:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index a5f7562c41c3ec8eca673904550e078d2a992241..2763caace891570a1b7f8b13f225001a03d3aa65 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -167,30 +167,6 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
if not isinstance(name, str):
return None
- if (name == "abs"
- and len(arg_dtypes) == 1
- and arg_dtypes[0].numpy_dtype.kind == "f"):
- return CallMangleInfo(
- target_name="fabs",
- result_dtypes=arg_dtypes,
- arg_dtypes=arg_dtypes)
-
- if name in ["max", "min"] and len(arg_dtypes) == 2:
- dtype = np.find_common_type(
- [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
- if dtype.kind == "c":
- raise RuntimeError("min/max do not support complex numbers")
-
- if dtype.kind == "f":
- name = "f" + name
-
- result_dtype = NumpyType(dtype)
- return CallMangleInfo(
- target_name=name,
- result_dtypes=(result_dtype,),
- arg_dtypes=2*(result_dtype,))
-
if name == "dot":
scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
return CallMangleInfo(
@@ -450,18 +426,20 @@ class OpenCLCASTBuilder(CASTBuilder):
# The 'int' avoids an 'L' suffix for long ints.
return access_expr.attr("s%s" % hex(int(index))[2:])
- def emit_barrier(self, kind, comment):
+ def emit_barrier(self, synchronization_kind, mem_kind, comment):
"""
-        :arg kind: ``"local"`` or ``"global"``
+        :arg synchronization_kind: ``"local"`` or ``"global"``
+        :arg mem_kind: ``"local"`` or ``"global"``
:return: a :class:`loopy.codegen.GeneratedInstruction`.
"""
- if kind == "local":
+ if synchronization_kind == "local":
if comment:
comment = " /* %s */" % comment
+ mem_kind = mem_kind.upper()
+
from cgen import Statement
- return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment)
- elif kind == "global":
+ return Statement("barrier(CLK_%s_MEM_FENCE)%s" % (mem_kind, comment))
+ elif synchronization_kind == "global":
raise LoopyError("OpenCL does not have global barriers")
else:
raise LoopyError("unknown barrier kind")
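
With the change above, a local barrier's fence now follows ``mem_kind``, so
``lbarrier {mem_kind=global}`` emits ``barrier(CLK_GLOBAL_MEM_FENCE)``. The
dispatch, re-sketched standalone (assumes :mod:`cgen`, a loopy dependency)::

    from cgen import Statement

    def emit_barrier(synchronization_kind, mem_kind, comment):
        if synchronization_kind == "local":
            suffix = " /* %s */" % comment if comment else ""
            return Statement("barrier(CLK_%s_MEM_FENCE)%s"
                             % (mem_kind.upper(), suffix))
        raise NotImplementedError(synchronization_kind)

    print(emit_barrier("local", "global", "for b"))
    # barrier(CLK_GLOBAL_MEM_FENCE) /* for b */;
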
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
from genpy import Comment
return Comment(s)
+ @property
+ def can_implement_conditionals(self):
+ return True
+
def emit_if(self, condition_str, ast):
from genpy import If
return If(condition_str, ast)
diff --git a/loopy/tools.py b/loopy/tools.py
index 56b673b597fc3bf43a6b03f87607ea8d3db0866a..d6952d54782f113685299641c828907fb7f32a46 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
+import collections
import numpy as np
from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
from loopy.symbolic import WalkMapper as LoopyWalkMapper
@@ -50,7 +51,12 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase):
See also :meth:`LoopyKeyBuilder.update_for_pymbolic_expression`.
"""
- #
+ def map_reduction(self, expr, *args):
+ if not self.visit(expr):
+ return
+
+ self.key_hash.update(type(expr.operation).__name__.encode("utf-8"))
+ self.rec(expr.expr, *args)
class LoopyKeyBuilder(KeyBuilderBase):
@@ -113,6 +119,53 @@ class PymbolicExpressionHashWrapper(object):
# }}}
+# {{{ eq key builder
+
+class LoopyEqKeyBuilder(object):
+ """Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in
+ equality comparison, such that `key(a) == key(b)` if and only if `a == b`.
+ The types of objects being compared should satisfy structural equality.
+
+ The output is suitable for use with :class:`loopy.tools.LoopyKeyBuilder`
+ provided all fields are persistent hashable.
+
+ As an optimization, top-level pymbolic expression fields are stringified for
+ faster comparisons / hash calculations.
+
+ Usage::
+
+ kb = LoopyEqKeyBuilder()
+ kb.update_for_class(insn.__class__)
+ kb.update_for_field("field", insn.field)
+ ...
+ key = kb.key()
+
+ """
+
+ def __init__(self):
+ self.field_dict = {}
+
+ def update_for_class(self, class_):
+ self.class_ = class_
+
+ def update_for_field(self, field_name, value):
+ self.field_dict[field_name] = value
+
+ def update_for_pymbolic_field(self, field_name, value):
+ self.field_dict[field_name] = str(value).encode("utf-8")
+
+ def key(self):
+ return (self.class_.__name__.encode("utf-8"), self.field_dict)
+
+ def hash_key(self):
+ """Similar to key(), but excludes field names for faster hashing.
+ """
+ return (self.class_.__name__.encode("utf-8"),) + tuple(
+ self.field_dict[k] for k in sorted(self.field_dict.keys()))
+
+# }}}
+
+
# {{{ remove common indentation
def remove_common_indentation(code, require_leading_newline=True,
@@ -340,23 +393,19 @@ def compute_sccs(graph):
# }}}
-# {{{ lazily unpickling dictionary
-
+# {{{ pickled container value
-class _PickledObjectWrapper(object):
- """
- A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`).
+class _PickledObject(object):
+ """A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and
+ :class:`LazilyUnpicklingList`).
"""
- @classmethod
- def from_object(cls, obj):
- if isinstance(obj, cls):
- return obj
- from pickle import dumps
- return cls(dumps(obj))
-
- def __init__(self, objstring):
- self.objstring = objstring
+ def __init__(self, obj):
+ if isinstance(obj, _PickledObject):
+ self.objstring = obj.objstring
+ else:
+ from pickle import dumps
+ self.objstring = dumps(obj)
def unpickle(self):
from pickle import loads
@@ -366,12 +415,35 @@ class _PickledObjectWrapper(object):
return {"objstring": self.objstring}
-import collections
+class _PickledObjectWithEqAndPersistentHashKeys(_PickledObject):
+ """Like :class:`_PickledObject`, with two additional attributes:
+ * `eq_key`
+ * `persistent_hash_key`
-class LazilyUnpicklingDictionary(collections.MutableMapping):
+ This allows for comparison and for persistent hashing without unpickling.
"""
- A dictionary-like object which lazily unpickles its values.
+
+ def __init__(self, obj, eq_key, persistent_hash_key):
+ _PickledObject.__init__(self, obj)
+ self.eq_key = eq_key
+ self.persistent_hash_key = persistent_hash_key
+
+ def update_persistent_hash(self, key_hash, key_builder):
+ key_builder.rec(key_hash, self.persistent_hash_key)
+
+ def __getstate__(self):
+ return {"objstring": self.objstring,
+ "eq_key": self.eq_key,
+ "persistent_hash_key": self.persistent_hash_key}
+
+# }}}
+
+
+# {{{ lazily unpickling dictionary
+
+class LazilyUnpicklingDict(collections.MutableMapping):
+ """A dictionary-like object which lazily unpickles its values.
"""
def __init__(self, *args, **kwargs):
@@ -379,7 +451,7 @@ class LazilyUnpicklingDictionary(collections.MutableMapping):
def __getitem__(self, key):
value = self._map[key]
- if isinstance(value, _PickledObjectWrapper):
+ if isinstance(value, _PickledObject):
value = self._map[key] = value.unpickle()
return value
@@ -397,12 +469,105 @@ class LazilyUnpicklingDictionary(collections.MutableMapping):
def __getstate__(self):
return {"_map": dict(
- (key, _PickledObjectWrapper.from_object(val))
+ (key, _PickledObject(val))
for key, val in six.iteritems(self._map))}
# }}}
+# {{{ lazily unpickling list
+
+class LazilyUnpicklingList(collections.MutableSequence):
+ """A list which lazily unpickles its values."""
+
+ def __init__(self, *args, **kwargs):
+ self._list = list(*args, **kwargs)
+
+ def __getitem__(self, key):
+ item = self._list[key]
+ if isinstance(item, _PickledObject):
+ item = self._list[key] = item.unpickle()
+ return item
+
+ def __setitem__(self, key, value):
+ self._list[key] = value
+
+ def __delitem__(self, key):
+ del self._list[key]
+
+ def __len__(self):
+ return len(self._list)
+
+ def insert(self, key, value):
+ self._list.insert(key, value)
+
+ def __getstate__(self):
+ return {"_list": [_PickledObject(val) for val in self._list]}
+
+
+class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList):
+ """A list which lazily unpickles its values, and supports equality comparison
+ and persistent hashing without unpickling.
+
+ Persistent hashing only works in conjunction with :class:`LoopyKeyBuilder`.
+
+ Equality comparison and persistent hashing are implemented by supplying
+ functions `eq_key_getter` and `persistent_hash_key_getter` to the
+ constructor. These functions should return keys that can be used in place of
+ the original object for the respective purposes of equality comparison and
+ persistent hashing.
+ """
+
+ def __init__(self, *args, **kwargs):
+ self.eq_key_getter = kwargs.pop("eq_key_getter")
+ self.persistent_hash_key_getter = kwargs.pop("persistent_hash_key_getter")
+ LazilyUnpicklingList.__init__(self, *args, **kwargs)
+
+ def update_persistent_hash(self, key_hash, key_builder):
+ key_builder.update_for_list(key_hash, self._list)
+
+ def _get_eq_key(self, obj):
+ if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys):
+ return obj.eq_key
+ return self.eq_key_getter(obj)
+
+ def _get_persistent_hash_key(self, obj):
+ if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys):
+ return obj.persistent_hash_key
+ return self.persistent_hash_key_getter(obj)
+
+ def __eq__(self, other):
+ if not isinstance(other, (list, LazilyUnpicklingList)):
+ return NotImplemented
+
+ if isinstance(other, LazilyUnpicklingList):
+ other = other._list
+
+ if len(self) != len(other):
+ return False
+
+ for a, b in zip(self._list, other):
+ if self._get_eq_key(a) != self._get_eq_key(b):
+ return False
+
+ return True
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+ def __getstate__(self):
+ return {"_list": [
+ _PickledObjectWithEqAndPersistentHashKeys(
+ val,
+ self._get_eq_key(val),
+ self._get_persistent_hash_key(val))
+ for val in self._list],
+ "eq_key_getter": self.eq_key_getter,
+ "persistent_hash_key_getter": self.persistent_hash_key_getter}
+
+# }}}
+
+
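
The lazy-unpickling idea above, reduced to a runnable toy (not the loopy
classes themselves): values stay pickled until first access, then the
unpickled value is cached in place::

    import pickle

    class PickledObject:
        """Holds a value in pickled form until explicitly unpickled."""
        def __init__(self, obj):
            self.objstring = pickle.dumps(obj)

        def unpickle(self):
            return pickle.loads(self.objstring)

    class LazyList:
        """Toy analogue of LazilyUnpicklingList."""
        def __init__(self, items):
            self._list = [PickledObject(it) for it in items]

        def __getitem__(self, i):
            item = self._list[i]
            if isinstance(item, PickledObject):
                # Unpickle on demand and cache the result in place.
                item = self._list[i] = item.unpickle()
            return item

    ll = LazyList([{"a": 1}, [2, 3]])
    assert ll[0] == {"a": 1}                       # unpickled on first access
    assert isinstance(ll._list[1], PickledObject)  # untouched entry: pickled
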
def is_interned(s):
return s is None or intern(s) is s
@@ -411,4 +576,19 @@ def intern_frozenset_of_ids(fs):
return frozenset(intern(s) for s in fs)
+def natorder(key):
+ # Return natural ordering for strings, as opposed to dictionary order.
+ # E.g. will result in
+ # 'abc1' < 'abc9' < 'abc10'
+ # rather than
+ # 'abc1' < 'abc10' < 'abc9'
+ # Based on
+ # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
+ import re
+ return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
+
+
+def natsorted(seq, key=lambda x: x):
+ return sorted(seq, key=lambda y: natorder(key(y)))
+
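# Quick illustration of the helpers above (assuming they are imported from
# loopy.tools): digit runs embedded in strings compare numerically.
from loopy.tools import natorder, natsorted

assert natorder("abc10") == ["abc", 10]
assert natsorted(["abc10", "abc1", "abc9"]) == ["abc1", "abc9", "abc10"]
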
# vim: foldmethod=marker
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfbbd56e906c5e622debcd82bd5368aa3b1fb34c
--- /dev/null
+++ b/loopy/transform/add_barrier.py
@@ -0,0 +1,87 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2017 Kaushik Kulkarni"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from loopy.kernel.instruction import BarrierInstruction
+from loopy.match import parse_match
+from loopy.transform.instruction import add_dependency
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: add_barrier
+"""
+
+
+# {{{ add_barrier
+
+def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
+ tags=None, synchronization_kind="global", mem_kind=None):
+ """Takes in a kernel that needs to be added a barrier and returns a kernel
+ which has a barrier inserted into it. It takes input of 2 instructions and
+ then adds a barrier in between those 2 instructions. The expressions can
+ be any inputs that are understood by :func:`loopy.match.parse_match`.
+
+ :arg insn_before: String expression that specifies the instruction(s)
+ before the barrier which is to be added
+ :arg insn_after: String expression that specifies the instruction(s) after
+ the barrier which is to be added
+ :arg id: String on which the id of the barrier would be based on.
+ :arg tags: The tag of the group to which the barrier must be added
+ :arg synchronization_kind: Kind of barrier to be added. May be "global" or
+ "local"
+ :arg kind: Type of memory to be synchronied. May be "global" or "local". Ignored
+ for "global" bariers. If not supplied, defaults to :arg:`synchronization_kind`
+ """
+
+ if mem_kind is None:
+ mem_kind = synchronization_kind
+
+ if id_based_on is None:
+ id = knl.make_unique_instruction_id(
+ based_on=synchronization_kind[0]+"_barrier")
+ else:
+ id = knl.make_unique_instruction_id(based_on=id_based_on)
+
+ match = parse_match(insn_before)
+ insn_before_list = [insn.id for insn in knl.instructions if match(knl,
+ insn)]
+
+ barrier_to_add = BarrierInstruction(depends_on=frozenset(insn_before_list),
+ depends_on_is_final=True,
+ id=id,
+ tags=tags,
+ synchronization_kind=synchronization_kind,
+ mem_kind=mem_kind)
+
+ new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add])
+ new_knl = add_dependency(kernel=new_knl,
+ insn_match=insn_after,
+ depends_on="id:"+id)
+
+ return new_knl
+
+# }}}
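
# Usage sketch (hypothetical two-instruction kernel, mirroring the docstring
# above): insert a local barrier between the two matched instructions.
import loopy as lp

knl = lp.make_kernel(
        "{[i]: 0<=i<16}",
        """
        b[i] = a[i] + 1  {id=first}
        c[i] = 2*b[15-i]  {id=second}
        """)
knl = lp.add_barrier(knl, "id:first", "id:second",
        synchronization_kind="local")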
+
+# vim: foldmethod=marker
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -239,14 +239,14 @@ class ArrayToBufferMap(object):
non1_storage_axis_flags = []
non1_storage_shape = []
- for saxis, bi, l in zip(
+ for saxis, bi, saxis_len in zip(
storage_axis_names, storage_base_indices, storage_shape):
- has_length_non1 = l != 1
+ has_length_non1 = saxis_len != 1
non1_storage_axis_flags.append(has_length_non1)
if has_length_non1:
- non1_storage_shape.append(l)
+ non1_storage_shape.append(saxis_len)
# }}}
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies,
RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
SubstitutionMapper)
from pymbolic.mapper.substitutor import make_subst_func
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
from loopy.version import DATA_MODEL_VERSION
from loopy.diagnostic import LoopyError
@@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
# }}}
-buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
+buffer_array_cache = WriteOncePersistentDict(
+ "loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
if CACHING_ENABLED:
from loopy.preprocess import prepare_for_caching
- buffer_array_cache[cache_key] = prepare_for_caching(kernel)
+ buffer_array_cache.store_if_not_present(
+ cache_key, prepare_for_caching(kernel))
return kernel
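
# Sketch of the caching idiom switched to above (illustrative; the cache name
# is hypothetical): a WriteOncePersistentDict refuses overwrites, and
# store_if_not_present makes concurrent writers benign by keeping the first
# stored value.
from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder

cache = WriteOncePersistentDict("demo-cache-v0", key_builder=LoopyKeyBuilder())
cache.store_if_not_present("key", 42)
cache.store_if_not_present("key", 43)  # no-op: the first value wins
assert cache["key"] == 42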
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper
class ExtraInameIndexInserter(IdentityMapper):
def __init__(self, var_to_new_inames):
self.var_to_new_inames = var_to_new_inames
+ self.seen_ilp_inames = set()
def map_subscript(self, expr):
try:
@@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper):
index = (index,)
index = tuple(self.rec(i) for i in index)
+ self.seen_ilp_inames.update(v.name for v in new_idx)
return expr.aggregate.index(index + new_idx)
def map_variable(self, expr):
@@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper):
except KeyError:
return expr
else:
+ self.seen_ilp_inames.update(v.name for v in new_idx)
return expr.index(new_idx)
@@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
# }}}
from pymbolic import var
- eiii = ExtraInameIndexInserter(
- dict((var_name, tuple(var(iname) for iname in inames))
- for var_name, inames in six.iteritems(var_to_new_ilp_inames)))
-
- new_insns = [
- insn.with_transformed_expressions(eiii)
- for insn in kernel.instructions]
+ var_to_extra_iname = dict(
+ (var_name, tuple(var(iname) for iname in inames))
+ for var_name, inames in six.iteritems(var_to_new_ilp_inames))
+
+ new_insns = []
+
+ for insn in kernel.instructions:
+ eiii = ExtraInameIndexInserter(var_to_extra_iname)
+ new_insn = insn.with_transformed_expressions(eiii)
+ if not eiii.seen_ilp_inames <= insn.within_inames:
+
+ from loopy.diagnostic import warn_with_kernel
+ warn_with_kernel(
+ kernel,
+ "implicit_ilp_iname",
+ "Instruction '%s': touched variable that (for ILP) "
+ "required iname(s) '%s', but that the instruction was not "
+ "previously within the iname(s). Previously, this would "
+ "implicitly promote the instruction, but that behavior is "
+ "deprecated and will stop working in 2018.1."
+ % (insn.id, ", ".join(
+ eiii.seen_ilp_inames - insn.within_inames)))
+
+ new_insns.append(new_insn)
return kernel.copy(
temporary_variables=new_temp_vars,
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag]
- from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase,
+ from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase,
ForceSequentialTag)
# {{{ globbing
@@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
if iname not in kernel.all_inames():
raise ValueError("cannot tag '%s'--not known" % iname)
- if isinstance(new_tag, ParallelTag) \
+ if isinstance(new_tag, ConcurrentTag) \
and isinstance(old_tag, ForceSequentialTag):
raise ValueError("cannot tag '%s' as parallel--"
"iname requires sequential execution" % iname)
if isinstance(new_tag, ForceSequentialTag) \
- and isinstance(old_tag, ParallelTag):
+ and isinstance(old_tag, ConcurrentTag):
raise ValueError("'%s' is already tagged as parallel, "
"but is now prohibited from being parallel "
"(likely because of participation in a precompute or "
@@ -972,9 +972,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
# Get the duplication options as a tuple of iname and a set
for iname, insns in _get_iname_duplication_options(insn_deps):
# Check whether this iname has a parallel tag and discard it if so
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
if (iname in knl.iname_to_tag
- and isinstance(knl.iname_to_tag[iname], ParallelTag)):
+ and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
continue
 # If we find a duplication option and do not use boostable_into
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 2be78f8e5c25a3b48c195f52715f9d6453100e3b..37c5d85a1ade5c8f7fadb2c6a785cf7cea3dde40 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -301,4 +301,39 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False):
# }}}
+# {{{ uniquify_instruction_ids
+
+def uniquify_instruction_ids(kernel):
+ """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique
+ strings.
+
+ This function does *not* deduplicate existing instruction ids.
+ """
+
+ from loopy.kernel.creation import UniqueName
+
+ insn_ids = set(
+ insn.id for insn in kernel.instructions
+ if insn.id is not None and not isinstance(insn.id, UniqueName))
+
+ from pytools import UniqueNameGenerator
+ insn_id_gen = UniqueNameGenerator(insn_ids)
+
+ new_instructions = []
+
+ for insn in kernel.instructions:
+ if insn.id is None:
+ new_instructions.append(
+ insn.copy(id=insn_id_gen("insn")))
+ elif isinstance(insn.id, UniqueName):
+ new_instructions.append(
+ insn.copy(id=insn_id_gen(insn.id.name)))
+ else:
+ new_instructions.append(insn)
+
+ return kernel.copy(instructions=new_instructions)
+
+# }}}
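
# Usage sketch (illustrative): after the transform, every instruction id is a
# concrete, unique string that later passes can rely on.
import loopy as lp
from loopy.transform.instruction import uniquify_instruction_ids

knl = lp.make_kernel("{[i]: 0<=i<4}", ["a[i] = i", "b[i] = 2*a[i]"])
knl = uniquify_instruction_ids(knl)
ids = [insn.id for insn in knl.instructions]
assert len(set(ids)) == len(ids) and None not in ids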
+
+
# vim: foldmethod=marker
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 6077332c4fc4322ac7ffb02ade4a0e24c7066245..4755ca1774a15480a2c6b255380dd724e47f9042 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -811,7 +811,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
barrier_insn = BarrierInstruction(
id=barrier_insn_id,
depends_on=frozenset([compute_insn_id]),
- kind="global")
+ synchronization_kind="global",
+ mem_kind="global")
compute_dep_id = barrier_insn_id
added_compute_insns.append(barrier_insn)
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..b53488b486c6750742b269f47cfd4f08b8f8fab9 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -351,7 +351,8 @@ class TemporarySaver(object):
self.subkernel_to_slice_indices[subkernel])
def is_global_barrier(item):
- return isinstance(item, Barrier) and item.kind == "global"
+ return isinstance(item, Barrier) and \
+ item.synchronization_kind == "global"
try:
pre_barrier = next(item for item in
@@ -402,13 +403,13 @@ class TemporarySaver(object):
continue
from loopy.kernel.data import (
- GroupIndexTag, LocalIndexTag, ParallelTag)
+ GroupIndexTag, LocalIndexTag, ConcurrentTag)
if isinstance(tag, GroupIndexTag):
my_group_tags.append(tag)
elif isinstance(tag, LocalIndexTag):
my_local_tags.append(tag)
- elif isinstance(tag, ParallelTag):
+ elif isinstance(tag, ConcurrentTag):
raise LoopyError(
"iname '%s' is tagged with '%s' - only "
"group and local tags are supported for "
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -1,6 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
+import six
from loopy.symbolic import (
get_dependencies, SubstitutionMapper,
@@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
dfmapper = CallbackMapper(gather_exprs, WalkMapper())
for insn in kernel.instructions:
+ dfmapper(insn.assignees)
dfmapper(insn.expression)
for sr in six.itervalues(kernel.substitutions):
@@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
new_insns = []
for insn in kernel.instructions:
- new_expr = cbmapper(insn.expression)
- new_insns.append(insn.copy(expression=new_expr))
+ new_insns.append(insn.with_transformed_expressions(cbmapper))
from loopy.kernel.data import SubstitutionRule
new_substs = {
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 409cbbc5ebd5feb13b04eeba1671f639663bfcf1..6ffc1dff5220ab48c6c87ec29fec6e44d57ba133 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -237,6 +237,12 @@ class TypeInferenceMapper(CombineMapper):
else:
raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
+ def map_type_cast(self, expr):
+ subtype, = self.rec(expr.child)
+ if not issubclass(subtype.dtype.type, np.number):
+ raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type))
+ return [expr.type]
+
def map_subscript(self, expr):
return self.rec(expr.aggregate)
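
# Sketch of the rule added in map_type_cast above (illustrative): a TypeCast
# of a numeric child infers as the cast's target type, while casting a
# non-numeric child raises LoopyError during type inference.
import numpy as np
import loopy as lp
from pymbolic import var

expr = lp.TypeCast(np.int64, var("n"))  # fine once "n" infers as numeric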
diff --git a/loopy/version.py b/loopy/version.py
index 02244f55d0dbf207a4641c3ebf6cc33b536f0421..e142162729d5a374082fa853dcc763665f7dfe33 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -21,7 +21,7 @@ THE SOFTWARE.
"""
-VERSION = (2016, 2)
+VERSION = (2017, 2)
VERSION_STATUS = ""
VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
@@ -32,4 +32,4 @@ except ImportError:
else:
_islpy_version = islpy.version.VERSION_TEXT
-DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v69-islpy%s" % _islpy_version
diff --git a/requirements.txt b/requirements.txt
index 3ff69a123d10cc7bc6799ebfb8913bfd0eed839e..1a23022821116aea068b76eab72f9a5596694eea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
git+https://github.com/inducer/pytools.git
git+https://github.com/inducer/islpy.git
git+https://github.com/inducer/cgen.git
-git+https://github.com/pyopencl/pyopencl.git
+git+https://github.com/inducer/pyopencl.git
git+https://github.com/inducer/pymbolic.git
git+https://github.com/inducer/genpy.git
git+https://github.com/inducer/codepy.git
diff --git a/setup.py b/setup.py
index b8bc17d888aae8409000c936b487afb94a5250d0..b8f36d12559f05a47ef57dd06efd4761e3b3ad9a 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
],
install_requires=[
- "pytools>=2017.3",
+ "pytools>=2017.6",
"pymbolic>=2016.2",
"genpy>=2016.1.2",
"cgen>=2016.1",
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames):
logging.basicConfig(level=logging.INFO)
fortran_src = """
- subroutine dgemm(m,n,l,a,b,c)
+ subroutine dgemm(m,n,ell,a,b,c)
implicit none
- real*8 a(m,l),b(l,n),c(m,n)
- integer m,n,k,i,j,l
+ real*8 a(m,ell),b(ell,n),c(m,n)
+ integer m,n,k,i,j,ell
do j = 1,n
do i = 1,m
- do k = 1,l
+ do k = 1,ell
c(i,j) = c(i,j) + b(k,j)*a(i,k)
end do
end do
@@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames):
knl = lp.split_iname(knl, "k", 32)
knl = lp.assume(knl, "n mod 32 = 0")
knl = lp.assume(knl, "m mod 32 = 0")
- knl = lp.assume(knl, "l mod 16 = 0")
+ knl = lp.assume(knl, "ell mod 16 = 0")
knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames):
init_expression="0", store_expression="base+buffer")
ctx = ctx_factory()
- lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+ lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
@pytest.mark.xfail
@@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels():
def test_precompute_some_exist(ctx_factory):
fortran_src = """
- subroutine dgemm(m,n,l,a,b,c)
+ subroutine dgemm(m,n,ell,a,b,c)
implicit none
- real*8 a(m,l),b(l,n),c(m,n)
- integer m,n,k,i,j,l
+ real*8 a(m,ell),b(ell,n),c(m,n)
+ integer m,n,k,i,j,ell
do j = 1,n
do i = 1,m
- do k = 1,l
+ do k = 1,ell
c(i,j) = c(i,j) + b(k,j)*a(i,k)
end do
end do
@@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory):
knl = lp.split_iname(knl, "k", 8)
knl = lp.assume(knl, "n mod 8 = 0")
knl = lp.assume(knl, "m mod 8 = 0")
- knl = lp.assume(knl, "l mod 8 = 0")
+ knl = lp.assume(knl, "ell mod 8 = 0")
knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory):
ref_knl = knl
ctx = ctx_factory()
- lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+ lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
if __name__ == "__main__":
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory):
n = get_suitable_size(ctx)
m = n+12
- l = m+12
+ ell = m+12
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i gid = i/256
+ start = gid*256
+ for j
+ a[start + j] = a[start + j] + j
+ end
+ end
+ """,
+ seq_dependencies=True,
+ name="uniform_l",
+ target=PyOpenCLTarget(),
+ assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0]))
+
+ knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+ cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa
+
# }}}
@@ -1064,6 +1115,28 @@ def test_literal_local_barrier(ctx_factory):
lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+def test_local_barrier_mem_kind():
+ def __test_type(mtype, expected):
+ insn = '... lbarrier'
+ if mtype:
+ insn += '{mem_kind=%s}' % mtype
+ knl = lp.make_kernel(
+ "{ [i]: 0<=i {[i]: 0 <= i < n}",
+ """
+ <>tmp[i] = i
+ tmp[0] = 0
+ """,
+ fixed_parameters=dict(n=1))
+
+ knl(queue)
+
+
+def test_parameter_inference():
+ knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "")
+ assert knl.all_params() == set(["n"])
+
+
def test_execution_backend_can_cache_dtypes(ctx_factory):
# When the kernel is invoked, the execution backend uses it as a cache key
# for the type inference and scheduling cache. This tests to make sure that
@@ -2431,6 +2593,167 @@ def test_execution_backend_can_cache_dtypes(ctx_factory):
knl(queue)
+def test_preamble_with_separate_temporaries(ctx_factory):
+ from loopy.kernel.data import temp_var_scope as scopes
+ # create a function mangler
+
+ func_name = 'indirect'
+ func_arg_dtypes = (np.int32, np.int32, np.int32)
+ func_result_dtypes = (np.int32,)
+
+ def __indirectmangler(kernel, name, arg_dtypes):
+ """
+ A function that will return a :class:`loopy.kernel.data.CallMangleInfo`
+ to interface with the calling :class:`loopy.LoopKernel`
+ """
+ if name != func_name:
+ return None
+
+ from loopy.types import to_loopy_type
+ from loopy.kernel.data import CallMangleInfo
+
+ def __compare(d1, d2):
+ # compare dtypes ignoring atomic
+ return to_loopy_type(d1, for_atomic=True) == \
+ to_loopy_type(d2, for_atomic=True)
+
+ # check types
+ if len(func_arg_dtypes) != len(arg_dtypes):
+ raise Exception('Unexpected number of arguments provided to mangler '
+ '{}, expected {}, got {}'.format(
+ func_name, len(func_arg_dtypes), len(arg_dtypes)))
+
+ for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)):
+ if not __compare(d1, d2):
+ raise Exception('Argument at index {} for mangler {} does not '
+ 'match expected dtype. Expected {}, got {}'.
+ format(i, func_name, str(d1), str(d2)))
+
+ # get target for creation
+ target = arg_dtypes[0].target
+ return CallMangleInfo(
+ target_name=func_name,
+ result_dtypes=tuple(to_loopy_type(x, target=target) for x in
+ func_result_dtypes),
+ arg_dtypes=arg_dtypes)
+
+ # create the preamble generator
+ def create_preamble(arr):
+ def __indirectpreamble(preamble_info):
+ # find a function matching our name
+ func_match = next(
+ (x for x in preamble_info.seen_functions
+ if x.name == func_name), None)
+ desc = 'custom_funcs_indirect'
+ if func_match is not None:
+ from loopy.types import to_loopy_type
+ # check types
+ if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \
+ func_match.arg_dtypes:
+ # if match, create our temporary
+ var = lp.TemporaryVariable(
+ 'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape,
+ scope=scopes.GLOBAL, read_only=True)
+ # and code
+ code = """
+ int {name}(int start, int end, int match)
+ {{
+ int result = start;
+ for (int i = start + 1; i < end; ++i)
+ {{
+ if (lookup[i] == match)
+ result = i;
+ }}
+ return result;
+ }}
+ """.format(name=func_name)
+
+ # generate temporary variable code
+ from cgen import Initializer
+ from loopy.target.c import generate_array_literal
+ codegen_state = preamble_info.codegen_state.copy(
+ is_generating_device_code=True)
+ kernel = preamble_info.kernel
+ ast_builder = codegen_state.ast_builder
+ target = kernel.target
+ decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype)
+ decl = ast_builder.wrap_global_constant(
+ ast_builder.get_temporary_decl(
+ codegen_state, None, var,
+ decl_info))
+ if var.initializer is not None:
+ decl = Initializer(decl, generate_array_literal(
+ codegen_state, var, var.initializer))
+ # return generated code
+ yield (desc, '\n'.join([str(decl), code]))
+ return __indirectpreamble
+
+ # and finally create a test
+ n = 10
+ # for each entry come up with a random number of data points
+ num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32)
+ # turn into offsets
+ offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32)
+ # create lookup data
+ lookup = np.empty(0)
+ for i in num_data:
+ lookup = np.hstack((lookup, np.arange(i)))
+ lookup = np.asarray(lookup, dtype=np.int32)
+ # and create data array
+ data = np.random.rand(np.product(num_data))
+
+ # make kernel
+ kernel = lp.make_kernel('{[i]: 0 <= i < n}',
+ """
+ for i
+ <>ind = indirect(offsets[i], offsets[i + 1], 1)
+ out[i] = data[ind]
+ end
+ """,
+ [lp.GlobalArg('out', shape=('n',)),
+ lp.TemporaryVariable(
+ 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL,
+ read_only=True),
+ lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)],
+ )
+ # fix parameters, and add manglers / preamble
+ kernel = lp.fix_parameters(kernel, **{'n': n})
+ kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)])
+ kernel = lp.register_function_manglers(kernel, [__indirectmangler])
+
+ print(lp.generate_code(kernel)[0])
+ # and call (functionality unimportant, more that it compiles)
+ ctx = cl.create_some_context()
+ queue = cl.CommandQueue(ctx)
+ # check that it actually performs the lookup correctly
+ assert np.allclose(kernel(
+ queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1])
+
+
+def test_add_prefetch_works_in_lhs_index():
+ knl = lp.make_kernel(
+ "{ [n,k,l,k1,l1,k2,l2]: "
+ "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l]
+ a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1
+ a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2]
+ end
+ """,
+ [
+ lp.GlobalArg("a1,a1_out", None, "ndofs,2"),
+ lp.GlobalArg("a1_map", None, "nelements,3"),
+ "..."
+ ])
+
+ knl = lp.add_prefetch(knl, "a1_map", "k")
+
+ from loopy.symbolic import get_dependencies
+ for insn in knl.instructions:
+ assert "a1_map" not in get_dependencies(insn.assignees)
+
+
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_misc.py b/test/test_misc.py
index a22e424630255df4225586eeb9f0d62a03d5318f..0273948b38b28b85e42a600bffb65fbf86dcc554 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -92,26 +92,36 @@ def test_SetTrie():
s.add_or_update(set([1, 4]))
-class PicklableItem(object):
+class PickleDetector(object):
+ """Contains a class attribute which flags if any instance was unpickled.
+ """
- flags = {"unpickled": False}
+ @classmethod
+ def reset(cls):
+ cls.instance_unpickled = False
def __getstate__(self):
- return True
+ return {"state": self.state}
def __setstate__(self, state):
- PicklableItem.flags["unpickled"] = True
+ self.__class__.instance_unpickled = True
+ self.state = state["state"]
-def test_LazilyUnpicklingDictionary():
- def is_unpickled():
- return PicklableItem.flags["unpickled"]
+class PickleDetectorForLazilyUnpicklingDict(PickleDetector):
+ instance_unpickled = False
- from loopy.tools import LazilyUnpicklingDictionary
+ def __init__(self):
+ self.state = None
- mapping = LazilyUnpicklingDictionary({0: PicklableItem()})
- assert not is_unpickled()
+def test_LazilyUnpicklingDict():
+ from loopy.tools import LazilyUnpicklingDict
+
+ cls = PickleDetectorForLazilyUnpicklingDict
+ mapping = LazilyUnpicklingDict({0: cls()})
+
+ assert not cls.instance_unpickled
from pickle import loads, dumps
@@ -120,30 +130,160 @@ def test_LazilyUnpicklingDictionary():
# {{{ test lazy loading
mapping = loads(pickled_mapping)
- assert not is_unpickled()
+ assert not cls.instance_unpickled
list(mapping.keys())
- assert not is_unpickled()
- assert isinstance(mapping[0], PicklableItem)
- assert is_unpickled()
+ assert not cls.instance_unpickled
+ assert isinstance(mapping[0], cls)
+ assert cls.instance_unpickled
+
+ # }}}
+
+ # {{{ conversion
+
+ cls.reset()
+ mapping = loads(pickled_mapping)
+ dict(mapping)
+ assert cls.instance_unpickled
# }}}
# {{{ test multi round trip
mapping = loads(dumps(loads(pickled_mapping)))
- assert isinstance(mapping[0], PicklableItem)
+ assert isinstance(mapping[0], cls)
# }}}
# {{{ test empty map
- mapping = LazilyUnpicklingDictionary({})
+ mapping = LazilyUnpicklingDict({})
mapping = loads(dumps(mapping))
assert len(mapping) == 0
# }}}
+class PickleDetectorForLazilyUnpicklingList(PickleDetector):
+ instance_unpickled = False
+
+ def __init__(self):
+ self.state = None
+
+
+def test_LazilyUnpicklingList():
+ from loopy.tools import LazilyUnpicklingList
+
+ cls = PickleDetectorForLazilyUnpicklingList
+ lst = LazilyUnpicklingList([cls()])
+ assert not cls.instance_unpickled
+
+ from pickle import loads, dumps
+ pickled_lst = dumps(lst)
+
+ # {{{ test lazy loading
+
+ lst = loads(pickled_lst)
+ assert not cls.instance_unpickled
+ assert isinstance(lst[0], cls)
+ assert cls.instance_unpickled
+
+ # }}}
+
+ # {{{ conversion
+
+ cls.reset()
+ lst = loads(pickled_lst)
+ list(lst)
+ assert cls.instance_unpickled
+
+ # }}}
+
+ # {{{ test multi round trip
+
+ lst = loads(dumps(loads(dumps(lst))))
+ assert isinstance(lst[0], cls)
+
+ # }}}
+
+ # {{{ test empty list
+
+ lst = LazilyUnpicklingList([])
+ lst = loads(dumps(lst))
+ assert len(lst) == 0
+
+ # }}}
+
+
+class PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing(
+ PickleDetector):
+ instance_unpickled = False
+
+ def __init__(self, comparison_key):
+ self.state = comparison_key
+
+ def __repr__(self):
+ return repr(self.state)
+
+ def update_persistent_hash(self, key_hash, key_builder):
+ key_builder.rec(key_hash, repr(self))
+
+
+def test_LazilyUnpicklingListWithEqAndPersistentHashing():
+ from loopy.tools import LazilyUnpicklingListWithEqAndPersistentHashing
+
+ cls = PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing
+ from pickle import loads, dumps
+
+ # {{{ test comparison of a pair of lazy lists
+
+ lst0 = LazilyUnpicklingListWithEqAndPersistentHashing(
+ [cls(0), cls(1)],
+ eq_key_getter=repr,
+ persistent_hash_key_getter=repr)
+ lst1 = LazilyUnpicklingListWithEqAndPersistentHashing(
+ [cls(0), cls(1)],
+ eq_key_getter=repr,
+ persistent_hash_key_getter=repr)
+
+ assert not cls.instance_unpickled
+
+ assert lst0 == lst1
+ assert not cls.instance_unpickled
+
+ lst0 = loads(dumps(lst0))
+ lst1 = loads(dumps(lst1))
+
+ assert lst0 == lst1
+ assert not cls.instance_unpickled
+
+ lst0.append(cls(3))
+ lst1.append(cls(2))
+
+ assert lst0 != lst1
+
+ # }}}
+
+ # {{{ comparison with plain lists
+
+ lst = [cls(0), cls(1), cls(3)]
+
+ assert lst == lst0
+ assert lst0 == lst
+ assert not cls.instance_unpickled
+
+ # }}}
+
+ # {{{ persistent hashing
+
+ from loopy.tools import LoopyKeyBuilder
+ kb = LoopyKeyBuilder()
+
+ assert kb(lst0) == kb(lst)
+ assert not cls.instance_unpickled
+
+ # }}}
+
+
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 0de08f5f616937604bc2c93581c5a8a1770164f4..eff3dbd0e07439bbec399479183a7e9ddb69b9ff 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -28,6 +28,7 @@ import pytest
import loopy as lp
import pyopencl as cl
import sys
+import os
pytestmark = pytest.mark.importorskip("fparser")
@@ -49,7 +50,7 @@ __all__ = [
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa
ctx = ctx_factory()
- filename = "strongVolumeKernels.f90"
+ filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90")
with open(filename, "r") as sourcef:
source = sourcef.read()
diff --git a/test/test_reduction.py b/test/test_reduction.py
index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory):
"{[j]: 0<=j sumlen = l[i]",
+ "<> sumlen = ell[i]",
"a[i] = sum(j, j)",
],
[
lp.ValueArg("n", np.int32),
lp.GlobalArg("a", dtype, ("n",)),
- lp.GlobalArg("l", np.int32, ("n",)),
+ lp.GlobalArg("ell", np.int32, ("n",)),
])
cknl = lp.CompiledKernel(ctx, knl)
n = 330
- l = np.arange(n, dtype=np.int32)
- evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+ ell = np.arange(n, dtype=np.int32)
+ evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True)
- tgt_result = (2*l-1)*2*l/2
+ tgt_result = (2*ell-1)*2*ell/2
assert (a == tgt_result).all()
@@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory):
assert max_index == np.argmax(np.abs(a))
+def test_reduction_with_conditional():
+ # Test whether realization of a reduction inherits predicates
+ # of the original instruction. Tested with the CTarget, because
+ # the PyOpenCL target will hoist the conditional into the host
+ # code in this minimal example.
+ knl = lp.make_kernel(
+ "{ [i] : 0<=i<42 }",
+ """
+ if n > 0
+ <>b = sum(i, a[i])
+ end
+ """,
+ [lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
+ lp.GlobalArg("n", dtype=np.float32, shape=())],
+ target=lp.CTarget())
+ code = lp.generate_body(knl)
+
+ # Check that the if appears before the loop that realizes the reduction.
+ assert code.index("if") < code.index("for")
+
+
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index a72b62af90050008f837e144f1f28d4a4de1c730..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -37,14 +37,14 @@ from pymbolic.primitives import Variable
def test_op_counter_basic():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+ e[i,k] = if(
+ not(k6 or k/2==ell,
+ g[i,k]*2,
+ g[i,k]+h[i,k]/2)
"""
],
- name="logic", assumptions="n,m,l >= 1")
+ name="logic", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
op_map = lp.get_op_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
@@ -118,14 +121,14 @@ def test_op_counter_logic():
def test_op_counter_specialops():
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i> k))
"""
],
- name="bitwise", assumptions="n,m,l >= 1")
+ name="bitwise", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(
knl, dict(
@@ -169,16 +172,16 @@ def test_op_counter_bitwise():
op_map = lp.get_op_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
- assert i32add == n*m+n*m*l
- assert i32bw == 2*n*m*l
+ assert i32add == n*m+n*m*ell
+ assert i32bw == 2*n*m*ell
assert i64bw == 2*n*m
assert i64add == i64mul == n*m
assert i64shift == 2*n*m
@@ -218,22 +221,22 @@ def test_op_counter_triangular_domain():
def test_mem_access_counter_basic():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+ e[i,k] = if(not(k6 or k/2==ell,
+ g[i,k]*2,
+ g[i,k]+h[i,k]/2)
"""
],
- name="logic", assumptions="n,m,l >= 1")
+ name="logic", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -332,22 +337,22 @@ def test_mem_access_counter_logic():
def test_mem_access_counter_specialops():
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i> k))
"""
],
- name="bitwise", assumptions="n,m,l >= 1")
+ name="bitwise", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(
knl, dict(
@@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise():
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
i32 = mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='load', variable='a')
].eval_with_dict(params)
@@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise():
i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
stride=0, direction='load', variable='h')
].eval_with_dict(params)
- assert i32 == 4*n*m+2*n*m*l
+ assert i32 == 4*n*m+2*n*m*ell
i32 = mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='store', variable='c')
@@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise():
i32 += mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='store', variable='e')
].eval_with_dict(params)
- assert i32 == n*m+n*m*l
+ assert i32 == n*m+n*m*ell
def test_mem_access_counter_mixed():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
+ "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
[
"""
c[i,j,k] = 2*a[i,j,k] {id=first}
@@ -620,8 +625,8 @@ def test_barrier_counter_barriers():
print(sync_map)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
barrier_count = sync_map["barrier_local"].eval_with_dict(params)
assert barrier_count == 50*10*2
@@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul():
bsize = 16
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i]: 0<=i 1:
exec(sys.argv[1])
diff --git a/test/test_target.py b/test/test_target.py
index ad0cb7439bfdd6200e020c0becadcd73072ceef4..aa6f004634f207a7b9733da4a3d7e06d13d7db7c 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -140,6 +140,32 @@ def test_generate_c_snippet():
print(lp.generate_body(knl))
+def test_c_min_max():
+ # Test that fmin() and fmax() are generated for the C backend instead of
+ # min() and max()
+ from loopy.target.c import CTarget
+ import pymbolic.primitives as p
+ i = p.Variable("i")
+ xi = p.Subscript(p.Variable("x"), i)
+ yi = p.Subscript(p.Variable("y"), i)
+ zi = p.Subscript(p.Variable("z"), i)
+
+ n = 100
+ domain = "{[i]: 0<=i<%d}" % n
+ data = [lp.GlobalArg("x", np.float64, shape=(n,)),
+ lp.GlobalArg("y", np.float64, shape=(n,)),
+ lp.GlobalArg("z", np.float64, shape=(n,))]
+
+ inst = [lp.Assignment(xi, p.Variable("min")(yi, zi))]
+ knl = lp.make_kernel(domain, inst, data, target=CTarget())
+ code = lp.generate_code_v2(knl).device_code()
+ assert "fmin" in code
+
+ inst = [lp.Assignment(xi, p.Variable("max")(yi, zi))]
+ knl = lp.make_kernel(domain, inst, data, target=CTarget())
+ code = lp.generate_code_v2(knl).device_code()
+ assert "fmax" in code
+
+
@pytest.mark.parametrize("tp", ["f32", "f64"])
def test_random123(ctx_factory, tp):
ctx = ctx_factory()
@@ -240,6 +266,44 @@ def test_numba_cuda_target():
print(lp.generate_code_v2(knl).all_code())
+def test_sized_integer_c_codegen(ctx_factory):
+ ctx = ctx_factory()
+ queue = cl.CommandQueue(ctx)
+
+ from pymbolic import var
+ knl = lp.make_kernel(
+ "{[i]: 0<=i ctr = make_uint2(0, 0)",
+ lp.Assignment("a[i]", lp.TypeCast(np.int64, var("ctr")) << var("i"))]
+ )
+
+ with pytest.raises(lp.LoopyError):
+ knl = lp.preprocess_kernel(knl)
+
+
+def test_target_invalid_type_cast():
+ dtype = np.dtype([('', '<u4'), ('', '<i4')])
+
+ with pytest.raises(lp.LoopyError):
+ lp.TypeCast(dtype, 1)
+
+
 if __name__ == "__main__":
 if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_transform.py b/test/test_transform.py
index b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e..e50605b46672f8e9c1817431f1577742b1f6fb4c 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -105,6 +105,27 @@ def test_to_batched(ctx_factory):
bknl(queue, a=a, x=x)
+def test_add_barrier(ctx_factory):
+ ctx = ctx_factory()
+ queue = cl.CommandQueue(ctx)
+ knl = lp.make_kernel(
+ "{[i, j, ii, jj]: 0<=i,j, ii, jj 1:
exec(sys.argv[1])