diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL:
- pocl
except:
- tags
+ retry: 2
Python 3.6 POCL:
script:
diff --git a/doc/index.rst b/doc/index.rst
index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu
(See the full example for how to print the generated code.)
+.. _static-binary:
+
Want to try out loopy?
----------------------
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
-`download a self-contained Linux binary `_.
+`download a self-contained Linux binary `_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
diff --git a/doc/misc.rst b/doc/misc.rst
index 9db3b85a7d96c9ccf56592bcefb2b8639984f4f8..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -3,6 +3,18 @@
Installation
============
+Option 0: Static Binary
+-----------------------
+
+If you would just like to experiment with :mod:`loopy`'s code transformation
+abilities, the easiest way to get loopy is to download a statically-linked
+Linux binary.
+
+See :ref:`static-binary` for details.
+
+Option 1: From Source, no PyOpenCL integration
+-----------------------------------------------
+
This command should install :mod:`loopy`::
pip install loo.py
@@ -26,10 +38,59 @@ You may also clone its git repository::
git clone --recursive git://github.com/inducer/loopy
git clone --recursive http://git.tiker.net/trees/loopy.git
+Option 2: From Conda Forge, with PyOpenCL integration
+-----------------------------------------------------
+
+This set of instructions is intended for computers running
+64-bit Linux or macOS:
+
+#. Make sure your system has the basics to build software.
+
+ On Debian derivatives (Ubuntu and many more),
+ installing ``build-essential`` should do the trick.
+
+ Everywhere else, just making sure you have the ``g++`` package should be
+ enough.
+
+#. Install `miniconda `_.
+ (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.)
+
+#. ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3``
+
+ If you accepted the default location, this should work:
+
+ ``export CONDA=$HOME/miniconda3``
+
+#. ``$CONDA/bin/conda create -n dev``
+
+#. ``source $CONDA/bin/activate dev``
+
+#. ``conda config --add channels conda-forge``
+
+#. ``conda install git pip pocl islpy pyopencl`` (Linux)
+
+ or
+
+ ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
+
+#. Type the following command::
+
+ pip install git+https://github.com/inducer/loopy
+
+Next time you want to use :mod:`loopy`, just run the following command::
+
+ source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev
+
+You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
+
+See the `PyOpenCL installation instructions
+`_ for options
+regarding OpenCL drivers.
+
User-visible Changes
====================
-Version 2016.2
+Version 2017.2
--------------
.. note::
@@ -57,7 +118,7 @@ Licensing
Loopy is licensed to you under the MIT/X Consortium license:
-Copyright (c) 2009-13 Andreas Klöckner and Contributors.
+Copyright (c) 2009-17 Andreas Klöckner and Contributors.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 85716fd93ff4768e8ec075c8afa7f0a9b0363999..3f01b0764f71e9ce2de86a66cc71f56473a7dc9f 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -130,6 +130,7 @@ Iname Implementation Tags
Tag Meaning
=============================== ====================================================
``None`` | ``"for"`` Sequential loop
+``"ord"`` Forced-order sequential loop
``"l.N"`` Local (intra-group) axis N ("local")
``"g.N"`` Group-number axis N ("group")
``"unr"`` Unroll
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 12c058fb741279db55521118f6711f197735dbd0..8b85387259228777f028fb70b1c0cf2efcc2d2ef 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it.
i: None
---------------------------------------------------------------------------
INSTRUCTIONS:
- [i] out[i] <- 2*a[i] # insn
+ for i
+ out[i] = 2*a[i] {id=insn}
+ end i
---------------------------------------------------------------------------
You'll likely have noticed that there's quite a bit more information here
@@ -1212,11 +1214,11 @@ should call :func:`loopy.get_one_scheduled_kernel`:
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
- 1: [maketmp] tmp <- arr[i_inner + i_outer*16]
+ 1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
2: RETURN FROM KERNEL rotate_v2
- 3: ---BARRIER:global---
+ 3: ... gbarrier
4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
- 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+ 5: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate}
6: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
@@ -1250,13 +1252,13 @@ put those instructions into the schedule.
---------------------------------------------------------------------------
SCHEDULE:
0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
- 1: [maketmp] tmp <- arr[i_inner + i_outer*16]
- 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp
+ 1: tmp = arr[i_inner + i_outer*16] {id=maketmp}
+ 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save}
3: RETURN FROM KERNEL rotate_v2
- 4: ---BARRIER:global---
+ 4: ... gbarrier
5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
- 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]
- 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+ 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload}
+ 7: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate}
8: RETURN FROM KERNEL rotate_v2_0
---------------------------------------------------------------------------
diff --git a/loopy/check.py b/loopy/check.py
index a8ec1ad35e42410454b36fa38ef5f0a2fbefc0d6..6bac368381c708b72b2b7f235792df97d0bcd15e 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -144,20 +144,20 @@ def check_for_inactive_iname_access(kernel):
def _is_racing_iname_tag(tv, tag):
from loopy.kernel.data import (temp_var_scope,
- LocalIndexTagBase, GroupIndexTag, ParallelTag, auto)
+ LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)
if tv.scope == temp_var_scope.PRIVATE:
return (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))
elif tv.scope == temp_var_scope.LOCAL:
return (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not isinstance(tag, GroupIndexTag))
elif tv.scope == temp_var_scope.GLOBAL:
- return isinstance(tag, ParallelTag)
+ return isinstance(tag, ConcurrentTag)
elif tv.scope == auto:
raise LoopyError("scope of temp var '%s' has not yet been"
@@ -169,7 +169,7 @@ def _is_racing_iname_tag(tv, tag):
def check_for_write_races(kernel):
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
iname_to_tag = kernel.iname_to_tag.get
for insn in kernel.instructions:
@@ -190,7 +190,7 @@ def check_for_write_races(kernel):
raceable_parallel_insn_inames = set(
iname
for iname in kernel.insn_inames(insn)
- if isinstance(iname_to_tag(iname), ParallelTag))
+ if isinstance(iname_to_tag(iname), ConcurrentTag))
elif assignee_name in kernel.temporary_variables:
temp_var = kernel.temporary_variables[assignee_name]
@@ -230,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel):
def check_for_data_dependent_parallel_bounds(kernel):
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
for i, dom in enumerate(kernel.domains):
dom_inames = set(dom.get_var_names(dim_type.set))
par_inames = set(iname
for iname in dom_inames
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
if not par_inames:
continue
@@ -401,7 +401,7 @@ def pre_schedule_checks(kernel):
logger.debug("%s: pre-schedule check: done" % kernel.name)
except KeyboardInterrupt:
raise
- except:
+ except Exception:
print(75*"=")
print("failing kernel during pre-schedule check:")
print(75*"=")
@@ -659,7 +659,7 @@ def pre_codegen_checks(kernel):
check_that_shapes_and_strides_are_arguments(kernel)
logger.debug("pre-codegen check %s: done" % kernel.name)
- except:
+ except Exception:
print(75*"=")
print("failing kernel during pre-schedule check:")
print(75*"=")
@@ -708,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
(insn_impl_domain & assumptions)
.project_out_except(insn_inames, [dim_type.set]))
+ from loopy.kernel.instruction import BarrierInstruction
+ from loopy.kernel.data import LocalIndexTag
+ if isinstance(insn, BarrierInstruction):
+ # project out local-id-mapped inames, solves #94 on gitlab
+ non_lid_inames = frozenset(
+ [iname for iname in insn_inames if not isinstance(
+ kernel.iname_to_tag.get(iname), LocalIndexTag)])
+ insn_impl_domain = insn_impl_domain.project_out_except(
+ non_lid_inames, [dim_type.set])
+
insn_domain = kernel.get_inames_domain(insn_inames)
insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
@@ -715,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
.project_out_except(insn_inames, [dim_type.set])
.project_out_except(insn_parameters, [dim_type.param]))
+ if isinstance(insn, BarrierInstruction):
+ # project out local-id-mapped inames, solves #94 on gitlab
+ desired_domain = desired_domain.project_out_except(
+ non_lid_inames, [dim_type.set])
+
insn_impl_domain = (insn_impl_domain
.project_out_except(insn_parameters, [dim_type.param]))
insn_impl_domain, desired_domain = align_two(
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 07bcdc7c6c4a0c23d374a14bc21e4e161b73be03..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn
from pytools import ImmutableRecord
import islpy as isl
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -357,8 +357,9 @@ class CodeGenerationState(object):
# }}}
-code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
- key_builder=LoopyKeyBuilder())
+code_gen_cache = WriteOncePersistentDict(
+ "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
+ key_builder=LoopyKeyBuilder())
class PreambleInfo(ImmutableRecord):
@@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord):
.. attribute:: seen_dtypes
.. attribute:: seen_functions
.. attribute:: seen_atomic_dtypes
+ .. attribute:: codegen_state
"""
@@ -495,7 +497,9 @@ def generate_code_v2(kernel):
seen_dtypes=seen_dtypes,
seen_functions=seen_functions,
# a set of LoopyTypes (!)
- seen_atomic_dtypes=seen_atomic_dtypes)
+ seen_atomic_dtypes=seen_atomic_dtypes,
+ codegen_state=codegen_state
+ )
preamble_generators = (kernel.preamble_generators
+ kernel.target.get_device_ast_builder().preamble_generators())
@@ -515,7 +519,7 @@ def generate_code_v2(kernel):
logger.info("%s: generate code: done" % kernel.name)
if CACHING_ENABLED:
- code_gen_cache[input_kernel] = codegen_result
+ code_gen_cache.store_if_not_present(input_kernel, codegen_result)
return codegen_result
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai
def get_usable_inames_for_conditional(kernel, sched_index):
from loopy.schedule import (
find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
- from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag
+ from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag
result = find_active_inames_at(kernel, sched_index)
crosses_barrier = has_barrier_within(kernel, sched_index)
@@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
# at the innermost level of nesting.
if (
- isinstance(tag, ParallelTag)
+ isinstance(tag, ConcurrentTag)
and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier)
and not isinstance(tag, IlpBaseTag)
):
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 789c00d33b7bb41816e6901e24046d4b0eefb27d..5240042337163f0aefcbc7fdb8f3151ac280053f 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
kernel = codegen_state.kernel
- from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
+ from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag
from loopy.schedule import find_active_inames_at, has_barrier_within
result = find_active_inames_at(kernel, sched_index)
@@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
has_barrier = has_barrier_within(kernel, sched_index)
for iname, tag in six.iteritems(kernel.iname_to_tag):
- if (isinstance(tag, HardwareParallelTag)
+ if (isinstance(tag, HardwareConcurrentTag)
and codegen_state.is_generating_device_code):
if not has_barrier or not isinstance(tag, LocalIndexTag):
result.add(iname)
@@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index):
generate_sequential_loop_dim_code)
from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
- LoopedIlpTag, VectorizeTag)
+ LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag)
if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
func = generate_unroll_loop
elif isinstance(tag, VectorizeTag):
func = generate_vectorize_loop
- elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
+ elif tag is None or isinstance(tag, (
+ LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)):
func = generate_sequential_loop_dim_code
else:
raise RuntimeError("encountered (invalid) EnterLoop "
@@ -240,6 +241,15 @@ def build_loop_nest(codegen_state, schedule_index):
kernel = codegen_state.kernel
+ # If the AST builder does not implement conditionals, we can save us
+ # some work about hoisting conditionals and directly go into recursion.
+ if not codegen_state.ast_builder.can_implement_conditionals:
+ result = []
+ inner = generate_code_for_sched_index(codegen_state, schedule_index)
+ if inner is not None:
+ result.append(inner)
+ return merge_codegen_results(codegen_state, result)
+
# {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
# i.e. go up to the next LeaveLoop, and skip over inner loops.
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 1a132049731cd094ba5665857f1afa4f9b04684a..1db7b0445efd2a2e27e761164fa919647df37a07 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
kernel = codegen_state.kernel
from loopy.kernel.data import (
- UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)
+ UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag)
from loopy.schedule import get_insn_ids_for_block_at
insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
hw_inames_left = [iname
for iname in all_inames_by_insns
- if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
+ if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)]
if not hw_inames_left:
return next_func(codegen_state)
diff --git a/loopy/execution.py b/loopy/execution.py
index 07e28f06d33e5884ac57c9505593c9ee916c3171..a1228f8f3bb3493e83936ee0b3998bbd5b8cdcc2 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -31,7 +31,7 @@ from loopy.diagnostic import LoopyError
import logging
logger = logging.getLogger(__name__)
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -120,7 +120,7 @@ class SeparateArrayPackingController(object):
# {{{ KernelExecutorBase
-typed_and_scheduled_cache = PersistentDict(
+typed_and_scheduled_cache = WriteOncePersistentDict(
"loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -204,7 +204,7 @@ class KernelExecutorBase(object):
kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
if CACHING_ENABLED:
- typed_and_scheduled_cache[cache_key] = kernel
+ typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
return kernel
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index f7ce5d9fc983c2ab946b5d959f283ef9328b7f29..49ab3fd68303e18a6bec371fc54db4e63f57346d 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set):
from loopy.symbolic import aff_from_expr
try:
aff = aff_from_expr(space, -expr-1)
- except:
+ except Exception:
return None
expr_neg_set = isl.BasicSet.universe(space).add_constraint(
isl.Constraint.inequality_from_aff(aff))
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 084c37b45cc4af25689ae3e121f170382c4e8d16..cad11fc78075342a1c270f68486900ead65a95fd 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -40,6 +40,7 @@ from loopy.library.function import (
single_arg_function_mangler)
from loopy.diagnostic import CannotBranchDomainTree, LoopyError
+from loopy.tools import natsorted
# {{{ unique var names
@@ -701,12 +702,12 @@ class LoopKernel(ImmutableRecordWithoutPickling):
tag_key_uses = {}
- from loopy.kernel.data import HardwareParallelTag
+ from loopy.kernel.data import HardwareConcurrentTag
for iname in cond_inames:
tag = self.iname_to_tag.get(iname)
- if isinstance(tag, HardwareParallelTag):
+ if isinstance(tag, HardwareConcurrentTag):
tag_key_uses.setdefault(tag.key, []).append(iname)
multi_use_keys = set(
@@ -716,7 +717,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
multi_use_inames = set()
for iname in cond_inames:
tag = self.iname_to_tag.get(iname)
- if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
+ if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys:
multi_use_inames.add(iname)
return frozenset(cond_inames - multi_use_inames)
@@ -958,7 +959,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
try:
# insist block size is constant
size = static_max_of_pw_aff(size,
- constants_only=isinstance(tag, LocalIndexTag))
+ constants_only=isinstance(tag, LocalIndexTag),
+ context=self.assumptions)
except ValueError:
pass
@@ -1128,20 +1130,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
else:
sep = []
- def natorder(key):
- # Return natural ordering for strings, as opposed to dictionary order.
- # E.g. will result in
- # 'abc1' < 'abc9' < 'abc10'
- # rather than
- # 'abc1' < 'abc10' < 'abc9'
- # Based on
- # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
- import re
- return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
-
- def natsorted(seq, key=lambda x: x):
- return sorted(seq, key=lambda y: natorder(key(y)))
-
if "name" in what:
lines.extend(sep)
lines.append("KERNEL: " + kernel.name)
@@ -1187,113 +1175,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
lines.extend(sep)
if show_labels:
lines.append("INSTRUCTIONS:")
- loop_list_width = 35
-
- # {{{ topological sort
-
- printed_insn_ids = set()
- printed_insn_order = []
-
- def insert_insn_into_order(insn):
- if insn.id in printed_insn_ids:
- return
- printed_insn_ids.add(insn.id)
-
- for dep_id in natsorted(insn.depends_on):
- insert_insn_into_order(kernel.id_to_insn[dep_id])
-
- printed_insn_order.append(insn)
-
- for insn in kernel.instructions:
- insert_insn_into_order(insn)
-
- # }}}
-
- import loopy as lp
-
- Fore = self.options._fore # noqa
- Style = self.options._style # noqa
-
- from loopy.kernel.tools import draw_dependencies_as_unicode_arrows
- for insn, (arrows, extender) in zip(
- printed_insn_order,
- draw_dependencies_as_unicode_arrows(
- printed_insn_order, fore=Fore, style=Style)):
-
- if isinstance(insn, lp.MultiAssignmentBase):
- lhs = ", ".join(str(a) for a in insn.assignees)
- rhs = str(insn.expression)
- trailing = []
- elif isinstance(insn, lp.CInstruction):
- lhs = ", ".join(str(a) for a in insn.assignees)
- rhs = "CODE(%s|%s)" % (
- ", ".join(str(x) for x in insn.read_variables),
- ", ".join("%s=%s" % (name, expr)
- for name, expr in insn.iname_exprs))
-
- trailing = [" "+l for l in insn.code.split("\n")]
- elif isinstance(insn, lp.BarrierInstruction):
- lhs = ""
- rhs = "... %sbarrier" % insn.kind[0]
- trailing = []
-
- elif isinstance(insn, lp.NoOpInstruction):
- lhs = ""
- rhs = "... nop"
- trailing = []
- else:
- raise LoopyError("unexpected instruction type: %s"
- % type(insn).__name__)
-
- order = self._get_iname_order_for_printing()
- loop_list = ",".join(
- sorted(kernel.insn_inames(insn), key=lambda iname: order[iname]))
-
- options = [Fore.GREEN+insn.id+Style.RESET_ALL]
- if insn.priority:
- options.append("priority=%d" % insn.priority)
- if insn.tags:
- options.append("tags=%s" % ":".join(insn.tags))
- if isinstance(insn, lp.Assignment) and insn.atomicity:
- options.append("atomic=%s" % ":".join(
- str(a) for a in insn.atomicity))
- if insn.groups:
- options.append("groups=%s" % ":".join(insn.groups))
- if insn.conflicts_with_groups:
- options.append(
- "conflicts=%s" % ":".join(insn.conflicts_with_groups))
- if insn.no_sync_with:
- options.append("no_sync_with=%s" % ":".join(
- "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
-
- if lhs:
- core = "%s <- %s" % (
- Fore.CYAN+lhs+Style.RESET_ALL,
- Fore.MAGENTA+rhs+Style.RESET_ALL,
- )
- else:
- core = Fore.MAGENTA+rhs+Style.RESET_ALL
-
- if len(loop_list) > loop_list_width:
- lines.append("%s [%s]" % (arrows, loop_list))
- lines.append("%s %s%s # %s" % (
- extender,
- (loop_list_width+2)*" ",
- core,
- ", ".join(options)))
- else:
- lines.append("%s [%s]%s%s # %s" % (
- arrows,
- loop_list, " "*(loop_list_width-len(loop_list)),
- core,
- ",".join(options)))
-
- lines.extend(trailing)
-
- if insn.predicates:
- lines.append(10*" " + "if (%s)" % " && ".join(
- [str(x) for x in insn.predicates]))
+ from loopy.kernel.tools import stringify_instruction_list
+ lines.extend(stringify_instruction_list(kernel))
dep_lines = []
for insn in kernel.instructions:
@@ -1474,6 +1358,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
return hash(key_hash.digest())
def __eq__(self, other):
+ if self is other:
+ return True
+
if not isinstance(other, LoopKernel):
return False
@@ -1487,7 +1374,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
return False
elif field_name == "assumptions":
- if not self.assumptions.plain_is_equal(other.assumptions):
+ if not (
+ self.assumptions.plain_is_equal(other.assumptions)
+ or self.assumptions.is_equal(other.assumptions)):
return False
elif getattr(self, field_name) != getattr(other, field_name):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e4cb17657632c53120e56aacf29b20bc0778d73f..dcac16479e368908f50f5dff1ef0f4c0edcc3e7b 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -439,7 +439,7 @@ def parse_insn(groups, insn_options):
if "lhs" in groups:
try:
lhs = parse(groups["lhs"])
- except:
+ except Exception:
print("While parsing left hand side '%s', "
"the following error occurred:" % groups["lhs"])
raise
@@ -448,7 +448,7 @@ def parse_insn(groups, insn_options):
try:
rhs = parse(groups["rhs"])
- except:
+ except Exception:
print("While parsing right hand side '%s', "
"the following error occurred:" % groups["rhs"])
raise
@@ -522,14 +522,14 @@ def parse_subst_rule(groups):
from loopy.symbolic import parse
try:
lhs = parse(groups["lhs"])
- except:
+ except Exception:
print("While parsing left hand side '%s', "
"the following error occurred:" % groups["lhs"])
raise
try:
rhs = parse(groups["rhs"])
- except:
+ except Exception:
print("While parsing right hand side '%s', "
"the following error occurred:" % groups["rhs"])
raise
@@ -901,7 +901,8 @@ def parse_instructions(instructions, defines):
obj = insn_options_stack.pop()
#if this object is the end of an if statement
if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
- if_predicates_stack[-1]["insn_predicates"]:
+ if_predicates_stack[-1]["insn_predicates"] and\
+ obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
if_predicates_stack.pop()
continue
@@ -996,7 +997,7 @@ def parse_domains(domains, defines):
try:
dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom)
- except:
+ except Exception:
print("failed to parse domain '%s'" % dom)
raise
else:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord):
return type(self).__name__
-class ParallelTag(IndexTag):
+class ConcurrentTag(IndexTag):
pass
-class HardwareParallelTag(ParallelTag):
+class HardwareConcurrentTag(ConcurrentTag):
pass
+# deprecated aliases
+ParallelTag = ConcurrentTag
+HardwareParallelTag = HardwareConcurrentTag
+
+
class UniqueTag(IndexTag):
pass
@@ -105,11 +110,11 @@ class AxisTag(UniqueTag):
self.print_name, self.axis)
-class GroupIndexTag(HardwareParallelTag, AxisTag):
+class GroupIndexTag(HardwareConcurrentTag, AxisTag):
print_name = "g"
-class LocalIndexTagBase(HardwareParallelTag):
+class LocalIndexTagBase(HardwareConcurrentTag):
pass
@@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
# {{{ ilp-like
-class IlpBaseTag(ParallelTag):
+class IlpBaseTag(ConcurrentTag):
pass
@@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag):
return "forceseq"
+class InOrderSequentialSequentialTag(IndexTag):
+ def __str__(self):
+ return "ord"
+
+
def parse_tag(tag):
if tag is None:
return tag
@@ -173,6 +183,8 @@ def parse_tag(tag):
if tag == "for":
return None
+ elif tag == "ord":
+ return InOrderSequentialSequentialTag()
elif tag in ["unr"]:
return UnrollTag()
elif tag in ["vec"]:
@@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase):
A :class:`bool` indicating whether the variable may be written during
its lifetime. If *True*, *initializer* must be given.
+
+ .. attribute:: _base_storage_access_may_be_aliasing
+
+ Whether the temporary is used to alias the underlying base storage.
+ Defaults to *False*. If *False*, C-based code generators will declare
+ the temporary as a ``restrict`` const pointer to the base storage
+ memory location. If *True*, the restrict part is omitted on this
+ declaration.
"""
min_target_axes = 0
@@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase):
"base_storage",
"initializer",
"read_only",
+ "_base_storage_access_may_be_aliasing",
]
def __init__(self, name, dtype=None, shape=(), scope=auto,
dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
base_indices=None, storage_shape=None,
- base_storage=None, initializer=None, read_only=False, **kwargs):
+ base_storage=None, initializer=None, read_only=False,
+ _base_storage_access_may_be_aliasing=False, **kwargs):
"""
:arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
:arg shape: :class:`loopy.auto` or a shape tuple
@@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase):
"mutually exclusive"
% name)
+ if base_storage is None and _base_storage_access_may_be_aliasing:
+ raise LoopyError(
+ "temporary variable '%s': "
+ "_base_storage_access_may_be_aliasing option, but no "
+ "base_storage given!"
+ % name)
+
ArrayBase.__init__(self, name=intern(name),
dtype=dtype, shape=shape,
dim_tags=dim_tags, offset=offset, dim_names=dim_names,
@@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase):
base_storage=base_storage,
initializer=initializer,
read_only=read_only,
+ _base_storage_access_may_be_aliasing=(
+ _base_storage_access_may_be_aliasing),
**kwargs)
@property
@@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase):
and (
(self.initializer is None and other.initializer is None)
or np.array_equal(self.initializer, other.initializer))
- and self.read_only == other.read_only)
+ and self.read_only == other.read_only
+ and (self._base_storage_access_may_be_aliasing
+ == other._base_storage_access_may_be_aliasing)
+ )
def update_persistent_hash(self, key_hash, key_builder):
"""Custom hash computation function for use with
@@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase):
self.update_persistent_hash_for_shape(key_hash, key_builder,
self.storage_shape)
key_builder.rec(key_hash, self.base_indices)
+ key_builder.rec(key_hash, self.scope)
+ key_builder.rec(key_hash, self.base_storage)
initializer = self.initializer
if initializer is not None:
@@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase):
key_builder.rec(key_hash, initializer)
key_builder.rec(key_hash, self.read_only)
+ key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
# }}}
+def iname_tag_to_temp_var_scope(iname_tag):
+ iname_tag = parse_tag(iname_tag)
+
+ if isinstance(iname_tag, GroupIndexTag):
+ return temp_var_scope.GLOBAL
+ elif isinstance(iname_tag, LocalIndexTag):
+ return temp_var_scope.LOCAL
+ else:
+ return temp_var_scope.PRIVATE
+
+
# {{{ substitution rule
class SubstitutionRule(ImmutableRecord):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..02df0f2b4fd27dcb0f8b847411aa3dea7f3f9169 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -35,7 +35,7 @@ import islpy as isl
from islpy import dim_type
from loopy.diagnostic import LoopyError, warn_with_kernel
from pytools import memoize_on_first_arg
-
+from loopy.tools import natsorted
import logging
logger = logging.getLogger(__name__)
@@ -620,11 +620,11 @@ class DomainParameterFinder(object):
if dep.name in param_names:
from pymbolic.algorithm import solve_affine_equations_for
try:
- # friggin' overkill :)
+ # overkill :)
param_expr = solve_affine_equations_for(
[dep.name], [(shape_i, var("shape_i"))]
)[dep.name]
- except:
+ except Exception:
# went wrong? oh well
pass
else:
@@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name):
if n_axes == 1:
# Leave shape undetermined--we can live with that for 1D.
- shape = (None,)
+ shape = None
else:
raise LoopyError("cannot determine access range for '%s': "
"undetermined index in subscript(s) '%s'"
@@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name):
kernel.cache_manager.dim_max(
armap.access_range, i) + 1,
constants_only=False)))
- except:
+ except Exception:
print("While trying to find shape axis %d of "
"variable '%s', the following "
"exception occurred:" % (i, var_name),
@@ -1371,7 +1371,167 @@ def draw_dependencies_as_unicode_arrows(
conform_to_uniform_length(extender))
for row, extender in rows]
- return rows
+ return uniform_length, rows
+
+# }}}
+
+
+# {{{ stringify_instruction_list
+
+def stringify_instruction_list(kernel):
+ # {{{ topological sort
+
+ printed_insn_ids = set()
+ printed_insn_order = []
+
+ def insert_insn_into_order(insn):
+ if insn.id in printed_insn_ids:
+ return
+ printed_insn_ids.add(insn.id)
+
+ for dep_id in natsorted(insn.depends_on):
+ insert_insn_into_order(kernel.id_to_insn[dep_id])
+
+ printed_insn_order.append(insn)
+
+ for insn in kernel.instructions:
+ insert_insn_into_order(insn)
+
+ # }}}
+
+ import loopy as lp
+
+ Fore = kernel.options._fore # noqa
+ Style = kernel.options._style # noqa
+
+ uniform_arrow_length, arrows_and_extenders = \
+ draw_dependencies_as_unicode_arrows(
+ printed_insn_order, fore=Fore, style=Style)
+
+ leader = " " * uniform_arrow_length
+ lines = []
+ current_inames = [set()]
+
+ if uniform_arrow_length:
+ indent_level = [1]
+ else:
+ indent_level = [0]
+
+ indent_increment = 2
+
+ iname_order = kernel._get_iname_order_for_printing()
+
+ def add_pre_line(s):
+ lines.append(leader + " " * indent_level[0] + s)
+
+ def add_main_line(s):
+ lines.append(arrows + " " * indent_level[0] + s)
+
+ def add_post_line(s):
+ lines.append(extender + " " * indent_level[0] + s)
+
+ def adapt_to_new_inames_list(new_inames):
+ added = []
+ removed = []
+
+ # FIXME: Doesn't respect strict nesting
+ for iname in iname_order:
+ is_in_current = iname in current_inames[0]
+ is_in_new = iname in new_inames
+
+ if is_in_new == is_in_current:
+ pass
+ elif is_in_new and not is_in_current:
+ added.append(iname)
+ elif not is_in_new and is_in_current:
+ removed.append(iname)
+ else:
+ assert False
+
+ if removed:
+ indent_level[0] -= indent_increment * len(removed)
+ add_pre_line("end " + ", ".join(removed))
+ if added:
+ add_pre_line("for " + ", ".join(added))
+ indent_level[0] += indent_increment * len(added)
+
+ current_inames[0] = new_inames
+
+ for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders):
+ if isinstance(insn, lp.MultiAssignmentBase):
+ lhs = ", ".join(str(a) for a in insn.assignees)
+ rhs = str(insn.expression)
+ trailing = []
+ elif isinstance(insn, lp.CInstruction):
+ lhs = ", ".join(str(a) for a in insn.assignees)
+ rhs = "CODE(%s|%s)" % (
+ ", ".join(str(x) for x in insn.read_variables),
+ ", ".join("%s=%s" % (name, expr)
+ for name, expr in insn.iname_exprs))
+
+ trailing = [l for l in insn.code.split("\n")]
+ elif isinstance(insn, lp.BarrierInstruction):
+ lhs = ""
+ rhs = "... %sbarrier" % insn.kind[0]
+ trailing = []
+
+ elif isinstance(insn, lp.NoOpInstruction):
+ lhs = ""
+ rhs = "... nop"
+ trailing = []
+
+ else:
+ raise LoopyError("unexpected instruction type: %s"
+ % type(insn).__name__)
+
+ adapt_to_new_inames_list(kernel.insn_inames(insn))
+
+ options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL]
+ if insn.priority:
+ options.append("priority=%d" % insn.priority)
+ if insn.tags:
+ options.append("tags=%s" % ":".join(insn.tags))
+ if isinstance(insn, lp.Assignment) and insn.atomicity:
+ options.append("atomic=%s" % ":".join(
+ str(a) for a in insn.atomicity))
+ if insn.groups:
+ options.append("groups=%s" % ":".join(insn.groups))
+ if insn.conflicts_with_groups:
+ options.append(
+ "conflicts=%s" % ":".join(insn.conflicts_with_groups))
+ if insn.no_sync_with:
+ options.append("no_sync_with=%s" % ":".join(
+ "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
+
+ if lhs:
+ core = "%s = %s" % (
+ Fore.CYAN+lhs+Style.RESET_ALL,
+ Fore.MAGENTA+rhs+Style.RESET_ALL,
+ )
+ else:
+ core = Fore.MAGENTA+rhs+Style.RESET_ALL
+
+ options_str = " {%s}" % ", ".join(options)
+
+ if insn.predicates:
+ # FIXME: precedence
+ add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates]))
+ indent_level[0] += indent_increment
+
+ add_main_line(core + options_str)
+
+ for t in trailing:
+ add_post_line(t)
+
+ if insn.predicates:
+ indent_level[0] -= indent_increment
+ add_post_line("end")
+
+ leader = extender
+
+ adapt_to_new_inames_list([])
+
+ return lines
# }}}
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 541b44f58c5b02e9beba15211cb861fd09f14096..ac7ac19887388649670154fcd36eba79ba3b4315 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -30,7 +30,7 @@ from loopy.diagnostic import (
import islpy as isl
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames):
from loopy.kernel.data import (
LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
- ParallelTag)
+ ConcurrentTag)
for iname in inames:
iname_tag = kernel.iname_to_tag.get(iname)
@@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames):
elif isinstance(iname_tag, LocalIndexTagBase):
local_par.append(iname)
- elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+ elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)):
nonlocal_par.append(iname)
else:
@@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
if len(coeffs) == 0:
try:
scan_iname_aff.get_constant_val()
- except:
+ except Exception:
raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
# If this point is reached we're assuming the domain is of the form
@@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
nresults=nresults,
depends_on=insn.depends_on,
within_inames=insn.within_inames | expr.inames,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
newly_generated_insn_id_set.add(get_args_insn_id)
@@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
return updated_inner_exprs
def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
- within_inames_is_final):
+ within_inames_is_final, predicates):
# FIXME: use make_temporaries
from pymbolic.primitives import Call
from loopy.symbolic import Reduction
@@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=expr,
depends_on=depends_on,
within_inames=within_inames,
- within_inames_is_final=within_inames_is_final)
+ within_inames_is_final=within_inames_is_final,
+ predicates=predicates)
generated_insns.append(call_insn)
@@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=outer_insn_inames - frozenset(expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=init_insn_depends_on,
- expression=expr.operation.neutral_element(*arg_dtypes))
+ expression=expr.operation.neutral_element(*arg_dtypes),
+ predicates=insn.predicates,)
generated_insns.append(init_insn)
@@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
nresults=nresults,
depends_on=insn.depends_on,
within_inames=update_insn_iname_deps,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
reduction_insn_depends_on.add(get_args_insn_id)
else:
@@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
reduction_expr),
depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
within_inames=update_insn_iname_deps,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,)
generated_insns.append(reduction_insn)
@@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset())
+ depends_on=frozenset(),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname))
@@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset())
+ depends_on=frozenset(),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_neutral_insn)
transfer_depends_on = set([init_neutral_id, init_id])
@@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=(
(outer_insn_inames - frozenset(expr.inames))
| frozenset([red_iname])),
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
transfer_depends_on.add(get_args_insn_id)
else:
@@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
| frozenset([red_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
- no_sync_with=frozenset([(init_id, "any")]))
+ no_sync_with=frozenset([(init_id, "any")]),
+ predicates=insn.predicates,
+ )
generated_insns.append(transfer_insn)
cur_size = 1
@@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
)
generated_insns.append(stage_insn)
@@ -1398,7 +1414,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
(sweep_iname,) + expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=init_insn_depends_on,
- expression=expr.operation.neutral_element(*arg_dtypes))
+ expression=expr.operation.neutral_element(*arg_dtypes),
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
@@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
depends_on=frozenset(update_insn_depends_on),
within_inames=update_insn_iname_deps,
no_sync_with=insn.no_sync_with,
- within_inames_is_final=insn.within_inames_is_final)
+ within_inames_is_final=insn.within_inames_is_final,
+ predicates=insn.predicates,
+ )
generated_insns.append(scan_insn)
@@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
expression=neutral,
within_inames=base_iname_deps | frozenset([base_exec_iname]),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=init_insn_depends_on)
+ depends_on=init_insn_depends_on,
+ predicates=insn.predicates,
+ )
generated_insns.append(init_insn)
transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
@@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=outer_insn_inames - frozenset(expr.inames),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset(transfer_insn_depends_on),
- no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+ no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
+ predicates=insn.predicates,
+ )
generated_insns.append(transfer_insn)
@@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
within_inames=(
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
- depends_on=frozenset([prev_id]))
+ depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
+ )
if cur_size == 1:
# Performance hack: don't add a barrier here with transfer_insn.
@@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
base_iname_deps | frozenset([stage_exec_iname])),
within_inames_is_final=insn.within_inames_is_final,
depends_on=frozenset([prev_id]),
+ predicates=insn.predicates,
)
generated_insns.append(write_stage_insn)
@@ -2020,7 +2047,8 @@ def limit_boostability(kernel):
# }}}
-preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
+preprocess_cache = WriteOncePersistentDict(
+ "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None):
# }}}
if CACHING_ENABLED:
- preprocess_cache[input_kernel] = kernel
+ preprocess_cache.store_if_not_present(input_kernel, kernel)
return kernel
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index d28e7b1b3def2b988b4624aed9caf8f65c70b2c5..abf4d799fbdb14f86fa29dde26e6654130fc66de 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -29,7 +29,7 @@ import sys
import islpy as isl
from loopy.diagnostic import warn_with_kernel, LoopyError # noqa
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder
from loopy.version import DATA_MODEL_VERSION
@@ -206,13 +206,13 @@ def find_loop_nest_with_map(kernel):
"""
result = {}
- from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
all_nonpar_inames = set([
iname
for iname in kernel.all_inames()
if not isinstance(kernel.iname_to_tag.get(iname),
- (ParallelTag, IlpBaseTag, VectorizeTag))])
+ (ConcurrentTag, IlpBaseTag, VectorizeTag))])
iname_to_insns = kernel.iname_to_insns()
@@ -274,10 +274,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
result = {}
- from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+ from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
for insn in kernel.instructions:
for iname in kernel.insn_inames(insn):
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag):
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag):
continue
iname_dep = result.setdefault(iname, set())
@@ -308,7 +308,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
continue
tag = kernel.iname_to_tag.get(dep_insn_iname)
- if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)):
+ if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)):
# Parallel tags don't really nest, so we'll disregard
# them here.
continue
@@ -431,10 +431,10 @@ def format_insn(kernel, insn_id):
from loopy.kernel.instruction import (
MultiAssignmentBase, NoOpInstruction, BarrierInstruction)
if isinstance(insn, MultiAssignmentBase):
- return "[%s] %s%s%s <- %s%s%s" % (
- format_insn_id(kernel, insn_id),
+ return "%s%s%s = %s%s%s {id=%s}" % (
Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL,
- Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
+ Fore.MAGENTA, str(insn.expression), Style.RESET_ALL,
+ format_insn_id(kernel, insn_id))
elif isinstance(insn, BarrierInstruction):
return "[%s] %s... %sbarrier%s" % (
format_insn_id(kernel, insn_id),
@@ -456,11 +456,11 @@ def dump_schedule(kernel, schedule):
from loopy.kernel.data import MultiAssignmentBase
for sched_item in schedule:
if isinstance(sched_item, EnterLoop):
- lines.append(indent + "FOR %s" % sched_item.iname)
+ lines.append(indent + "for %s" % sched_item.iname)
indent += " "
elif isinstance(sched_item, LeaveLoop):
indent = indent[:-4]
- lines.append(indent + "END %s" % sched_item.iname)
+ lines.append(indent + "end %s" % sched_item.iname)
elif isinstance(sched_item, CallKernel):
lines.append(indent +
"CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % (
@@ -479,7 +479,7 @@ def dump_schedule(kernel, schedule):
insn_str = sched_item.insn_id
lines.append(indent + insn_str)
elif isinstance(sched_item, Barrier):
- lines.append(indent + "---BARRIER:%s---" % sched_item.kind)
+ lines.append(indent + "... %sbarrier" % sched_item.kind[0])
else:
assert False
@@ -1787,7 +1787,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
for item in preschedule
for insn_id in sched_item_to_insn_id(item))
- from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag
+ from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag
ilp_inames = set(
iname
for iname in kernel.all_inames()
@@ -1798,7 +1798,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag))
parallel_inames = set(
iname for iname in kernel.all_inames()
- if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+ if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
loop_nest_with_map = find_loop_nest_with_map(kernel)
loop_nest_around_map = find_loop_nest_around_map(kernel)
@@ -1940,7 +1940,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
# }}}
-schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
+schedule_cache = WriteOncePersistentDict(
+ "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -1971,7 +1972,7 @@ def get_one_scheduled_kernel(kernel):
kernel.name, time()-start_time))
if CACHING_ENABLED and not from_cache:
- schedule_cache[sched_cache_key] = result
+ schedule_cache.store_if_not_present(sched_cache_key, result)
return result
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 233cfe5e881ef594ebabc536ab8c7b3d18d5cf17..88d7ec328345fd4c97d75b449385316f99c2509d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1000,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial):
def count(kernel, set, space=None):
try:
+ if space is not None:
+ set = set.align_params(space)
+
return add_assumptions_guard(kernel, set.card())
except AttributeError:
pass
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index f1a494f30d469511817d204c0476ff79abe00e3b..543c2743bb98b09b706c2fdbf9188ed0a85d97f2 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1232,6 +1232,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
super(PwAffEvaluationMapper, self).__init__(context)
def map_constant(self, expr):
+ if isinstance(expr, np.integer):
+ expr = int(expr)
+
return self.pw_zero + expr
def map_min(self, expr):
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 7e307ef8bdd4d89e24b26dbacf39733ab3350307..5800a0236e8ae5f81a63942c31a74822bc2fab96 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -211,6 +211,10 @@ class ASTBuilderBase(object):
static_lbound, static_ubound, inner):
raise NotImplementedError()
+ @property
+ def can_implement_conditionals(self):
+ return False
+
def emit_if(self, condition_str, ast):
raise NotImplementedError()
@@ -275,28 +279,6 @@ class DummyHostASTBuilder(ASTBuilderBase):
def ast_block_scope_class(self):
return _DummyASTBlock
- def emit_assignment(self, codegen_state, insn):
- return None
-
- def emit_multiple_assignment(self, codegen_state, insn):
- return None
-
- def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
- static_lbound, static_ubound, inner):
- return None
-
- def emit_if(self, condition_str, ast):
- return None
-
- def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
- return None
-
- def emit_blank_line(self):
- return None
-
- def emit_comment(self, s):
- return None
-
# }}}
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index a2ad682505bbdb7ed5977a28e201ebc6655c7784..e54ac0f693c4704c13b8c435e4bc7acaac1b1a47 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -307,6 +307,12 @@ class _ConstRestrictPointer(Pointer):
return sub_tp, ("*const __restrict__ %s" % sub_decl)
+class _ConstPointer(Pointer):
+ def get_decl_pait(self):
+ sub_tp, sub_decl = self.subdecl.get_decl_pair()
+ return sub_tp, ("*const %s" % sub_decl)
+
+
class CASTBuilder(ASTBuilderBase):
# {{{ library
@@ -462,13 +468,17 @@ class CASTBuilder(ASTBuilderBase):
temp_var_decl = self.wrap_temporary_decl(
temp_var_decl, tv.scope)
- # The 'restrict' part of this is a complete lie--of course
- # all these temporaries are aliased. But we're promising to
- # not use them to shovel data from one representation to the
- # other. That counts, right?
+ if tv._base_storage_access_may_be_aliasing:
+ ptrtype = _ConstPointer
+ else:
+ # The 'restrict' part of this is a complete lie--of course
+ # all these temporaries are aliased. But we're promising to
+ # not use them to shovel data from one representation to the
+ # other. That counts, right?
+ ptrtype = _ConstRestrictPointer
- cast_decl = _ConstRestrictPointer(cast_decl)
- temp_var_decl = _ConstRestrictPointer(temp_var_decl)
+ cast_decl = ptrtype(cast_decl)
+ temp_var_decl = ptrtype(temp_var_decl)
cast_tp, cast_d = cast_decl.get_decl_pair()
temp_var_decl = Initializer(
@@ -789,6 +799,10 @@ class CASTBuilder(ASTBuilderBase):
from cgen import Comment
return Comment(s)
+ @property
+ def can_implement_conditionals(self):
+ return True
+
def emit_if(self, condition_str, ast):
from cgen import If
return If(condition_str, ast)
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 2da25ba39ceef38a4af105913973226bd3773729..975c691a74d0d17bdca39243f515c5d04284893d 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -328,7 +328,8 @@ def generate_arg_setup(gen, kernel, implemented_data_info, options):
# {{{ allocate written arrays, if needed
if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
- and arg.shape is not None:
+ and arg.shape is not None \
+ and all(si is not None for si in arg.shape):
if not isinstance(arg.dtype, NumpyType):
raise LoopyError("do not know how to pass arg of type '%s'"
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
from genpy import Comment
return Comment(s)
+ @property
+ def can_implement_conditionals(self):
+ return True
+
def emit_if(self, condition_str, ast):
from genpy import If
return If(condition_str, ast)
diff --git a/loopy/tools.py b/loopy/tools.py
index 1ebbe5c8a4fd2b68e3bfcf5ed493384599dac2c5..d6952d54782f113685299641c828907fb7f32a46 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -576,4 +576,19 @@ def intern_frozenset_of_ids(fs):
return frozenset(intern(s) for s in fs)
+def natorder(key):
+ # Return natural ordering for strings, as opposed to dictionary order.
+ # E.g. will result in
+ # 'abc1' < 'abc9' < 'abc10'
+ # rather than
+ # 'abc1' < 'abc10' < 'abc9'
+ # Based on
+ # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
+ import re
+ return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
+
+
+def natsorted(seq, key=lambda x: x):
+ return sorted(seq, key=lambda y: natorder(key(y)))
+
# vim: foldmethod=marker
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -239,14 +239,14 @@ class ArrayToBufferMap(object):
non1_storage_axis_flags = []
non1_storage_shape = []
- for saxis, bi, l in zip(
+ for saxis, bi, saxis_len in zip(
storage_axis_names, storage_base_indices, storage_shape):
- has_length_non1 = l != 1
+ has_length_non1 = saxis_len != 1
non1_storage_axis_flags.append(has_length_non1)
if has_length_non1:
- non1_storage_shape.append(l)
+ non1_storage_shape.append(saxis_len)
# }}}
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies,
RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
SubstitutionMapper)
from pymbolic.mapper.substitutor import make_subst_func
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
from loopy.version import DATA_MODEL_VERSION
from loopy.diagnostic import LoopyError
@@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
# }}}
-buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
+buffer_array_cache = WriteOncePersistentDict(
+ "loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
key_builder=LoopyKeyBuilder())
@@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
if CACHING_ENABLED:
from loopy.preprocess import prepare_for_caching
- buffer_array_cache[cache_key] = prepare_for_caching(kernel)
+ buffer_array_cache.store_if_not_present(
+ cache_key, prepare_for_caching(kernel))
return kernel
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper
class ExtraInameIndexInserter(IdentityMapper):
def __init__(self, var_to_new_inames):
self.var_to_new_inames = var_to_new_inames
+ self.seen_ilp_inames = set()
def map_subscript(self, expr):
try:
@@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper):
index = (index,)
index = tuple(self.rec(i) for i in index)
+ self.seen_ilp_inames.update(v.name for v in new_idx)
return expr.aggregate.index(index + new_idx)
def map_variable(self, expr):
@@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper):
except KeyError:
return expr
else:
+ self.seen_ilp_inames.update(v.name for v in new_idx)
return expr.index(new_idx)
@@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
# }}}
from pymbolic import var
- eiii = ExtraInameIndexInserter(
- dict((var_name, tuple(var(iname) for iname in inames))
- for var_name, inames in six.iteritems(var_to_new_ilp_inames)))
-
- new_insns = [
- insn.with_transformed_expressions(eiii)
- for insn in kernel.instructions]
+ var_to_extra_iname = dict(
+ (var_name, tuple(var(iname) for iname in inames))
+ for var_name, inames in six.iteritems(var_to_new_ilp_inames))
+
+ new_insns = []
+
+ for insn in kernel.instructions:
+ eiii = ExtraInameIndexInserter(var_to_extra_iname)
+ new_insn = insn.with_transformed_expressions(eiii)
+ if not eiii.seen_ilp_inames <= insn.within_inames:
+
+ from loopy.diagnostic import warn_with_kernel
+ warn_with_kernel(
+ kernel,
+ "implicit_ilp_iname",
+ "Instruction '%s': touched variable that (for ILP) "
+ "required iname(s) '%s', but that the instruction was not "
+ "previously within the iname(s). Previously, this would "
+ "implicitly promote the instruction, but that behavior is "
+ "deprecated and will stop working in 2018.1."
+ % (insn.id, ", ".join(
+ eiii.seen_ilp_inames - insn.within_inames)))
+
+ new_insns.append(new_insn)
return kernel.copy(
temporary_variables=new_temp_vars,
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag]
- from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase,
+ from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase,
ForceSequentialTag)
# {{{ globbing
@@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
if iname not in kernel.all_inames():
raise ValueError("cannot tag '%s'--not known" % iname)
- if isinstance(new_tag, ParallelTag) \
+ if isinstance(new_tag, ConcurrentTag) \
and isinstance(old_tag, ForceSequentialTag):
raise ValueError("cannot tag '%s' as parallel--"
"iname requires sequential execution" % iname)
if isinstance(new_tag, ForceSequentialTag) \
- and isinstance(old_tag, ParallelTag):
+ and isinstance(old_tag, ConcurrentTag):
raise ValueError("'%s' is already tagged as parallel, "
"but is now prohibited from being parallel "
"(likely because of participation in a precompute or "
@@ -972,9 +972,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
# Get the duplication options as a tuple of iname and a set
for iname, insns in _get_iname_duplication_options(insn_deps):
# Check whether this iname has a parallel tag and discard it if so
- from loopy.kernel.data import ParallelTag
+ from loopy.kernel.data import ConcurrentTag
if (iname in knl.iname_to_tag
- and isinstance(knl.iname_to_tag[iname], ParallelTag)):
+ and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
continue
# If we find a duplication option and fo not use boostable_into
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..2ba2338b0af541274cc0362c9f71cec9c2887ffc 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -402,13 +402,13 @@ class TemporarySaver(object):
continue
from loopy.kernel.data import (
- GroupIndexTag, LocalIndexTag, ParallelTag)
+ GroupIndexTag, LocalIndexTag, ConcurrentTag)
if isinstance(tag, GroupIndexTag):
my_group_tags.append(tag)
elif isinstance(tag, LocalIndexTag):
my_local_tags.append(tag)
- elif isinstance(tag, ParallelTag):
+ elif isinstance(tag, ConcurrentTag):
raise LoopyError(
"iname '%s' is tagged with '%s' - only "
"group and local tags are supported for "
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -1,6 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""
+import six
from loopy.symbolic import (
get_dependencies, SubstitutionMapper,
@@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
dfmapper = CallbackMapper(gather_exprs, WalkMapper())
for insn in kernel.instructions:
+ dfmapper(insn.assignees)
dfmapper(insn.expression)
for sr in six.itervalues(kernel.substitutions):
@@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
new_insns = []
for insn in kernel.instructions:
- new_expr = cbmapper(insn.expression)
- new_insns.append(insn.copy(expression=new_expr))
+ new_insns.append(insn.with_transformed_expressions(cbmapper))
from loopy.kernel.data import SubstitutionRule
new_substs = {
diff --git a/loopy/version.py b/loopy/version.py
index 3a9781748d00a0e453d4a56e374a25aa72ab4733..5e07e979f2d44684be00290328244496176337b3 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
else:
_islpy_version = islpy.version.VERSION_TEXT
-DATA_MODEL_VERSION = "v66-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v68-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 67d943af3be4446834bf7262a91b8596b601ca85..94843bf69e4e25677ccc0713e5f598e9dcfd55e2 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
],
install_requires=[
- "pytools>=2017.3",
+ "pytools>=2017.6",
"pymbolic>=2016.2",
"genpy>=2016.1.2",
"cgen>=2016.1",
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames):
logging.basicConfig(level=logging.INFO)
fortran_src = """
- subroutine dgemm(m,n,l,a,b,c)
+ subroutine dgemm(m,n,ell,a,b,c)
implicit none
- real*8 a(m,l),b(l,n),c(m,n)
- integer m,n,k,i,j,l
+ real*8 a(m,ell),b(ell,n),c(m,n)
+ integer m,n,k,i,j,ell
do j = 1,n
do i = 1,m
- do k = 1,l
+ do k = 1,ell
c(i,j) = c(i,j) + b(k,j)*a(i,k)
end do
end do
@@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames):
knl = lp.split_iname(knl, "k", 32)
knl = lp.assume(knl, "n mod 32 = 0")
knl = lp.assume(knl, "m mod 32 = 0")
- knl = lp.assume(knl, "l mod 16 = 0")
+ knl = lp.assume(knl, "ell mod 16 = 0")
knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames):
init_expression="0", store_expression="base+buffer")
ctx = ctx_factory()
- lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+ lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
@pytest.mark.xfail
@@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels():
def test_precompute_some_exist(ctx_factory):
fortran_src = """
- subroutine dgemm(m,n,l,a,b,c)
+ subroutine dgemm(m,n,ell,a,b,c)
implicit none
- real*8 a(m,l),b(l,n),c(m,n)
- integer m,n,k,i,j,l
+ real*8 a(m,ell),b(ell,n),c(m,n)
+ integer m,n,k,i,j,ell
do j = 1,n
do i = 1,m
- do k = 1,l
+ do k = 1,ell
c(i,j) = c(i,j) + b(k,j)*a(i,k)
end do
end do
@@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory):
knl = lp.split_iname(knl, "k", 8)
knl = lp.assume(knl, "n mod 8 = 0")
knl = lp.assume(knl, "m mod 8 = 0")
- knl = lp.assume(knl, "l mod 8 = 0")
+ knl = lp.assume(knl, "ell mod 8 = 0")
knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory):
ref_knl = knl
ctx = ctx_factory()
- lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+ lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
if __name__ == "__main__":
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory):
n = get_suitable_size(ctx)
m = n+12
- l = m+12
+ ell = m+12
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i gid = i/256
+ start = gid*256
+ for j
+ a[start + j] = a[start + j] + j
+ end
+ end
+ """,
+ seq_dependencies=True,
+ name="uniform_l",
+ target=PyOpenCLTarget(),
+ assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0]))
+
+ knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+ cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa
+
# }}}
@@ -2008,6 +2034,37 @@ def test_if_else(ctx_factory):
out_ref[4::6] = 11
out_ref[2::6] = 3
+ knl = lp.make_kernel(
+ "{ [i,j]: 0<=i,j<50}",
+ """
+ for i
+ if i < 25
+ for j
+ if j % 2 == 0
+ a[i, j] = 1
+ else
+ a[i, j] = 0
+ end
+ end
+ else
+ for j
+ if j % 2 == 0
+ a[i, j] = 0
+ else
+ a[i, j] = 1
+ end
+ end
+ end
+ end
+ """
+ )
+
+ evt, (out,) = knl(queue, out_host=True)
+
+ out_ref = np.zeros((50, 50))
+ out_ref[:25, 0::2] = 1
+ out_ref[25:, 1::2] = 1
+
assert np.array_equal(out_ref, out)
@@ -2180,11 +2237,12 @@ def test_nosync_option_parsing():
""",
options=lp.Options(allow_terminal_colors=False))
kernel_str = str(knl)
- assert "# insn1,no_sync_with=insn1@any" in kernel_str
- assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str
- assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
- assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa
- assert "# insn5,no_sync_with=insn1@any" in kernel_str
+ print(kernel_str)
+ assert "id=insn1, no_sync_with=insn1@any" in kernel_str
+ assert "id=insn2, no_sync_with=insn1@any:insn2@any" in kernel_str
+ assert "id=insn3, no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
+ assert "id=insn4, no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str # noqa
+ assert "id=insn5, no_sync_with=insn1@any" in kernel_str
def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
@@ -2265,6 +2323,43 @@ def test_barrier_insertion_near_bottom_of_loop():
assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
+def test_barrier_in_overridden_get_grid_size_expanded_kernel():
+ from loopy.kernel.data import temp_var_scope as scopes
+
+ # make simple barrier'd kernel
+ knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+ """
+ for i
+ a[i] = i {id=a}
+ ... lbarrier {id=barrier}
+ b[i + 1] = a[i] {nosync=a}
+ end
+ """,
+ [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C',
+ scope=scopes.LOCAL),
+ lp.GlobalArg("b", np.float32, shape=(11,), order='C')],
+ seq_dependencies=True)
+
+ # split into kernel w/ vesize larger than iname domain
+ vecsize = 16
+ knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
+
+ # artifically expand via overridden_get_grid_sizes_for_insn_ids
+ class GridOverride(object):
+ def __init__(self, clean, vecsize=vecsize):
+ self.clean = clean
+ self.vecsize = vecsize
+
+ def __call__(self, insn_ids, ignore_auto=True):
+ gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+ return gsize, (self.vecsize,)
+
+ knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
+ knl.copy(), vecsize))
+ # make sure we can generate the code
+ lp.generate_code_v2(knl)
+
+
def test_multi_argument_reduction_type_inference():
from loopy.type_inference import TypeInferenceMapper
from loopy.library.reduction import SegmentedSumReductionOperation
@@ -2451,6 +2546,167 @@ def test_execution_backend_can_cache_dtypes(ctx_factory):
knl(queue)
+def test_preamble_with_separate_temporaries(ctx_factory):
+ from loopy.kernel.data import temp_var_scope as scopes
+ # create a function mangler
+
+ func_name = 'indirect'
+ func_arg_dtypes = (np.int32, np.int32, np.int32)
+ func_result_dtypes = (np.int32,)
+
+ def __indirectmangler(kernel, name, arg_dtypes):
+ """
+ A function that will return a :class:`loopy.kernel.data.CallMangleInfo`
+ to interface with the calling :class:`loopy.LoopKernel`
+ """
+ if name != func_name:
+ return None
+
+ from loopy.types import to_loopy_type
+ from loopy.kernel.data import CallMangleInfo
+
+ def __compare(d1, d2):
+ # compare dtypes ignoring atomic
+ return to_loopy_type(d1, for_atomic=True) == \
+ to_loopy_type(d2, for_atomic=True)
+
+ # check types
+ if len(arg_dtypes) != len(arg_dtypes):
+ raise Exception('Unexpected number of arguments provided to mangler '
+ '{}, expected {}, got {}'.format(
+ func_name, len(func_arg_dtypes), len(arg_dtypes)))
+
+ for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)):
+ if not __compare(d1, d2):
+ raise Exception('Argument at index {} for mangler {} does not '
+ 'match expected dtype. Expected {}, got {}'.
+ format(i, func_name, str(d1), str(d2)))
+
+ # get target for creation
+ target = arg_dtypes[0].target
+ return CallMangleInfo(
+ target_name=func_name,
+ result_dtypes=tuple(to_loopy_type(x, target=target) for x in
+ func_result_dtypes),
+ arg_dtypes=arg_dtypes)
+
+ # create the preamble generator
+ def create_preamble(arr):
+ def __indirectpreamble(preamble_info):
+ # find a function matching our name
+ func_match = next(
+ (x for x in preamble_info.seen_functions
+ if x.name == func_name), None)
+ desc = 'custom_funcs_indirect'
+ if func_match is not None:
+ from loopy.types import to_loopy_type
+ # check types
+ if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \
+ func_match.arg_dtypes:
+ # if match, create our temporary
+ var = lp.TemporaryVariable(
+ 'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape,
+ scope=scopes.GLOBAL, read_only=True)
+ # and code
+ code = """
+ int {name}(int start, int end, int match)
+ {{
+ int result = start;
+ for (int i = start + 1; i < end; ++i)
+ {{
+ if (lookup[i] == match)
+ result = i;
+ }}
+ return result;
+ }}
+ """.format(name=func_name)
+
+ # generate temporary variable code
+ from cgen import Initializer
+ from loopy.target.c import generate_array_literal
+ codegen_state = preamble_info.codegen_state.copy(
+ is_generating_device_code=True)
+ kernel = preamble_info.kernel
+ ast_builder = codegen_state.ast_builder
+ target = kernel.target
+ decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype)
+ decl = ast_builder.wrap_global_constant(
+ ast_builder.get_temporary_decl(
+ codegen_state, None, var,
+ decl_info))
+ if var.initializer is not None:
+ decl = Initializer(decl, generate_array_literal(
+ codegen_state, var, var.initializer))
+ # return generated code
+ yield (desc, '\n'.join([str(decl), code]))
+ return __indirectpreamble
+
+ # and finally create a test
+ n = 10
+ # for each entry come up with a random number of data points
+ num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32)
+ # turn into offsets
+ offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32)
+ # create lookup data
+ lookup = np.empty(0)
+ for i in num_data:
+ lookup = np.hstack((lookup, np.arange(i)))
+ lookup = np.asarray(lookup, dtype=np.int32)
+ # and create data array
+ data = np.random.rand(np.product(num_data))
+
+ # make kernel
+ kernel = lp.make_kernel('{[i]: 0 <= i < n}',
+ """
+ for i
+ <>ind = indirect(offsets[i], offsets[i + 1], 1)
+ out[i] = data[ind]
+ end
+ """,
+ [lp.GlobalArg('out', shape=('n',)),
+ lp.TemporaryVariable(
+ 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL,
+ read_only=True),
+ lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)],
+ )
+ # fixt params, and add manglers / preamble
+ kernel = lp.fix_parameters(kernel, **{'n': n})
+ kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)])
+ kernel = lp.register_function_manglers(kernel, [__indirectmangler])
+
+ print(lp.generate_code(kernel)[0])
+ # and call (functionality unimportant, more that it compiles)
+ ctx = cl.create_some_context()
+ queue = cl.CommandQueue(ctx)
+ # check that it actually performs the lookup correctly
+ assert np.allclose(kernel(
+ queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1])
+
+
+def test_add_prefetch_works_in_lhs_index():
+ knl = lp.make_kernel(
+ "{ [n,k,l,k1,l1,k2,l2]: "
+ "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l]
+ a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1
+ a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2]
+ end
+ """,
+ [
+ lp.GlobalArg("a1,a1_out", None, "ndofs,2"),
+ lp.GlobalArg("a1_map", None, "nelements,3"),
+ "..."
+ ])
+
+ knl = lp.add_prefetch(knl, "a1_map", "k")
+
+ from loopy.symbolic import get_dependencies
+ for insn in knl.instructions:
+ assert "a1_map" not in get_dependencies(insn.assignees)
+
+
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_reduction.py b/test/test_reduction.py
index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory):
"{[j]: 0<=j sumlen = l[i]",
+ "<> sumlen = ell[i]",
"a[i] = sum(j, j)",
],
[
lp.ValueArg("n", np.int32),
lp.GlobalArg("a", dtype, ("n",)),
- lp.GlobalArg("l", np.int32, ("n",)),
+ lp.GlobalArg("ell", np.int32, ("n",)),
])
cknl = lp.CompiledKernel(ctx, knl)
n = 330
- l = np.arange(n, dtype=np.int32)
- evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+ ell = np.arange(n, dtype=np.int32)
+ evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True)
- tgt_result = (2*l-1)*2*l/2
+ tgt_result = (2*ell-1)*2*ell/2
assert (a == tgt_result).all()
@@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory):
assert max_index == np.argmax(np.abs(a))
+def test_reduction_with_conditional():
+ # Test whether realization of a reduction inherits predicates
+ # of the original instruction. Tested with the CTarget, because
+ # the PyOpenCL target will hoist the conditional into the host
+ # code in this minimal example.
+ knl = lp.make_kernel(
+ "{ [i] : 0<=i<42 }",
+ """
+ if n > 0
+ <>b = sum(i, a[i])
+ end
+ """,
+ [lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
+ lp.GlobalArg("n", dtype=np.float32, shape=())],
+ target=lp.CTarget())
+ code = lp.generate_body(knl)
+
+ # Check that the if appears before the loop that realizes the reduction.
+ assert code.index("if") < code.index("for")
+
+
if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index cf86539efec7be7e85fecfadc3b19d26fac7bb6d..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -37,14 +37,14 @@ from pymbolic.primitives import Variable
def test_op_counter_basic():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+ e[i,k] = if(
+ not(k6 or k/2==ell,
+ g[i,k]*2,
+ g[i,k]+h[i,k]/2)
"""
],
- name="logic", assumptions="n,m,l >= 1")
+ name="logic", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
op_map = lp.get_op_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
@@ -118,14 +121,14 @@ def test_op_counter_logic():
def test_op_counter_specialops():
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i> k))
"""
],
- name="bitwise", assumptions="n,m,l >= 1")
+ name="bitwise", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(
knl, dict(
@@ -169,16 +172,16 @@ def test_op_counter_bitwise():
op_map = lp.get_op_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
- assert i32add == n*m+n*m*l
- assert i32bw == 2*n*m*l
+ assert i32add == n*m+n*m*ell
+ assert i32bw == 2*n*m*ell
assert i64bw == 2*n*m
assert i64add == i64mul == n*m
assert i64shift == 2*n*m
@@ -218,22 +221,22 @@ def test_op_counter_triangular_domain():
def test_mem_access_counter_basic():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+ e[i,k] = if(not(k6 or k/2==ell,
+ g[i,k]*2,
+ g[i,k]+h[i,k]/2)
"""
],
- name="logic", assumptions="n,m,l >= 1")
+ name="logic", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -332,22 +337,22 @@ def test_mem_access_counter_logic():
def test_mem_access_counter_specialops():
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i> k))
"""
],
- name="bitwise", assumptions="n,m,l >= 1")
+ name="bitwise", assumptions="n,m,ell >= 1")
knl = lp.add_and_infer_dtypes(
knl, dict(
@@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise():
mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
i32 = mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='load', variable='a')
].eval_with_dict(params)
@@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise():
i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
stride=0, direction='load', variable='h')
].eval_with_dict(params)
- assert i32 == 4*n*m+2*n*m*l
+ assert i32 == 4*n*m+2*n*m*ell
i32 = mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='store', variable='c')
@@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise():
i32 += mem_map[lp.MemAccess('global', np.int32,
stride=0, direction='store', variable='e')
].eval_with_dict(params)
- assert i32 == n*m+n*m*l
+ assert i32 == n*m+n*m*ell
def test_mem_access_counter_mixed():
knl = lp.make_kernel(
- "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
+ "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
[
"""
c[i,j,k] = 2*a[i,j,k] {id=first}
@@ -620,8 +625,8 @@ def test_barrier_counter_barriers():
print(sync_map)
n = 512
m = 256
- l = 128
- params = {'n': n, 'm': m, 'l': l}
+ ell = 128
+ params = {'n': n, 'm': m, 'ell': ell}
barrier_count = sync_map["barrier_local"].eval_with_dict(params)
assert barrier_count == 50*10*2
@@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul():
bsize = 16
knl = lp.make_kernel(
- "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i