diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c76590f62d4383d611e8afb40fdaddf21b148f07..4d4f7e52891a0cb8477672fbaf14b29c067de0f9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,6 +4,7 @@ Python 2.7 AMD CPU: - export PYOPENCL_TEST=amd:pu - export EXTRA_INSTALL="numpy mako" - export LOOPY_NO_CACHE=1 + - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -18,6 +19,7 @@ Python 2.6 POCL: - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako" - export LOOPY_NO_CACHE=1 + - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -31,7 +33,6 @@ Python 3.5 AMD CPU: - export PY_EXE=python3.5 - export PYOPENCL_TEST=amd:pu - export EXTRA_INSTALL="numpy mako" - - export NO_DOCTESTS=1 - export LOOPY_NO_CACHE=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" @@ -47,6 +48,7 @@ Python 2.7 POCL: - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako" - export LOOPY_NO_CACHE=1 + - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -62,6 +64,7 @@ Python 2.7 with legacy PyOpenCL: - export EXTRA_INSTALL="numpy mako" - export REQUIREMENTS_TXT="requirements-old-pyopencl.txt" - export LOOPY_NO_CACHE=1 + - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -77,7 +80,6 @@ Python 3.6 POCL: - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako" - export LOOPY_NO_CACHE=1 - - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". 
./build-and-test-py-project.sh" tags: @@ -91,7 +93,6 @@ Python 3.6 POCL Twice With Cache: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako" - - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" - "cd .." diff --git a/doc/misc.rst b/doc/misc.rst index cd6fe102cb9c97a619d8b6512f103c9dcabe65b5..2c9c9a92bbc903e6eadc458ea7a197fef54c65d4 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -90,7 +90,9 @@ regarding OpenCL drivers. User-visible Changes ==================== -Version 2017.2 +See also :ref:`language-versioning`. + +Version 2018.1 -------------- .. note:: diff --git a/doc/ref_creation.rst b/doc/ref_creation.rst index 92eff09c9e3ecacfd8bb9030a9e4b9f002fefc71..6b715033cce60fa3a369f2abc4edbecbf4c9a0d3 100644 --- a/doc/ref_creation.rst +++ b/doc/ref_creation.rst @@ -30,4 +30,6 @@ To Copy between Data Formats .. autofunction:: make_copy_kernel +.. automodule:: loopy.version + .. vim: tw=75:spell:fdm=marker diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7196dad863474d9b6ea9df9d9d0ae90b3e14986d..217e1ef7c323ca13f8a1aaf81e8ea30c08b784a7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -25,6 +25,7 @@ import a few modules and set up a :class:`pyopencl.Context` and a >>> import loopy as lp >>> lp.set_caching_enabled(False) + >>> from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 >>> from warnings import filterwarnings, catch_warnings >>> filterwarnings('error', category=lp.LoopyWarning) @@ -1157,7 +1158,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... 
- MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1200,7 +1201,7 @@ Here is what happens when we try to generate code for the kernel: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) + loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we @@ -1396,7 +1397,7 @@ Attempting to create this kernel results in an error: ... # While trying to find shape axis 0 of argument 'out', the following exception occurred: Traceback (most recent call last): ... - StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }' + loopy.diagnostic.StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }' The problem is that loopy cannot find a simple, universally valid expression for the length of *out* in this case. Notice how the kernel accesses both the @@ -1462,7 +1463,7 @@ sign that something is amiss: >>> evt, (out,) = knl(queue, a=a_mat_dev) Traceback (most recent call last): ... 
- WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable) + loopy.diagnostic.WriteRaceConditionWarning: in kernel transpose: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel attribute to disable) When we ask to see the code, the issue becomes apparent: @@ -1545,20 +1546,18 @@ containing different types of data: >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) -Note that loopy will infer the data types for arrays c and e from the +Note that loopy will infer the data types for arrays ``c`` and ``e`` from the information provided. Now we will count the operations: .. doctest:: >>> op_map = lp.get_op_map(knl) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - + Op(np:dtype('float32'), add) : ... 
+ +Each line of output will look roughly like:: + + Op(np:dtype('float32'), add) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A @@ -1615,15 +1614,18 @@ together into keys containing only the specified fields: >>> op_map_dtype = op_map.group_by('dtype') >>> print(lp.stringify_stats_mapping(op_map_dtype)) - Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } - Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float32'), None) : ... >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32) ... ].eval_with_dict(param_dict) >>> print(f32op_count) 1572864 +The lines of output above might look like:: + + Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 } + Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 } + See the reference page for :class:`loopy.ToCountMap` and :class:`loopy.Op` for more information on these functions. @@ -1638,13 +1640,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl) >>> print(lp.stringify_stats_mapping(mem_map)) + MemAccess(global, np:dtype('float32'), 0, load, a) : ... 
+ + +Each line of output will look roughly like:: + + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 } - :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1693,18 +1697,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, a) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... 
).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 } - MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 } + MemAccess(None, None, None, load, None) : ... + MemAccess(None, None, None, store, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1714,6 +1713,15 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: bytes loaded: 7340032 bytes stored: 2621440 +The lines of output above might look like:: + + MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1731,12 +1739,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ...
} - MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 1, load, a) : ... + MemAccess(global, np:dtype('float32'), 1, load, b) : ... + MemAccess(global, np:dtype('float32'), 1, store, c) : ... + MemAccess(global, np:dtype('float64'), 1, load, g) : ... + MemAccess(global, np:dtype('float64'), 1, load, h) : ... + MemAccess(global, np:dtype('float64'), 1, store, e) : ... With this parallelization, consecutive threads will access consecutive array @@ -1772,12 +1780,12 @@ switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... } - MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... } + MemAccess(global, np:dtype('float32'), 128, load, a) : ... + MemAccess(global, np:dtype('float32'), 128, load, b) : ... + MemAccess(global, np:dtype('float32'), 128, store, c) : ... + MemAccess(global, np:dtype('float64'), 128, load, g) : ... + MemAccess(global, np:dtype('float64'), 128, load, h) : ... + MemAccess(global, np:dtype('float64'), 128, store, e) : ... 
With this parallelization, consecutive threads will access *nonconsecutive* diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 7c5de5a1b1d7042498a12204959a59021ac5e0d8..6fa9b5fd30b350a07e2d1d27fa36c930c9afb892 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -2,6 +2,7 @@ import numpy as np import loopy as lp import pyopencl as cl import pyopencl.array +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # setup # ----- diff --git a/loopy/__init__.py b/loopy/__init__.py index 5e8a3fb06b733183fb03c09eb6126a3eee98b916..0f4697f92e3f779b5670147c0fe7936989a317c4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -65,6 +65,8 @@ from loopy.library.reduction import register_reduction_parser # {{{ import transforms +from loopy.version import VERSION, MOST_RECENT_LANGUAGE_VERSION + from loopy.transform.iname import ( set_loop_priority, prioritize_loops, split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames, @@ -171,6 +173,8 @@ __all__ = [ "register_reduction_parser", + "VERSION", "MOST_RECENT_LANGUAGE_VERSION", + # {{{ transforms "set_loop_priority", "prioritize_loops", diff --git a/loopy/check.py b/loopy/check.py index 7e661b566b15c47ec99e03ffdeb035057602da76..83f529206c6d1d4cb058673162c2285a5ad1356a 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -250,6 +250,8 @@ def check_for_data_dependent_parallel_bounds(kernel): % (i, par, ", ".join(par_inames))) +# {{{ check access bounds + class _AccessCheckMapper(WalkMapper): def __init__(self, kernel, domain, insn_id): self.kernel = kernel @@ -277,7 +279,8 @@ class _AccessCheckMapper(WalkMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - from loopy.symbolic import get_dependencies, get_access_range + from loopy.symbolic import (get_dependencies, get_access_range, + UnableToDetermineAccessRange) available_vars = set(self.domain.get_var_dict()) shape_deps = set() @@ -298,11 +301,8 @@ class 
_AccessCheckMapper(WalkMapper): try: access_range = get_access_range(self.domain, subscript, self.kernel.assumptions) - except isl.Error: - # Likely: index was non-linear, nothing we can do. - return - except TypeError: - # Likely: index was non-linear, nothing we can do. + except UnableToDetermineAccessRange: + # Likely: index was non-affine, nothing we can do. return shape_domain = isl.BasicSet.universe(access_range.get_space()) @@ -340,6 +340,10 @@ def check_bounds(kernel): insn.with_transformed_expressions(run_acm) +# }}} + + +# {{{ check write destinations def check_write_destinations(kernel): for insn in kernel.instructions: @@ -363,6 +367,10 @@ def check_write_destinations(kernel): or wvar in kernel.arg_dict) and wvar not in kernel.all_params(): raise LoopyError +# }}} + + +# {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): from loopy.transform.iname import (has_schedulable_iname_nesting, @@ -382,6 +390,196 @@ def check_has_schedulable_iname_nesting(kernel): # }}} +# {{{ check_variable_access_ordered + +class IndirectDependencyEdgeFinder(object): + def __init__(self, kernel): + self.kernel = kernel + self.dep_edge_cache = {} + + def __call__(self, depender_id, dependee_id): + cache_key = (depender_id, dependee_id) + + try: + return self.dep_edge_cache[cache_key] + except KeyError: + pass + + depender = self.kernel.id_to_insn[depender_id] + + if dependee_id in depender.depends_on: + self.dep_edge_cache[cache_key] = True + return True + + for dep in depender.depends_on: + if self(dep, dependee_id): + self.dep_edge_cache[cache_key] = True + return True + + return False + + +def declares_nosync_with(kernel, var_scope, dep_a, dep_b): + from loopy.kernel.data import temp_var_scope + if var_scope == temp_var_scope.GLOBAL: + search_scopes = ["global", "any"] + elif var_scope == temp_var_scope.LOCAL: + search_scopes = ["local", "any"] + elif var_scope == temp_var_scope.PRIVATE: + search_scopes = ["any"] + else: + raise 
ValueError("unexpected value of 'temp_var_scope'") + + ab_nosync = False + ba_nosync = False + + for scope in search_scopes: + if (dep_a.id, scope) in dep_b.no_sync_with: + ab_nosync = True + if (dep_b.id, scope) in dep_a.no_sync_with: + ba_nosync = True + + return ab_nosync and ba_nosync + + +def check_variable_access_ordered(kernel): + """Checks that between each write to a variable and all other accesses to + the variable there is either: + + * an (at least indirect) dependency edge, or + * an explicit statement that no ordering is necessary (expressed + through a bi-directional :attr:`loopy.Instruction.no_sync_with`) + """ + if kernel.options.enforce_variable_access_ordered not in [ + "no_check", + True, + False]: + raise LoopyError("invalid value for option " + "'enforce_variable_access_ordered': %s" + % kernel.options.enforce_variable_access_ordered) + + if kernel.options.enforce_variable_access_ordered == "no_check": + return + + logger.debug("%s: check_variable_access_ordered: start" % kernel.name) + + checked_variables = kernel.get_written_variables() & ( + set(kernel.temporary_variables) | set(arg for arg in kernel.arg_dict)) + + wmap = kernel.writer_map() + rmap = kernel.reader_map() + + from loopy.kernel.data import GlobalArg, ValueArg, temp_var_scope + from loopy.kernel.tools import find_aliasing_equivalence_classes + + depfind = IndirectDependencyEdgeFinder(kernel) + aliasing_equiv_classes = find_aliasing_equivalence_classes(kernel) + + for name in checked_variables: + # This is a tad redundant in that this could probably be restructured + # to iterate only over equivalence classes and not individual variables. + # But then the access-range overlap check below would have to be smarter.
+ eq_class = aliasing_equiv_classes[name] + + readers = set.union( + *[rmap.get(eq_name, set()) for eq_name in eq_class]) + writers = set.union( + *[wmap.get(eq_name, set()) for eq_name in eq_class]) + unaliased_readers = rmap.get(name, set()) + unaliased_writers = wmap.get(name, set()) + + if not writers: + continue + + if name in kernel.temporary_variables: + scope = kernel.temporary_variables[name].scope + else: + arg = kernel.arg_dict[name] + if isinstance(arg, GlobalArg): + scope = temp_var_scope.GLOBAL + elif isinstance(arg, ValueArg): + scope = temp_var_scope.PRIVATE + else: + # No need to consider ConstantArg and ImageArg (for now) + # because those won't be written. + raise ValueError("could not determine scope of '%s'" % name) + + # Check even for PRIVATE scope, to ensure intentional program order. + + from loopy.symbolic import do_access_ranges_overlap_conservative + + for writer_id in writers: + for other_id in readers | writers: + if writer_id == other_id: + continue + + writer = kernel.id_to_insn[writer_id] + other = kernel.id_to_insn[other_id] + + has_dependency_relationship = ( + declares_nosync_with(kernel, scope, other, writer) + or + depfind(writer_id, other_id) + or + depfind(other_id, writer_id) + ) + + if has_dependency_relationship: + continue + + is_relationship_by_aliasing = not ( + writer_id in unaliased_writers + and (other_id in unaliased_writers + or other_id in unaliased_readers)) + + # Do not enforce ordering for disjoint access ranges + if (not is_relationship_by_aliasing + and not do_access_ranges_overlap_conservative( + kernel, writer_id, "w", other_id, "any", + name)): + continue + + # Do not enforce ordering for aliasing-based relationships + # in different groups. 
+ if (is_relationship_by_aliasing and ( + bool(writer.groups & other.conflicts_with_groups) + or + bool(other.groups & writer.conflicts_with_groups))): + continue + + msg = ("No dependency relationship found between " + "'{writer_id}' which writes {var} and " + "'{other_id}' which also accesses {var}. " + "Either add a (possibly indirect) dependency " + "between the two, or add them to each others' nosync " + "set to indicate that no ordering is intended, or " + "turn off this check by setting the " + "'enforce_variable_access_ordered' option" + .format( + writer_id=writer_id, + other_id=other_id, + var=( + "the variable '%s'" % name + if len(eq_class) == 1 + else ( + "the aliasing equivalence class '%s'" + % ", ".join(eq_class)) + ))) + if kernel.options.enforce_variable_access_ordered: + from loopy.diagnostic import VariableAccessNotOrdered + raise VariableAccessNotOrdered(msg) + else: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel( + kernel, "variable_access_ordered", msg) + + logger.debug("%s: check_variable_access_ordered: done" % kernel.name) + +# }}} + +# }}} + + def pre_schedule_checks(kernel): try: logger.debug("%s: pre-schedule check: start" % kernel.name) @@ -397,6 +595,7 @@ def pre_schedule_checks(kernel): check_bounds(kernel) check_write_destinations(kernel) check_has_schedulable_iname_nesting(kernel) + check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 4868f70af81ae54972e7d81282b62798da233407..c2b78f4d78698998cecb1f082aae2ed433310aed 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -115,6 +115,10 @@ class LoopyTypeError(LoopyError): class ExpressionNotAffineError(LoopyError): pass + +class VariableAccessNotOrdered(LoopyError): + pass + # }}} diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 
e801d09dcf10750ce09af647e0b14f4641fa1fb2..bcbe41874d8613eaabd84ae71dd65317558f0185 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -708,6 +708,7 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} + from loopy.version import MOST_RECENT_LANGUAGE_VERSION knl = lp.make_kernel( sub.index_sets, sub.instructions, @@ -717,6 +718,7 @@ class F2LoopyTranslator(FTreeWalkerBase): index_dtype=self.index_dtype, target=self.target, seq_dependencies=seq_dependencies, + lang_version=MOST_RECENT_LANGUAGE_VERSION ) from loopy.loop import fuse_loop_domains diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4a08c28bd8091425293892384e01d20447413cd5..0daf327f441031662b46a4a83b4fc40e73eb5688 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1666,7 +1666,7 @@ def _is_wildcard(s): return any(c in s for c in WILDCARD_SYMBOLS) -def _resolve_dependencies(knl, insn, deps): +def _resolve_dependencies(what, knl, insn, deps): from loopy import find_instructions from loopy.match import MatchExpressionBase @@ -1692,10 +1692,11 @@ def _resolve_dependencies(knl, insn, deps): found_any = True if not found_any and knl.options.check_dep_resolution: - raise LoopyError("instruction '%s' declared a depency on '%s', " + raise LoopyError("instruction '%s' declared %s on '%s', " "which did not resolve to any instruction present in the " "kernel '%s'. Set the kernel option 'check_dep_resolution'" - "to False to disable this check." % (insn.id, dep, knl.name)) + "to False to disable this check." 
+ % (insn.id, what, dep, knl.name)) for dep_id in new_deps: if dep_id not in knl.id_to_insn: @@ -1710,13 +1711,14 @@ def resolve_dependencies(knl): for insn in knl.instructions: new_insns.append(insn.copy( - depends_on=_resolve_dependencies(knl, insn, insn.depends_on), - no_sync_with=frozenset( - (resolved_insn_id, nosync_scope) - for nosync_dep, nosync_scope in insn.no_sync_with - for resolved_insn_id in - _resolve_dependencies(knl, insn, (nosync_dep,))), - )) + depends_on=_resolve_dependencies( + "a dependency", knl, insn, insn.depends_on), + no_sync_with=frozenset( + (resolved_insn_id, nosync_scope) + for nosync_dep, nosync_scope in insn.no_sync_with + for resolved_insn_id in + _resolve_dependencies("nosync", knl, insn, (nosync_dep,))), + )) return knl.copy(instructions=new_insns) @@ -1909,6 +1911,30 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): will be fixed to *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. See also :func:`loopy.fix_parameters`. + :arg lang_version: The language version against which the kernel was + written, a tuple. To ensure future compatibility, copy the current value of + :data:`loopy.MOST_RECENT_LANGUAGE_VERSION` and pass that value. + + (If you just pass :data:`loopy.MOST_RECENT_LANGUAGE_VERSION` directly, + breaking language changes *will* apply to your kernel without asking, + likely breaking your code.) + + If not given, this value defaults to version **(2017, 2, 1)** and + a warning will be issued. + + To set the kernel version for all :mod:`loopy` kernels in a (Python) source + file, you may simply say:: + + from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 + + If *lang_version* is not explicitly given, that version value will be used. + + See also :ref:`language-versioning`. + + .. versionchanged:: 2017.2.1 + + *lang_version* added. + .. versionchanged:: 2017.2 *fixed_parameters* added. 
@@ -1953,6 +1979,56 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.options import make_options options = make_options(options) + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + + from loopy.version import LANGUAGE_VERSION_SYMBOLS + + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals + + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass + + # }}} + + import loopy.version + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) + + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." 
% lang_version) + + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if isinstance(silenced_warnings, str): silenced_warnings = silenced_warnings.split(";") diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d95408acc7f1a53f1f1a7616f7d6611249c796b..95001c78bb1f3ef0c6e823589075ddb6e3fbb506 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -91,7 +91,7 @@ class InstructionBase(ImmutableRecord): .. attribute:: no_sync_with - a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where + a :class:`frozenset` of tuples of the form ``(insn_id, scope)``, where `insn_id` refers to :attr:`id` of :class:`Instruction` instances and `scope` is one of the following strings: @@ -99,13 +99,20 @@ class InstructionBase(ImmutableRecord): - `"global"` - `"any"`. - This indicates no barrier synchronization is necessary with the given - instruction using barriers of type `scope`, even given the existence of - a dependency chain and apparently conflicting access. + An element ``(insn_id, scope)`` means "do not consider any variable + access conflicting for variables of ``scope`` between this instruction + and ``insn_id``". + Specifically, loopy will not complain even if it detects that accesses + potentially requiring ordering (e.g. by dependencies) exist, and it + will not emit barriers to guard any dependencies from this + instruction on ``insn_id`` that may exist. Note, that :attr:`no_sync_with` allows instruction matching through wildcards and match expression, just like :attr:`depends_on`. + This data is used specifically by barrier insertion and + :func:`loopy.check.enforce_variable_access_ordered`. + .. rubric:: Conditionals .. 
attribute:: predicates diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index fbc4238c21e966cb61d1c074ce6924fd9af26084..15db06ad78b448675b193a2d880ae5b50073e99d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1278,14 +1278,14 @@ def draw_dependencies_as_unicode_arrows( for insn in instructions: for dep in insn.depends_on: - reverse_deps.setdefault(dep, []).append(insn.id) + reverse_deps.setdefault(dep, set()).add(insn.id) # mapping of (from_id, to_id) tuples to column_index dep_to_column = {} # {{{ find column assignments - # mapping from column indices to (end_insn_id, updown) + # mapping from column indices to (end_insn_ids, pointed_at_insn_id) columns_in_use = {} n_columns = [0] @@ -1299,47 +1299,101 @@ def draw_dependencies_as_unicode_arrows( row.append(" ") return i - def do_flag_downward(s, updown): - if flag_downward and updown == "down": + def do_flag_downward(s, pointed_at_insn_id): + if flag_downward and pointed_at_insn_id not in processed_ids: return fore.RED+s+style.RESET_ALL else: return s def make_extender(): result = n_columns[0] * [" "] - for col, (_, updown) in six.iteritems(columns_in_use): - result[col] = do_flag_downward(u"│", updown) + for col, (_, pointed_at_insn_id) in six.iteritems(columns_in_use): + result[col] = do_flag_downward(u"│", pointed_at_insn_id) return result + processed_ids = set() + rows = [] for insn in instructions: row = make_extender() - for rdep in reverse_deps.get(insn.id, []): - assert rdep != insn.id + # {{{ add rdeps for already existing columns - dep_key = (rdep, insn.id) - if dep_key not in dep_to_column: - col = dep_to_column[dep_key] = find_free_column() - columns_in_use[col] = (rdep, "up") - row[col] = u"↱" + rdeps = reverse_deps.get(insn.id, set()).copy() - processed_ids + assert insn.id not in rdeps + + if insn.id in dep_to_column: + columns_in_use[insn.id][0].update(rdeps) + + # }}} + + # {{{ add deps for already existing columns + + for dep in insn.depends_on: + dep_key = dep + if 
dep_key in dep_to_column: + col = dep_to_column[dep] + columns_in_use[col][0].add(insn.id) + + # }}} + + for col, (starts, pointed_at_insn_id) in list(six.iteritems(columns_in_use)): + if insn.id == pointed_at_insn_id: + if starts: + # will continue downward + row[col] = do_flag_downward(u">", pointed_at_insn_id) + else: + # stops here + + # placeholder, pending deletion + columns_in_use[col] = None + + row[col] = do_flag_downward(u"↳", pointed_at_insn_id) + + elif insn.id in starts: + starts.remove(insn.id) + if starts: + # will continue downward + row[col] = do_flag_downward(u"├", pointed_at_insn_id) + + else: + # stops here + row[col] = u"└" + # placeholder, pending deletion + columns_in_use[col] = None + + # {{{ start arrows by reverse dep + + dep_key = insn.id + if dep_key not in dep_to_column and rdeps: + col = dep_to_column[dep_key] = find_free_column() + columns_in_use[col] = (rdeps, insn.id) + row[col] = u"↱" + + # }}} + + # {{{ start arrows by forward dep for dep in insn.depends_on: assert dep != insn.id - dep_key = (insn.id, dep) + dep_key = dep if dep_key not in dep_to_column: col = dep_to_column[dep_key] = find_free_column() - columns_in_use[col] = (dep, "down") - row[col] = do_flag_downward(u"┌", "down") + columns_in_use[col] = (set([insn.id]), dep) + row[col] = do_flag_downward(u"┌", dep) - for col, (end, updown) in list(six.iteritems(columns_in_use)): - if insn.id == end: + # }}} + + # {{{ delete columns_in_use entry for end-of-life columns + + for col, value in list(six.iteritems(columns_in_use)): + if value is None: del columns_in_use[col] - if updown == "up": - row[col] = u"└" - else: - row[col] = do_flag_downward(u"↳", updown) + + # }} + + processed_ids.add(insn.id) extender = make_extender() @@ -1731,4 +1785,84 @@ def get_subkernel_to_insn_id_map(kernel): # }}} +# {{{ find aliasing equivalence classes + +class DisjointSets(object): + """ + .. automethod:: __getitem__ + .. automethod:: find_leader_or_create_group + .. automethod:: union + .. 
automethod:: union_many + """ + + # https://en.wikipedia.org/wiki/Disjoint-set_data_structure + + def __init__(self): + self.leader_to_group = {} + self.element_to_leader = {} + + def __getitem__(self, item): + """ + :arg item: A representative of an equivalence class. + :returns: the equivalence class, given as a set of elements + """ + try: + leader = self.element_to_leader[item] + except KeyError: + return set([item]) + else: + return self.leader_to_group[leader] + + def find_leader_or_create_group(self, el): + try: + return self.element_to_leader[el] + except KeyError: + pass + + self.element_to_leader[el] = el + self.leader_to_group[el] = set([el]) + return el + + def union(self, a, b): + leader_a = self.find_leader_or_create_group(a) + leader_b = self.find_leader_or_create_group(b) + + if leader_a == leader_b: + return + + new_leader = leader_a + + for b_el in self.leader_to_group[leader_b]: + self.element_to_leader[b_el] = new_leader + + self.leader_to_group[leader_a].update(self.leader_to_group[leader_b]) + del self.leader_to_group[leader_b] + + def union_many(self, relation): + """ + :arg relation: an iterable of 2-tuples enumerating the elements of the + relation. The relation is assumed to be an equivalence relation + (transitive, reflexive, symmetric) but need not explicitly contain + all elements to make it that. + + The first elements of the tuples become group leaders. + + :returns: *self* + """ + + for a, b in relation: + self.union(a, b) + + return self + + +def find_aliasing_equivalence_classes(kernel): + return DisjointSets().union_many( + (tv.base_storage, tv.name) + for tv in six.itervalues(kernel.temporary_variables) + if tv.base_storage is not None) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/options.py b/loopy/options.py index 13d0b752dfcfa0f0da233880f27f09a963ab4c81..63089d94d3487e77a1def39a98fe24631c508398 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -162,6 +162,16 @@ class Options(ImmutableRecord): .. 
rubric:: Features .. attribute:: disable_global_barriers + + .. attribute:: enforce_variable_access_ordered + + If *True*, require that + :func:`loopy.check.check_variable_access_ordered` passes. + Required for language versions 2018.1 and above. This check + helps find and eliminate unintentionally unordered access + to variables. + + If equal to ``"no_check"``, then no check is performed. """ _legacy_options_map = { @@ -216,6 +226,9 @@ class Options(ImmutableRecord): disable_global_barriers=kwargs.get("disable_global_barriers", False), check_dep_resolution=kwargs.get("check_dep_resolution", True), + + enforce_variable_access_ordered=kwargs.get( + "enforce_variable_access_ordered", False), ) # {{{ legacy compatibility diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ad119e94e74b294e16cdc15c5ab1f723cf7f254b..5e36e51a190f9ba93a48aa8eaab5e34c20153e47 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1861,9 +1861,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # An expansion happened, so insert the generated stuff plus # ourselves back into the queue. 
+ result_assignment_dep_on = \ + insn.depends_on | frozenset(new_insn_add_depends_on) kwargs = insn.get_copy_kwargs( - depends_on=insn.depends_on - | frozenset(new_insn_add_depends_on), no_sync_with=insn.no_sync_with | frozenset(new_insn_add_no_sync_with), within_inames=( @@ -1871,6 +1871,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, | new_insn_add_within_inames)) kwargs.pop("id") + kwargs.pop("depends_on") kwargs.pop("expression") kwargs.pop("assignee", None) kwargs.pop("assignees", None) @@ -1878,20 +1879,27 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kwargs.pop("temp_var_types", None) if isinstance(insn.expression, Reduction) and nresults > 1: + result_assignment_ids = [ + insn_id_gen(insn.id) for i in range(nresults)] replacement_insns = [ lp.Assignment( - id=insn_id_gen(insn.id), + id=result_assignment_ids[i], + depends_on=( + result_assignment_dep_on + | (frozenset([result_assignment_ids[i-1]]) + if i else frozenset())), assignee=assignee, expression=new_expr, **kwargs) - for assignee, new_expr in zip( - insn.assignees, new_expressions)] + for i, (assignee, new_expr) in enumerate(zip( + insn.assignees, new_expressions))] else: new_expr, = new_expressions replacement_insns = [ make_assignment( id=insn_id_gen(insn.id), + depends_on=result_assignment_dep_on, assignees=insn.assignees, expression=new_expr, **kwargs) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 850f0a61fcdc2878d43895bc0e024032532aa680..b196b343edebf0b9346b449bc5a44bcc065407a2 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1427,8 +1427,8 @@ class DependencyTracker(object): raise ValueError("unknown 'var_kind': %s" % var_kind) from collections import defaultdict - self.writer_map = defaultdict(set) - self.reader_map = defaultdict(set) + self.base_writer_map = defaultdict(set) + self.base_access_map = defaultdict(set) self.temp_to_base_storage = 
kernel.get_temporary_to_base_storage_map() def map_to_base_storage(self, var_names): @@ -1442,23 +1442,27 @@ class DependencyTracker(object): return result def discard_all_sources(self): - self.writer_map.clear() - self.reader_map.clear() + self.base_writer_map.clear() + self.base_access_map.clear() + + # Anything with 'base' in the name in this class contains names normalized + # to their 'base_storage'. def add_source(self, source): """ - Specify that an instruction may be used as the source of a dependency edge. + Specify that an instruction used as the source (depended-upon + part) of a dependency edge is of interest to this tracker. """ # If source is an insn ID, look up the actual instruction. source = self.kernel.id_to_insn.get(source, source) for written in self.map_to_base_storage( set(source.assignee_var_names()) & self.relevant_vars): - self.writer_map[written].add(source.id) + self.base_writer_map[written].add(source.id) for read in self.map_to_base_storage( - source.read_dependency_names() & self.relevant_vars): - self.reader_map[read].add(source.id) + source.dependency_names() & self.relevant_vars): + self.base_access_map[read].add(source.id) def gen_dependencies_with_target_at(self, target): """ @@ -1471,51 +1475,87 @@ class DependencyTracker(object): # If target is an insn ID, look up the actual instruction. 
target = self.kernel.id_to_insn.get(target, target) - tgt_write = self.map_to_base_storage( - set(target.assignee_var_names()) & self.relevant_vars) - tgt_read = self.map_to_base_storage( - target.read_dependency_names() & self.relevant_vars) - - for (accessed_vars, accessor_map) in [ - (tgt_read, self.writer_map), - (tgt_write, self.reader_map), - (tgt_write, self.writer_map)]: + for ( + tgt_dir, src_dir, src_base_var_to_accessor_map + ) in [ + ("any", "w", self.base_writer_map), + ("w", "any", self.base_access_map), + ]: for dep in self.get_conflicting_accesses( - accessed_vars, accessor_map, target.id): + target, tgt_dir, src_dir, src_base_var_to_accessor_map): yield dep - def get_conflicting_accesses( - self, accessed_vars, var_to_accessor_map, target): + def get_conflicting_accesses(self, target, tgt_dir, src_dir, + src_base_var_to_accessor_map): + + def get_written_names(insn): + return set(insn.assignee_var_names()) & self.relevant_vars + + def get_accessed_names(insn): + return insn.dependency_names() & self.relevant_vars + + dir_to_getter = {"w": get_written_names, "any": get_accessed_names} + + def filter_var_set_for_base_storage(var_name_set, base_storage_name): + return set( + name + for name in var_name_set + if (self.temp_to_base_storage.get(name, name) + == base_storage_name)) + + tgt_accessed_vars = dir_to_getter[tgt_dir](target) + tgt_accessed_vars_base = self.map_to_base_storage(tgt_accessed_vars) + + for race_var_base in sorted(tgt_accessed_vars_base): + for source_id in sorted( + src_base_var_to_accessor_map[race_var_base]): - def determine_conflict_nature(source, target): - if (not self.reverse and source in - self.kernel.get_nosync_set(target, scope=self.var_kind)): - return None - if (self.reverse and target in - self.kernel.get_nosync_set(source, scope=self.var_kind)): - return None - return self.describe_dependency(source, target) + # {{{ no barrier if nosync - for var in sorted(accessed_vars): - for source in 
sorted(var_to_accessor_map[var]): - dep_descr = determine_conflict_nature(source, target) + if (not self.reverse and source_id in + self.kernel.get_nosync_set(target.id, scope=self.var_kind)): + continue + if (self.reverse and target.id in + self.kernel.get_nosync_set(source_id, scope=self.var_kind)): + continue + # }}} + + dep_descr = self.describe_dependency(source_id, target) if dep_descr is None: continue + source = self.kernel.id_to_insn[source_id] + src_race_vars = filter_var_set_for_base_storage( + dir_to_getter[src_dir](source), race_var_base) + tgt_race_vars = filter_var_set_for_base_storage( + tgt_accessed_vars, race_var_base) + + race_var = race_var_base + + # Only one (non-base_storage) race variable name: Data is not + # being passed between aliases, so we may look at indices. + if src_race_vars == tgt_race_vars and len(src_race_vars) == 1: + race_var, = src_race_vars + + from loopy.symbolic import do_access_ranges_overlap_conservative + if not do_access_ranges_overlap_conservative( + self.kernel, target.id, tgt_dir, + source_id, src_dir, race_var): + continue + yield DependencyRecord( - source=self.kernel.id_to_insn[source], - target=self.kernel.id_to_insn[target], + source=source, + target=target, dep_descr=dep_descr, - variable=var, + variable=race_var, var_kind=self.var_kind) - def describe_dependency(self, source, target): + def describe_dependency(self, source_id, target): dep_descr = None - source = self.kernel.id_to_insn[source] - target = self.kernel.id_to_insn[target] + source = self.kernel.id_to_insn[source_id] if self.reverse: source, target = target, source diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 9e16c3a598246aa71e125ce3d04f372d7c90f28e..242ba6ab79e7b58b99b0d416f0c4b3025fdb7f09 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1537,6 +1537,10 @@ class PrimeAdder(IdentityMapper): # {{{ get access range +class UnableToDetermineAccessRange(Exception): + pass + + def get_access_range(domain, subscript, assumptions): 
domain, assumptions = isl.align_two(domain, assumptions) @@ -1558,8 +1562,17 @@ def get_access_range(domain, subscript, assumptions): access_map = access_map.insert_dims(dim_type.set, dn, dims) for idim in range(dims): - idx_aff = aff_from_expr(access_map.get_space(), - subscript[idim]) + sub_idim = subscript[idim] + with isl.SuppressedWarnings(domain.get_ctx()): + try: + idx_aff = aff_from_expr(access_map.get_space(), sub_idim) + except TypeError as e: + raise UnableToDetermineAccessRange( + "%s: %s" % (type(e).__name__, str(e))) + except isl.Error as e: + raise UnableToDetermineAccessRange( + "%s: %s" % (type(e).__name__, str(e))) + idx_aff = idx_aff.set_coefficient_val( dim_type.in_, dn+idim, -1) @@ -1582,11 +1595,11 @@ def get_access_range(domain, subscript, assumptions): class BatchedAccessRangeMapper(WalkMapper): - def __init__(self, kernel, arg_names): + def __init__(self, kernel, var_names): self.kernel = kernel - self.arg_names = set(arg_names) - self.access_ranges = dict((arg, None) for arg in arg_names) - self.bad_subscripts = dict((arg, []) for arg in arg_names) + self.var_names = set(var_names) + self.access_ranges = dict((arg, None) for arg in var_names) + self.bad_subscripts = dict((arg, []) for arg in var_names) def map_subscript(self, expr, inames): domain = self.kernel.get_inames_domain(inames) @@ -1594,7 +1607,7 @@ class BatchedAccessRangeMapper(WalkMapper): assert isinstance(expr.aggregate, p.Variable) - if expr.aggregate.name not in self.arg_names: + if expr.aggregate.name not in self.var_names: return arg_name = expr.aggregate.name @@ -1604,7 +1617,12 @@ class BatchedAccessRangeMapper(WalkMapper): self.bad_subscripts[arg_name].append(expr) return - access_range = get_access_range(domain, subscript, self.kernel.assumptions) + try: + access_range = get_access_range( + domain, subscript, self.kernel.assumptions) + except UnableToDetermineAccessRange: + self.bad_subscripts[arg_name].append(expr) + return if self.access_ranges[arg_name] is None: 
self.access_ranges[arg_name] = access_range @@ -1622,7 +1640,7 @@ class BatchedAccessRangeMapper(WalkMapper): def map_linear_subscript(self, expr, inames): self.rec(expr.index, inames) - if expr.aggregate.name in self.arg_names: + if expr.aggregate.name in self.var_names: self.bad_subscripts[expr.aggregate.name].append(expr) def map_reduction(self, expr, inames): @@ -1634,20 +1652,87 @@ class BatchedAccessRangeMapper(WalkMapper): class AccessRangeMapper(object): - def __init__(self, kernel, arg_name): - self.arg_name = arg_name - self.inner_mapper = BatchedAccessRangeMapper(kernel, [arg_name]) + def __init__(self, kernel, var_name): + self.var_name = var_name + self.inner_mapper = BatchedAccessRangeMapper(kernel, [var_name]) def __call__(self, expr, inames): return self.inner_mapper(expr, inames) @property def access_range(self): - return self.inner_mapper.access_ranges[self.arg_name] + return self.inner_mapper.access_ranges[self.var_name] @property def bad_subscripts(self): - return self.inner_mapper.bad_subscripts[self.arg_name] + return self.inner_mapper.bad_subscripts[self.var_name] + +# }}} + + +# {{{ do_access_ranges_overlap_conservative + +def _get_access_range_conservative(kernel, insn_id, access_dir, var_name): + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import MultiAssignmentBase + + assert access_dir in ["w", "any"] + + if not isinstance(insn, MultiAssignmentBase): + if access_dir == "any": + return var_name in insn.dependency_names() + else: + return var_name in insn.write_dependency_names() + + exprs = list(insn.assignees) + if access_dir == "any": + exprs.append(insn.expression) + exprs.extend(insn.predicates) + + arange = False + for expr in exprs: + arm = AccessRangeMapper(kernel, var_name) + arm(expr, kernel.insn_inames(insn)) + + if arm.bad_subscripts: + return True + + expr_arange = arm.access_range + if expr_arange is None: + continue + + if arange is False: + arange = expr_arange + else: + arange = arange | expr_arange + 
+ return arange + + +def do_access_ranges_overlap_conservative( + kernel, insn1_id, insn1_dir, insn2_id, insn2_dir, var_name): + """Determine whether the access ranges to *var_name* in the two + given instructions overlap. This determination is made 'conservatively', + i.e. if precise information is unavailable, it is concluded that the + ranges overlap. + + :arg insn1_dir: either ``"w"`` or ``"any"``, to indicate which + type of access is desired--writing or any + :arg insn2_dir: either ``"w"`` or ``"any"`` + :returns: a :class:`bool` + """ + + insn1_arange = _get_access_range_conservative( + kernel, insn1_id, insn1_dir, var_name) + insn2_arange = _get_access_range_conservative( + kernel, insn2_id, insn2_dir, var_name) + + if insn1_arange is False or insn2_arange is False: + return False + if insn1_arange is True or insn2_arange is True: + return True + + return not (insn1_arange & insn2_arange).is_empty() # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 37c5d85a1ade5c8f7fadb2c6a785cf7cea3dde40..e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -228,7 +228,8 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync -def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, + empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. *no_sync_with* is only added if *sink* depends on *source* or if the instruction pair is in a conflicting group. @@ -248,8 +249,16 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): :arg force: A :class:`bool`. If *True*, add a *no_sync_with* directive even without the presence of a dependency edge or conflicting instruction group. + :arg empty_ok: If *True*, do not complain even if no *nosync* tags were + added as a result of the transformation. 
:return: The updated kernel + + .. versionchanged:: 2018.1 + + If the transformation adds no *nosync* directives, it will complain. + This used to silently pass. This behavior can be restored using + *empty_ok*. """ if isinstance(source, str) and source in kernel.id_to_insn: @@ -264,6 +273,11 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): sinks = frozenset( sink.id for sink in find_instructions(kernel, sink)) + if not sources and not empty_ok: + raise LoopyError("No match found for source specification '%s'." % source) + if not sinks and not empty_ok: + raise LoopyError("No match found for sink specification '%s'." % sink) + def insns_in_conflicting_groups(insn1_id, insn2_id): insn1 = kernel.id_to_insn[insn1_id] insn2 = kernel.id_to_insn[insn2_id] @@ -275,11 +289,12 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): from collections import defaultdict nosync_to_add = defaultdict(set) + rec_dep_map = kernel.recursive_insn_dep_map() for sink in sinks: for source in sources: needs_nosync = force or ( - source in kernel.recursive_insn_dep_map()[sink] + source in rec_dep_map[sink] or insns_in_conflicting_groups(source, sink)) if not needs_nosync: @@ -289,6 +304,12 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): if bidirectional: nosync_to_add[source].add((sink, scope)) + if not nosync_to_add and not empty_ok: + raise LoopyError("No nosync annotations were added as a result " + "of this call. add_nosync will (by default) only add them to " + "accompany existing depencies or group exclusions. 
Maybe you want " + "to pass force=True?") + new_instructions = list(kernel.instructions) for i, insn in enumerate(new_instructions): diff --git a/loopy/version.py b/loopy/version.py index 7141a678297ded5e0d6e2f16f065f035a034d540..aeb0b277a6c4de8a6db346aee97014699d591d03 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,76 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v76-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v77-islpy%s" % _islpy_version + + +FALLBACK_LANGUAGE_VERSION = (2017, 2, 1) +MOST_RECENT_LANGUAGE_VERSION = (2018, 1) + +LOOPY_USE_LANGUAGE_VERSION_2018_1 = (2018, 1) +LOOPY_USE_LANGUAGE_VERSION_2017_2_1 = (2017, 2, 1) + +LANGUAGE_VERSION_SYMBOLS = [ + "LOOPY_USE_LANGUAGE_VERSION_2018_1", + "LOOPY_USE_LANGUAGE_VERSION_2017_2_1", + ] + +__doc__ = """ + +.. currentmodule:: loopy +.. data:: VERSION + + A tuple representing the current version number of loopy, for example + **(2017, 2, 1)**. Direct comparison of these tuples will always yield + valid version comparisons. + +.. _language-versioning: + +Loopy Language Versioning +------------------------- + +At version 2018.1, :mod:`loopy` introduced a language versioning scheme to make +it easier to evolve the language while retaining backward compatibility. What +prompted this is the addition of +:attr:`loopy.Options.enforce_variable_access_ordered`, which (despite +its name) serves to enable a new check that helps ensure that all variable +access in a kernel is ordered as intended. Since that has the potential to +break existing programs, kernels now have to declare support for a given +language version to let them take advantage of this check. + +As a result, :mod:`loopy` will now issue a warning when a call to +:func:`loopy.make_kernel` does not declare a language version. Such kernels +will (indefinitely) default to language version 2017.2.1. 
If passing a +language version to :func:`make_kernel` is impractical, you may also import +one of the ``LOOPY_USE_LANGUAGE_VERSION_...`` symbols given below using:: + + from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 + +in the global namespace of the function calling :func:`make_kernel`. If +*lang_version* in that call is not explicitly given, this value will be used. + +Language versions will generally reflect the version number of :mod:`loopy` in +which they were introduced, though it is likely that most versions of +:mod:`loopy` do not introduce language incompatibilities. In such +situations, the previous language version number remains. (In fact, we +will work hard to avoid backward-incompatible language changes.) + +.. data:: MOST_RECENT_LANGUAGE_VERSION + + A tuple representing the most recent language version number of loopy, for + example **(2018, 1)**. Direct comparison of these tuples will always + yield valid version comparisons. + + +History of Language Versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. data:: LOOPY_USE_LANGUAGE_VERSION_2018_1 + + :attr:`loopy.Options.enforce_variable_access_ordered` + is turned on by default. + +.. data:: LOOPY_USE_LANGUAGE_VERSION_2017_2_1 + + Initial legacy language version. 
+""" diff --git a/test/test_apps.py b/test/test_apps.py index c4844d3a3c5d88e0c4eeccf0d67e9b4284fd744f..12b59e18afc1ae956d6b7a4817e908a8bd89e7dc 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -49,6 +49,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + # {{{ convolutions def test_convolution(ctx_factory): @@ -194,7 +197,7 @@ def test_rob_stroud_bernstein(ctx_factory): for alpha2 tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp} + {id=write_tmp,dep=init_w:aind_init} w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ {id=update_w,dep=init_w:write_tmp} aind = aind + 1 \ @@ -255,7 +258,7 @@ def test_rob_stroud_bernstein_full(ctx_factory): <> w = s**(deg-alpha1) {id=init_w} <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \ - {id=write_tmp} + {id=write_tmp,dep=init_w:aind_init} for alpha2 w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ {id=update_w,dep=init_w:write_tmp} @@ -269,15 +272,16 @@ def test_rob_stroud_bernstein_full(ctx_factory): <> xi2 = qpts[0, i1_2] {dep=aind_incr} <> s2 = 1-xi2 <> r2 = xi2/s2 - <> w2 = s2**deg + <> w2 = s2**deg {id=w2_init} for alpha1_2 for i2_2 result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \ - w2 * tmp[alpha1_2, i2_2] + w2 * tmp[alpha1_2, i2_2] {id=res2,dep=w2_init} end - w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) + w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) \ + {id=w2_update, dep=res2} end end end diff --git a/test/test_c_execution.py b/test/test_c_execution.py index d1b3c95caa034191b4b29c49076fc101cd318950..f653eb0dc2b5edeec421c58a56d07d07472305f1 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -40,6 +40,9 @@ else: faulthandler.enable() +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_c_target(): from loopy.target.c import ExecutableCTarget diff --git a/test/test_dg.py b/test/test_dg.py index d65c68ed4c729582d511064d6495535efcf7a9a4..e96c76d88b0ab497c7ccdc7df94880a894607bc3 100644 
--- a/test/test_dg.py +++ b/test/test_dg.py @@ -34,6 +34,9 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_dg_volume(ctx_factory): #logging.basicConfig(level=logging.DEBUG) diff --git a/test/test_diff.py b/test/test_diff.py index 95471f9b126fd6b763530d115c21509d14d2ba47..3d19721ac030ceccf819f4135b4e734594384e53 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -48,6 +48,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/test_domain.py b/test/test_domain.py index 9d0379a50af188cc84de8e01f8278030b6cc04e2..680ff299292e928c6286a168c3e71a23c60aac9b 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -52,6 +52,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_assume(ctx_factory): ctx = ctx_factory() diff --git a/test/test_fortran.py b/test/test_fortran.py index 842a0127e3118ec8e7a0ea89ed17decc091e8566..ea2f68b617e4af41210dc05988a3eebb7f0e49a6 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -405,15 +405,11 @@ def test_fuse_kernels(ctx_factory): fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) - knl = lp.fuse_kernels((xderiv, yderiv)) + knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) knl = lp.prioritize_loops(knl, "e,i,j,k") assert len(knl.temporary_variables) == 2 - # This is needed for correctness, otherwise ordering could foul things up. 
- knl = lp.assignment_to_subst(knl, "prev") - knl = lp.assignment_to_subst(knl, "prev_0") - ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) diff --git a/test/test_linalg.py b/test/test_linalg.py index 3d422f1d8b5a847d4445468978ee529db95c481f..accdebc1237c70f4227adc5bfcba6fa9cf88d190 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -62,6 +62,9 @@ def check_float4(result, ref_result): ref_result[comp], result[comp], rtol=1e-3, atol=1e-3), None +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_axpy(ctx_factory): logging.basicConfig(level="INFO") ctx = ctx_factory() diff --git a/test/test_loopy.py b/test/test_loopy.py index e36a4c2c3cb3f7e70a5b039ea631bbce20923be8..8581ae5b879a7d9a282b5fefb9f3155928e83631 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -52,6 +52,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_globals_decl_once_with_multi_subprogram(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1085,12 +1088,12 @@ def test_atomic_load(ctx_factory, dtype): "{ [i,j]: 0<=i,j upper = 0 - <> lower = 0 + <> upper = 0 {id=init_upper} + <> lower = 0 {id=init_lower} temp = 0 {id=init, atomic} for i - upper = upper + i * a[i] {id=sum0} - lower = lower - b[i] {id=sum1} + upper = upper + i * a[i] {id=sum0,dep=init_upper} + lower = lower - b[i] {id=sum1,dep=init_lower} end temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\ nosync=init} @@ -1608,8 +1611,10 @@ def test_missing_temporary_definition_detection(): def test_missing_definition_check_respects_aliases(): # Based on https://github.com/inducer/loopy/issues/69 knl = lp.make_kernel("{ [i] : 0<=i z[i] = z[i+1] + z[i] {id=wr_z} <> v[i] = 11 {id=wr_v} ... 
nop {dep=wr_z:wr_v,id=yoink} - z[i] = z[i] - z[i+1] + v[i] + z[i] = z[i] - z[i+1] + v[i] {dep=yoink} end """) @@ -1870,7 +1875,7 @@ def test_global_barrier(ctx_factory): <> z[i] = z[i+1] + z[i] {id=wr_z,dep=top} <> v[i] = 11 {id=wr_v,dep=top} ... gbarrier {dep=wr_z:wr_v,id=yoink} - z[i] = z[i] - z[i+1] + v[i] {id=iupd} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=wr_z} end ... gbarrier {dep=iupd,id=postloop} z[i] = z[i] - z[i+1] + v[i] {dep=postloop} @@ -2107,11 +2112,11 @@ def test_if_else(ctx_factory): "{ [i]: 0<=i<50}", """ if i % 3 == 0 - a[i] = 15 + a[i] = 15 {nosync_query=writes:a} elif i % 3 == 1 - a[i] = 11 + a[i] = 11 {nosync_query=writes:a} else - a[i] = 3 + a[i] = 3 {nosync_query=writes:a} end """ ) @@ -2131,14 +2136,14 @@ def test_if_else(ctx_factory): for i if i % 2 == 0 if i % 3 == 0 - a[i] = 15 + a[i] = 15 {nosync_query=writes:a} elif i % 3 == 1 - a[i] = 11 + a[i] = 11 {nosync_query=writes:a} else - a[i] = 3 + a[i] = 3 {nosync_query=writes:a} end else - a[i] = 4 + a[i] = 4 {nosync_query=writes:a} end end """ @@ -2159,17 +2164,17 @@ def test_if_else(ctx_factory): if i < 25 for j if j % 2 == 0 - a[i, j] = 1 + a[i, j] = 1 {nosync_query=writes:a} else - a[i, j] = 0 + a[i, j] = 0 {nosync_query=writes:a} end end else for j if j % 2 == 0 - a[i, j] = 0 + a[i, j] = 0 {nosync_query=writes:a} else - a[i, j] = 1 + a[i, j] = 1 {nosync_query=writes:a} end end end @@ -2363,8 +2368,9 @@ def test_nosync_option_parsing(): assert "id=insn5, no_sync_with=insn1@any" in kernel_str -def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): - from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop) +def barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): + from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop, + CallKernel, ReturnFromKernel) watch_for_barrier = False seen_barrier = False loop_level = 0 @@ -2374,9 +2380,7 @@ def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): if 
sched_item.insn_id == id1: watch_for_barrier = True elif sched_item.insn_id == id2: - assert watch_for_barrier - assert seen_barrier - return + return watch_for_barrier and seen_barrier elif isinstance(sched_item, Barrier): if watch_for_barrier and loop_level not in ignore_barriers_in_levels: seen_barrier = True @@ -2384,6 +2388,11 @@ def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()): loop_level += 1 elif isinstance(sched_item, LeaveLoop): loop_level -= 1 + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + pass + else: + raise RuntimeError("schedule item type '%s' not understood" + % type(sched_item).__name__) raise RuntimeError("id2 was not seen") @@ -2410,9 +2419,9 @@ def test_barrier_insertion_near_top_of_loop(): print(knl) - assert_barrier_between(knl, "ainit", "tcomp") - assert_barrier_between(knl, "tcomp", "bcomp1") - assert_barrier_between(knl, "bcomp1", "bcomp2") + assert barrier_between(knl, "ainit", "tcomp") + assert barrier_between(knl, "tcomp", "bcomp1") + assert barrier_between(knl, "bcomp1", "bcomp2") def test_barrier_insertion_near_bottom_of_loop(): @@ -2437,8 +2446,8 @@ def test_barrier_insertion_near_bottom_of_loop(): print(knl) - assert_barrier_between(knl, "bcomp1", "bcomp2") - assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) + assert barrier_between(knl, "bcomp1", "bcomp2") + assert barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) def test_barrier_in_overridden_get_grid_size_expanded_kernel(): @@ -2570,10 +2579,10 @@ def test_struct_assignment(ctx_factory): "{ [i]: 0<=i {[i]: 0 <= i < n}", """ - <>tmp[i] = i - tmp[0] = 0 + <>tmp[i] = i {id=init} + tmp[0] = 0 {dep=init} """, fixed_parameters=dict(n=1)) @@ -2788,6 +2797,65 @@ def test_add_prefetch_works_in_lhs_index(): assert "a1_map" not in get_dependencies(insn.assignees) +def test_check_for_variable_access_ordering(): + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_misc.py 
b/test/test_misc.py index 0273948b38b28b85e42a600bffb65fbf86dcc554..ec14770a912af978fbc6651110529a86b307df83 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -32,6 +32,9 @@ import logging logger = logging.getLogger(__name__) +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_compute_sccs(): from loopy.tools import compute_sccs import random diff --git a/test/test_nbody.py b/test/test_nbody.py index e118b04b997020943d79ec1ba566eff85d56199a..f2a8fc1981ddc2066ff52a2b712df95b5d36ccd2 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -34,6 +34,9 @@ import logging logger = logging.getLogger(__name__) +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_nbody(ctx_factory): logging.basicConfig(level=logging.INFO) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index eff3dbd0e07439bbec399479183a7e9ddb69b9ff..a287ad59d7697eef79336678afa831e73b81784b 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -44,6 +44,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + @pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) @pytest.mark.parametrize("opt_level", [11]) @@ -57,13 +60,14 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") hsv_s = lp.tag_instructions(hsv_s, "sknl") hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"]) #hsv = hsv_s + hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True) from gnuma_loopy_transforms import ( fix_euler_parameters, diff --git a/test/test_reduction.py b/test/test_reduction.py index 
909a800b29c75b13fad494b5a859186b9cd5587c..6b62bad5b50952a3d29beec49cfce4369d5a4acf 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -49,6 +49,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_nonsense_reduction(ctx_factory): ctx = ctx_factory() diff --git a/test/test_scan.py b/test/test_scan.py index 08754819c9a156403aba689cb3e9c238144e7905..44903611d27e14e502c0c8459be9378dbc77a9a4 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -56,6 +56,9 @@ __all__ = [ # - scan(a) + scan(b) # - test for badly tagged inames +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + @pytest.mark.parametrize("n", [1, 2, 3, 16]) @pytest.mark.parametrize("stride", [1, 2]) def test_sequential_scan(ctx_factory, n, stride): diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py index 0571e41910020aa0a60cd911a63b6ce2984ed939..ecb2352ae277bd0677af09801d7bf24ee30da6b9 100644 --- a/test/test_sem_reagan.py +++ b/test/test_sem_reagan.py @@ -31,6 +31,9 @@ from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_tim2d(ctx_factory): dtype = np.float32 ctx = ctx_factory() diff --git a/test/test_statistics.py b/test/test_statistics.py index eeb4a5a288afdd5b9295b0b681abb61b5f021d97..e4232e613c569cb4a0d66b500a981643bf5bac05 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -34,6 +34,9 @@ import numpy as np from pymbolic.primitives import Variable +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_op_counter_basic(): knl = lp.make_kernel( diff --git a/test/test_target.py b/test/test_target.py index d3cf2670cb0db0eb5d0046ce1d816b679d4a1ed8..15964987ab3d83d31c91ea266f29698a695c74a6 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -52,6 +52,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # 
noqa + + def test_ispc_target(occa_mode=False): from loopy.target.ispc import ISPCTarget @@ -203,8 +206,8 @@ def test_random123(ctx_factory, tp): <> key2 = make_uint2(i, 324830944) {inames=i} <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i} <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} - <> real, ctr = philox4x32_TYPE(ctr, key2) {dep=init_ctr} - <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr} + <> real, ctr = philox4x32_TYPE(ctr, key2) {id=realpart,dep=init_ctr} + <> imag, ctr = threefry4x32_TYPE(ctr, key4) {dep=init_ctr:realpart} out[i, 0] = real.s0 + 1j * imag.s0 out[i, 1] = real.s1 + 1j * imag.s1 diff --git a/test/test_transform.py b/test/test_transform.py index 0e10db362f36b7fc258059c2ec7ed1a344b97212..e1a58e30286141b4d0592debcd308552f32ff632 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -49,6 +49,9 @@ __all__ = [ ] +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_1 # noqa + + def test_chunk_iname(ctx_factory): ctx = ctx_factory() @@ -75,7 +78,8 @@ def test_collect_common_factors(ctx_factory): """ out_tmp = 0 {id=out_init,inames=i} out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init} - out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init} + out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] \ + {id=out_up2,dep=out_init,nosync=out_up1} out[i] = out_tmp {dep=out_up1:out_up2} """) knl = lp.add_and_infer_dtypes(knl, @@ -492,7 +496,8 @@ def test_add_nosync(): orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + empty_ok=True) assert frozenset() == knl.id_to_insn["insn2"].no_sync_with # Dependency present