diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1caef802b7340c7308f1b6655711481b91f0d889..ea69114d6b21e1306f07cdf0684ac1a025bfbaac 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,10 @@ Python 2.7 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 2.7 with legacy PyOpenCL: script: @@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL: except: - tags retry: 2 + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL: script: @@ -43,6 +51,10 @@ Python 3.6 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL Twice With Cache: script: @@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + # PyPy POCL: # script: @@ -77,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". 
./build-py-project-and-run-examples.sh" tags: - python3.6 @@ -87,6 +103,7 @@ Python 3.6 POCL Examples: except: - tags + CentOS binary: script: - (cd build-helpers; ./make-linux-build-docker.sh --nodate) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..601df74bd9a655b3b29decdbdf499d55b25b6385 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Andreas Klöckner and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh index 1e35a1e1b9949b37f95b05ebeef223c8a5955ff8..035634b16072e0188270abd8736dab99ce31dada 100755 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ b/build-helpers/make-linux-build-docker-inner-part-2.sh @@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy cd loopy grep -v pyopencl requirements.txt > myreq.txt + +# needed for pyinstaller package to be usable +echo packaging >> myreq.txt + pip install -r myreq.txt python setup.py install diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 4ff1ef2fc343f845cac372fe6149cd786796ef90..5a59e84282119209cc89eb18e3a4eda97725edf0 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -4,6 +4,8 @@ Calling Loopy Kernels and External Functions Goals of a function interface ----------------------------- +- *FIXME: * Needs to change after the new design of program. + - Must be able to have complete information of the function just through the epxression node. - Must adhere to :mod:`loopy` semantics of immutability. @@ -30,7 +32,7 @@ kernel, whose name has been resolved by the kernel. The process of matching a function idenitifier with the function definition is called "resolving". A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it -is "resolved" by one of the ``function_scoper`` in a +is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a :attr:`LoopKernel.scoped_functions` - Functions already registered by the target. Some examples include -- @@ -41,11 +43,11 @@ is "resolved" by one of the ``function_scoper`` in a - Functions registered as ``CallableKernels`` using ``lp.register_callable_kernel(...)``. - Functions that have been provided through - ``lp.register_function_scoper(...)`` + ``lp.register_function_id_to_in_knl_callable_mapper(...)`` - Functions that can be made known from the user through ``lp.register_function_mangler``. 
This is planned to be deprecated, as its functionality is superseded by - ``lp.register_function_scoper(...)``. + ``lp.register_function_id_to_in_knl_callable_mapper(...)``. Expressions after a function is scoped -------------------------------------- @@ -180,7 +182,7 @@ Changes on the target side to accommodate the new function interface -------------------------------------------------------------------- The earlier "function\_mangler" as a member method of the class -``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The function scopers would return a list of functions with the signature ``(target, identifier)->lp.InKernelCallable``. diff --git a/doc/tutorial.rst b/doc/tutorial.rst index aaa3228526d242741044b779829957f0a90cdc5c..25082f88a10a7e3276c1ac73251633ee9ac93e29 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -334,7 +334,7 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl.root_kernel.stringify(with_dependencies=True)) --------------------------------------------------------------------------- KERNEL: loopy_kernel --------------------------------------------------------------------------- @@ -1145,7 +1145,7 @@ the right by 1 in parallel: ... end ... """, ... [ - ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v1", @@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... 
"[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1207,8 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1237,10 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. 
- >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1279,7 +1281,7 @@ does in more detail: The kernel translates into two OpenCL kernels. - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -1321,8 +1323,8 @@ tagged, as in the following example:: "{ [i]: 0<=i>> op_map = lp.get_op_map(knl) + >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, workitem) : ... + Op(np:dtype('float32'), add, subgroup) : ... Each line of output will look roughly like:: - Op(np:dtype('float32'), add, workitem) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 } + Op(np:dtype('float32'), add, subgroup) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 } :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. 
A @@ -1579,12 +1581,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 68618a7ecaaaba9c9c93495a05c2313a7660b377..c13d99bd06295096c26d6e113841c853f80645fc 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -7,14 +7,14 @@ from loopy.target.c import CTarget # {{{ blas callable class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): for i in range(0, 2): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) mat_dtype = arg_id_to_dtype[0].numpy_dtype vec_dtype = arg_id_to_dtype[1].numpy_dtype @@ -34,7 +34,7 @@ class BLASCallable(lp.ScalarCallable): from loopy.types import NumpyType return self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}), program_callables_info + -1: NumpyType(vec_dtype)}), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index cc4926feeeb3815da1c66a2548bf3235df8f1fcc..be22e268c85fe985a98763426faf8cfadf73c5fb 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -1,7 +1,5 @@ import numpy as np import loopy as lp -import pyopencl as cl -import pyopencl.array knl = lp.make_kernel( "{ [i,k]: 0<=i[_0-9a-zA-Z]+)" - "(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 
d2723c57ff6872d613adeff2e30f2e74b2616bb5..26db6ec4e6ac8b81ab7f4e0dcf1a000bd9e9fd3c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -38,7 +38,7 @@ import re from pytools import UniqueNameGenerator, generate_unique_names from loopy.diagnostic import CannotBranchDomainTree, LoopyError -from loopy.tools import natsorted +from loopy.tools import natsorted, update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -222,10 +222,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. .. attribute:: is_called_from_host - An instance of :class:`bool`. Will be set *False* for the kernel which - would be called from another top level kernels. Default value is + would be called from other top level kernels. Default value is *True*. + """ # {{{ constructor @@ -253,7 +253,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, - is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, @@ -1036,8 +1035,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) + @memoize_method def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, - program_callables_info, ignore_auto=False): + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1052,7 +1052,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.kernel.tools import get_direct_callee_kernels callee_kernels = get_direct_callee_kernels(self, - program_callables_info, insn_ids) + callables_table, insn_ids) # }}} @@ -1073,7 +1073,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): for callee_kernel in callee_kernels: gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( frozenset(insn.id for insn in callee_kernel.instructions), - program_callables_info, ignore_auto) + callables_table, ignore_auto) global_sizes.update(gsize) local_sizes.update(lsize) @@ -1120,7 +1120,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes - def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given @@ -1134,14 +1135,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, - program_callables_info=program_callables_info, + callables_table=callables_table, ignore_auto=ignore_auto) assert self.is_called_from_host, ("Callee kernels do not have sufficient " "information to compute grid sizes.") global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( - insn_ids, program_callables_info, ignore_auto=ignore_auto) + insn_ids, callables_table, ignore_auto=ignore_auto) def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1173,7 +1174,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, - program_callables_info, ignore_auto=False): + callables_table, ignore_auto=False): """Return a 
tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1184,7 +1185,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, program_callables_info, ignore_auto) + insn_ids, callables_table, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1192,7 +1193,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1200,10 +1201,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + def get_grid_size_upper_bounds_as_exprs(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1213,7 +1214,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) # }}} @@ -1407,9 +1408,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - # FIXME: scream and then convert to a program - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(self) + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.program import make_program + program = make_program(self) return program(*args, **kwargs) # }}} @@ -1514,14 +1516,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
- """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash def __hash__(self): from loopy.tools import LoopyKeyBuilder diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 186597c64734b54b8d08f0db43b57826d79f9567..0ed1f940131238a123616d95b8b99a426e10bfc7 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -833,6 +833,8 @@ class ArrayBase(ImmutableRecord): dim_names=dim_names, order=order, alignment=alignment, + for_atomic=for_atomic, + target=target, **kwargs) def __eq__(self, other): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 674eaca3fcd3f11b92d9e5e9c5a946364714e519..52e299b6140428515a0aab764c4b6e2dab0c0827 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2120,7 +2120,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) - make_program = kwargs.pop("make_program", True) + is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -2146,7 +2146,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if make_program: + if not is_callee_kernel: from loopy.version import LANGUAGE_VERSION_SYMBOLS version_to_symbol = dict( @@ -2337,19 +2337,19 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.kernel.tools import infer_args_are_output_only + knl = infer_args_are_output_only(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - if make_program: - from loopy.program import make_program_from_kernel - 
return make_program_from_kernel(knl) - else: + if is_callee_kernel: return knl + else: + from loopy.program import make_program + return make_program(knl) def make_function(*args, **kwargs): @@ -2358,7 +2358,7 @@ def make_function(*args, **kwargs): raise LoopyError("lang_version should be set for program, not " "functions.") - kwargs['make_program'] = False + kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2ea260656b8d89031849d43364977404e1ca6690..3e628f5c9a60f188c0033efa40c5fd762e53ba11 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -23,20 +23,26 @@ THE SOFTWARE. """ -import re -import six - from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.symbolic import parse_tagged_name +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel + +__doc__ = """ -from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander) +.. currentmodule:: loopy -from loopy.kernel import LoopKernel +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable +.. 
autoclass:: ManglerCallable + +""" # {{{ argument descriptors @@ -44,7 +50,7 @@ from loopy.kernel import LoopKernel class ValueArgDescriptor(ImmutableRecord): hash_fields = () - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArrayArgDescriptor(ImmutableRecord): @@ -94,7 +100,7 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -106,8 +112,6 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. """ - from loopy.kernel.tools import infer_arg_is_output_only - kernel = infer_arg_is_output_only(kernel) kw_to_pos = {} pos_to_kw = {} @@ -131,7 +135,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): """ Helper class to set the :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the - callee kernels. Refer + callee kernels. Refer to :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. @@ -145,7 +149,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): .. note:: - This class acts as a pseduo-callable and its significance lies in + This class acts as a pseudo-callable and its significance lies in solving picklability issues. """ fields = set(["local_size", "global_size"]) @@ -154,7 +158,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): self.local_size = local_size self.global_size = global_size - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, callables_table, ignore_auto=True): return self.local_size, self.global_size # }}} @@ -168,22 +172,26 @@ class InKernelCallable(ImmutableRecord): .. attribute:: name - The name of the callable which can be encountered within a kernel. 
+ The name of the callable which can be encountered within expressions in + a kernel. .. attribute:: arg_id_to_dtype - A mapping which indicates the arguments types and result types it would - be handling. This would be set once the callable is type specialized. + A mapping which indicates the arguments types and result types of the + callable. .. attribute:: arg_id_to_descr A mapping which gives indicates the argument shape and ``dim_tags`` it - would be responsible for generating code. These parameters would be set, - once it is shape and stride(``dim_tags``) specialized. + would be responsible for generating code. .. note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. - Negative "id" values ``-i`` in the mapping attributes indicate + - Negative "arg_id" values ``-i`` in the mapping attributes indicate return value with (0-based) index *i*. .. automethod:: __init__ @@ -209,9 +217,9 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -228,12 +236,10 @@ class InKernelCallable(ImmutableRecord): Any argument information exists both by its positional and its keyword identifier. 
""" - # FIXME: In all these with_** functions add that also passes a - # program_callables_info raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -284,7 +290,7 @@ class InKernelCallable(ImmutableRecord): def with_hw_axes_sizes(self, local_size, global_size): """ Returns a copy of *self* with modifications to comply with the grid - sizes ``(local_size, global_size)`` of the kernel in which it is + sizes ``(local_size, global_size)`` of the program in which it is supposed to be called. :arg local_size: An instance of :class:`islpy.PwAff`. @@ -298,7 +304,8 @@ class InKernelCallable(ImmutableRecord): self.arg_id_to_descr is not None) def generate_preambles(self, target): - """ Yields the target specific preamble. + """ + Yields the target specific preamble. """ raise NotImplementedError() @@ -333,12 +340,12 @@ class InKernelCallable(ImmutableRecord): class ScalarCallable(InKernelCallable): """ - An abstranct interface the to a scalar callable encountered in a kernel. + An abstract interface the to a scalar callable encountered in a kernel. .. note:: The :meth:`ScalarCallable.with_types` is intended to assist with type - specialization of the funciton and is expected to be supplemented in the + specialization of the function and is expected to be supplemented in the derived subclasses. """ @@ -361,16 +368,16 @@ class ScalarCallable(InKernelCallable): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -525,7 +532,7 @@ class CallableKernel(InKernelCallable): return self.subkernel.name def with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) new_args = [] @@ -548,10 +555,10 @@ class CallableKernel(InKernelCallable): # infer the types of the written variables based on the knowledge # of the types of the arguments supplied - specialized_kernel, program_callables_info = ( + specialized_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( pre_specialized_subkernel, - program_callables_info, + callables_table, expect_completion=True)) new_arg_id_to_dtype = {} @@ -564,9 +571,9 @@ class CallableKernel(InKernelCallable): # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info + arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): # tune the subkernel so that we have the matching shapes and # dim_tags @@ -595,15 +602,15 @@ class CallableKernel(InKernelCallable): type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, program_callables_info = ( + descriptor_specialized_knl, callables_table = ( traverse_to_infer_arg_descr(descriptor_specialized_knl, - program_callables_info)) + callables_table)) return ( self.copy( 
subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_packing_for_args(self): from loopy.kernel.data import AddressSpace @@ -696,7 +703,7 @@ class CallableKernel(InKernelCallable): class ManglerCallable(ScalarCallable): """ - A callable whose characateristic is defined by a function mangler. + A callable whose characteristic is defined by a function mangler. .. attribute:: function_mangler @@ -725,7 +732,7 @@ class ManglerCallable(ScalarCallable): return (self.name, self.function_mangler, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if self.arg_id_to_dtype is not None: # specializing an already specialized function. for arg_id, dtype in arg_id_to_dtype.items(): @@ -749,7 +756,7 @@ class ManglerCallable(ScalarCallable): return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. @@ -769,99 +776,4 @@ class ManglerCallable(ScalarCallable): # }}} - -# {{{ new pymbolic calls to scoped functions - -def next_indexed_variable(function): - """ - Returns an instance of :class:`str` with the next indexed-name in the - sequence for the name of *function*. - - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. - - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. 
- """ - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): - return function.copy() - func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - - match = func_name.match(function.name) - - if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) - else: - return "{old_name}_0".format(old_name=function.name) - - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) - - -class FunctionNameChanger(RuleAwareIdentityMapper): - """ - Changes the names of scoped functions in calls of expressions according to - the mapping ``calls_to_new_functions`` - """ - - def __init__(self, rule_mapping_context, calls_to_new_names, - subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) - self.calls_to_new_names = calls_to_new_names - self.subst_expander = subst_expander - - def map_call(self, expr, expn_state): - name, tag = parse_tagged_name(expr.function) - - if name not in self.rule_mapping_context.old_subst_rules: - expanded_expr = self.subst_expander(expr) - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) - elif expanded_expr in self.calls_to_new_names: - # FIXME: this is horribly wrong logic. 
- # investigate how to make edits to a substitution rule - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expanded_expr]), - tuple(self.rec(child, expn_state) - for child in expanded_expr.parameters)) - else: - return super(FunctionNameChanger, self).map_call( - expr, expn_state) - else: - return self.map_substitution(name, tag, expr.parameters, expn_state) - - def map_call_with_kwargs(self, expr, expn_state): - - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return super(FunctionNameChanger, self).map_call_with_kwargs( - expr, expn_state) - - -def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - name_changer = FunctionNameChanger(rule_mapping_context, - pymbolic_calls_to_new_names, subst_expander) - - return rule_mapping_context.finish_kernel( - name_changer.map_kernel(kernel)) - -# }}} - - # vim: foldmethod=marker diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8e238badb8cb9dbd2df270c56c28e6b923732f53..26856d64fc189191d818bd2ea3eac82c7bfa406e 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -755,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -769,7 +769,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non if 
local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info, ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -797,7 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, + callables_table, axis=recursion_axis) if axis is None: @@ -849,7 +849,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), - program_callables_info=program_callables_info, + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -871,7 +871,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -940,7 +940,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return kernel else: return assign_automatic_axes(kernel, - program_callables_info=program_callables_info, axis=axis+1, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows( for dep in insn.depends_on: reverse_deps.setdefault(dep, set()).add(insn.id) - # mapping of (from_id, to_id) tuples to column_index + # mapping of to_id tuples to column_index dep_to_column = {} # {{{ find column assignments @@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows( elif insn.id in starts: starts.remove(insn.id) - if starts: + if starts or pointed_at_insn_id not in processed_ids: # will continue downward row[col] = 
do_flag_downward(u"├", pointed_at_insn_id) @@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ callee kernel tools -def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): """ Returns an instance of :class:`frozenset` of all the callee kernels called in instructions in the *kernel* whose IDs are given in *insn_ids*. @@ -1892,8 +1892,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): - if insn.expression.function.name in program_callables_info: - in_knl_callable = program_callables_info[ + if insn.expression.function.name in callables_table: + in_knl_callable = callables_table[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): return in_knl_callable.subkernel @@ -1914,7 +1914,7 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_output_only(kernel): """ Returns a copy of *kernel* with the attribute ``is_output_only`` set. 
diff --git a/loopy/library/function.py b/loopy/library/function.py index 8338875d0ec9f57dcce702a603293d038a9fbd02..f225b62f9f77b889c7137d69ff7e3944268641fa 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -26,44 +26,54 @@ from loopy.kernel.function_interface import ScalarCallable class MakeTupleCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple"), program_callables_info) + name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), - program_callables_info) + callables_table) class IndexOfCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = dict((i, dtype) for i, dtype in arg_id_to_dtype.items() if dtype is not None) new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) -def loopy_specific_callable_scopers(target, identifier): +def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` for the *identifier* + which is not present in *target*, but whose interface is given by + :mod:`loo.py`. 
Callables that fall in this category are -- + + - reductions leading to function calls like ``argmin``, ``argmax``. + - callables that have a predefined meaning in :mod:`loo.py` like + ``make_tuple``, ``indexof``, ``indexof_vec``. + """ if identifier == "make_tuple": return MakeTupleCallable(name="make_tuple") if identifier in ["indexof", "indexof_vec"]: return IndexOfCallable(name=identifier) - from loopy.library.reduction import reduction_scoper - return reduction_scoper(target, identifier) + from loopy.library.reduction import ( + reduction_func_id_to_in_knl_callable_mapper) + return reduction_func_id_to_in_knl_callable_mapper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 59ca72df1c9f4e16ec94ee0d38eed30a9420f309..e59a892bb4c7b3bd7222bf61b29e0ade92195240 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -169,14 +169,14 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. 
""" - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable return (self.copy(), - program_callables_info) + callables_table) name = self.name target = kernel.target @@ -195,7 +195,7 @@ class Random123Callable(ScalarCallable): return ( self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=fn+"_gen"), - program_callables_info) + callables_table) elif name == fn + "_f32": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), @@ -203,7 +203,7 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table elif name == fn + "_f64": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), @@ -211,10 +211,10 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table return (self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def generate_preambles(self, target): rng_variant = FUNC_NAMES_TO_RNG[self.name] @@ -231,7 +231,7 @@ class Random123Callable(ScalarCallable): return -def random123_function_scoper(target, identifier): +def random123_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in FUNC_NAMES_TO_RNG: return Random123Callable(name=identifier) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ec8e4b219d93e717ebdc4a1965531c28171c84f..dd0e1e3e9e73792629dbefeafac63e6b70f9a4ef 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ 
-31,7 +31,7 @@ import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash class ReductionOperation(object): @@ -227,8 +227,7 @@ class ReductionOpFunction(FunctionIdentifier): hash_fields = ( "reduction_op",) - update_persistent_hash = LoopKernel.update_persistent_hash - + update_persistent_hash = update_persistent_hash # }}} @@ -286,7 +285,7 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): "which", "op",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): @@ -299,7 +298,7 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): "op", "base_reduction_class",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -355,7 +354,7 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArgMinReductionOperation(_ArgExtremumReductionOperation): @@ -367,7 +366,7 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -425,7 +424,7 @@ def parse_reduction_op(name): # {{{ reduction specific callables class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, @@ -437,15 +436,15 
@@ class ReductionCallable(ScalarCallable): index_dtype) + "_op" return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name_in_target), program_callables_info + name_in_target=name_in_target), callables_table - def with_descr(self, arg_id_to_descr, program_callables_info): + def with_descr(self, arg_id_to_descr, callables_table): from loopy.library.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): @@ -503,8 +502,8 @@ class ReductionCallable(ScalarCallable): return -def reduction_scoper(target, identifier): - if isinstance(identifier, (ArgExtOp, SegmentedOp)): +def reduction_func_id_to_in_knl_callable_mapper(target, identifier): + if isinstance(identifier, ReductionOpFunction): return ReductionCallable(name=identifier) return None diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f6e6181cb39b713c6ae0f4d727859de31e9b178a..aa536d7aec5394debf9ec69383101de149b1085a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -62,7 +62,7 @@ def prepare_for_caching(kernel): and not isinstance(dtype, OpaqueType) and dtype is not lp.auto and dtype.target is not tgt): - arg = arg.copy(dtype=dtype.with_target(kernel.target)) + arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -70,7 +70,7 @@ def prepare_for_caching(kernel): for name, temp in six.iteritems(kernel.temporary_variables): dtype = temp.dtype if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: - temp = temp.copy(dtype=dtype.with_target(tgt)) + temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt) new_temporary_variables[name] = temp @@ -897,7 +897,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def 
realize_reduction_for_single_kernel(kernel, program_callables_info, +def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* @@ -1019,7 +1019,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential - def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1137,7 +1137,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1377,7 +1377,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential scan - def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1466,7 +1466,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ local-parallel scan - def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): @@ -1475,7 +1475,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, assert scan_size > 0 if scan_size == 1: - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, 
rec, callables_table, nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1675,15 +1675,15 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ seq/par dispatch - def map_reduction(expr, rec, program_callables_info, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes, program_callables_info = ( + arg_dtypes, reduction_dtypes, callables_table = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, program_callables_info, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1792,7 +1792,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1800,7 +1800,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1821,12 +1821,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, if n_sequential: assert n_local_par == 0 - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) else: assert 
n_local_par > 0 return map_reduction_local( - expr, rec, program_callables_info, nresults, arg_dtypes, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) # }}} @@ -1861,12 +1861,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, - program_callables_info=program_callables_info, + callables_table=callables_table, nresults=nresults) else: new_expressions = ( cb_mapper(insn.expression, - program_callables_info=program_callables_info),) + callables_table=callables_table),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1960,10 +1960,10 @@ def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1976,9 +1976,9 @@ def realize_reduction(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} @@ -2157,18 +2157,15 @@ def check_atomic_loads(kernel): class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ - Returns a set of instances of :class:`tuple` (expr, - in_kernel_callable). 
The mapped `in_kernel_callable` of the - :class:`InKernelCallable` are descriptor specialized for the given - arguments. + Infers the :attr:`loopy` """ def __init__(self, rule_mapping_context, caller_kernel, - program_callables_info): + callables_table): super(ArgDescrInferenceMapper, self).__init__( rule_mapping_context) self.caller_kernel = caller_kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs @@ -2209,12 +2206,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description - in_knl_callable = self.program_callables_info[expr.function.name] - new_in_knl_callable, self.program_callables_info = ( + in_knl_callable = self.callables_table[expr.function.name] + new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( - combined_arg_id_to_descr, self.program_callables_info)) - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable( + combined_arg_id_to_descr, self.callables_table)) + self.callables_table, new_func_id = ( + self.callables_table.with_callable( expr.function.function, new_in_knl_callable)) @@ -2258,44 +2255,53 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return kernel.copy(instructions=new_insns) -def traverse_to_infer_arg_descr(kernel, program_callables_info): +def traverse_to_infer_arg_descr(kernel, callables_table): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. - """ - # FIXME: update this docs, once the design is finalized + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. 
+ """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, - kernel, program_callables_info) + kernel, callables_table) descr_inferred_kernel = rule_mapping_context.finish_kernel( arg_descr_inf_mapper.map_kernel(kernel)) - return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table def infer_arg_descr(program): - root_kernel_callable = program.program_callables_info[program.name] - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ + root_kernel_callable = program.callables_table[program.name] + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) root_kernel = program.root_kernel - new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( - root_kernel, program_callables_info) + new_root_kernel, callables_table = traverse_to_infer_arg_descr( + root_kernel, callables_table) new_root_kernel_callable = root_kernel_callable.copy( subkernel=new_root_kernel) - program_callables_info, _ = program_callables_info.with_callable(program.name, + callables_table, _ = callables_table.with_callable(program.name, new_root_kernel_callable) - program_callables_info = program_callables_info.with_exit_edit_callables_mode() + callables_table = callables_table.with_exit_edit_callables_mode( + old_callables_count) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -2305,7 +2311,7 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def 
preprocess_single_kernel(kernel, program_callables_info, device=None): +def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2363,7 +2369,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. kernel = realize_reduction_for_single_kernel(kernel, - program_callables_info, unknown_types_ok=False) + callables_table, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2408,80 +2414,93 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): return kernel -def preprocess_kernel(kernel, device=None): - # FIXME: error message? - return preprocess_program(kernel, device) +# {{{ hw axes inference + +def infer_hw_axes_sizes(program): + """ + Returns copy of *program* with the hardware axes sizes inferred. + + .. note:: + + - Firstly, computes the collective hardware axes sizes from all the + callable kernels. + - Then, overrides the grid sizes of all the callable kernels to the + collective value. + """ + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_inferred = {} + + for func_id, in_knl_callable in ( + program.callables_table.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_callables_table = ( + program.callables_table.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) + + return program.copy(callables_table=new_callables_table) + +# }}} def preprocess_program(program, device=None): if device is not None: + # FIXME: Time to remove this? 
(Git blame shows 5 years ago) from warnings import warn warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) program = infer_unknown_types(program, expect_completion=False) - # {{{ preprocess the root kernel + # {{{ preprocess callable kernels # Callable editing restrictions: # - # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` - # as we are iterating over it. + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] # - # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = preprocess_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, device) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("Unknown type of callable %s." % ( + raise NotImplementedError("Unknown callable type %s." % ( type(in_knl_callable).__name__)) new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - program = program.copy(program_callables_info=new_program_callables_info) + program = program.copy(callables_table=new_callables_table) # }}} # infer arg descrs of the callables program = infer_arg_descr(program) - # {{{ hw axes inference + program = infer_hw_axes_sizes(program) - # FIXME: think of wrapping this in a function? 
- - local_size, global_size = program.get_grid_size_upper_bounds() - - resolved_function_with_hw_axes_sizes_set = {} - - for func_id, in_knl_callable in ( - program.program_callables_info.items()): - if func_id == program.name: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable) - else: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - - new_program_callables_info = ( - program.program_callables_info.copy( - resolved_functions=resolved_function_with_hw_axes_sizes_set)) - - program = program.copy(program_callables_info=new_program_callables_info) + return program - # }}} - return program +# FIXME: Do we add a deprecation warning? +preprocess_kernel = preprocess_program # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py index 096bd1eca86c130a488fbbbea2527f17315ac731..c8534f0511353da45977ab282df18a585b63e632 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -29,12 +29,31 @@ from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from functools import wraps -from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleExpander) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash +from collections import Counter +from 
pymbolic.primitives import Call, CallWithKwargs + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: Program +.. autoclass:: CallablesTable + +.. autofunction:: make_program +.. autofunction:: iterate_over_kernels_if_given_program + +""" class ResolvedFunctionMarker(RuleAwareIdentityMapper): @@ -55,12 +74,11 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg function_ids: A container with instances of :class:`str` indicating the function identifiers to look for while scoping functions. """ - def __init__(self, rule_mapping_context, kernel, program_callables_info, + def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel - self.program_callables_info = program_callables_info - # FIXME: function_resolvesrs looks like a very bad name change it + self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) @@ -71,7 +89,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg:`identifier` is known to any kernel function scoper, otherwise returns *None*. 
""" - # FIXME change docs for func_id_to_in_knl_callable_mapper in ( self.function_id_to_in_knl_callable_mappers): # fixme: do we really need to given target for the function @@ -83,7 +100,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return None def map_call(self, expr, expn_state): - from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import parse_tagged_name name, tag = parse_tagged_name(expr.function) @@ -108,9 +124,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # associate the newly created ResolvedFunction with the # resolved in-kernel callable - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable(expr.function, - in_knl_callable, True)) + self.callables_table, new_func_id = ( + self.callables_table.with_added_callable( + expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) @@ -129,56 +145,104 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): expr.operation.get_scalar_callables()): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None - self.program_callables_info, _ = ( - self.program_callables_info.with_callable(func_id, - in_knl_callable, True)) + self.callables_table, _ = ( + self.callables_table.with_added_callable(func_id, + in_knl_callable)) return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) -def initialize_program_callables_info_from_kernel( - kernel, func_id_to_kernel_callable_mappers): - program_callables_info = ProgramCallablesInfo({}) - program_callables_info = program_callables_info.with_edit_callables_mode() +def _default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by default. 
+ """ + from loopy.library.function import ( + loopy_specific_callable_func_id_to_knl_callable_mappers) + return ( + [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( + target.get_device_ast_builder().function_id_in_knl_callable_mapper( + ))) + + +def initialize_callables_table_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.CallablesTable`, by resolving + the functions based on :mod:`loopy`'s default function resolvers. + """ + # collect the default function resolvers + func_id_to_kernel_callable_mappers = ( + _default_func_id_to_kernel_callable_mappers(kernel.target)) + callables_table = CallablesTable({}) from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, + rule_mapping_context, kernel, callables_table, func_id_to_kernel_callable_mappers) - # scoping fucntions and collecting the scoped functions + # mark the functions as "Resolved" in the expression nodes. 
kernel_with_functions_resolved = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - program_callables_info = resolved_function_marker.program_callables_info + # collect the updated callables_table + callables_table = resolved_function_marker.callables_table callable_kernel = CallableKernel(kernel_with_functions_resolved) - program_callables_info, _ = program_callables_info.with_callable( - Variable(kernel.name), callable_kernel, True) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - return program_callables_info + # add the callable kernel to the callables_table + callables_table, _ = callables_table.with_added_callable( + Variable(kernel.name), callable_kernel) + + return callables_table # {{{ program definition class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: name + + An instance of :class:`str`, also the name of the top-most level + :class:`loopy.LoopKernel`. + + .. attribute:: callables_table + + An instance of :class:`loopy.program.CallablesTable`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + .. attribute:: func_id_to_in_knl_callable_mappers + + A list of functions of the signature ``(target: TargetBase, + function_identifier: str)`` that would return an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommended to + go through :func:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable; any modifications should be done through :meth:`copy`. + + .. automethod:: __init__ + .. 
automethod:: with_root_kernel + """ def __init__(self, name, - program_callables_info, + callables_table, target, func_id_to_in_knl_callable_mappers): - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) - # FIXME: check if all sanity checks have been covered? - # FIXME: The comments over here may need some attention. - assert name in program_callables_info + assert name in callables_table super(Program, self).__init__( name=name, - program_callables_info=program_callables_info, + callables_table=callables_table, target=target, func_id_to_in_knl_callable_mappers=( func_id_to_in_knl_callable_mappers)) @@ -187,18 +251,19 @@ class Program(ImmutableRecord): hash_fields = ( "name", - "program_callables_info", + "callables_table", "target",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash def copy(self, **kwargs): if 'target' in kwargs: + # target attribute of all the callable kernels should be updated. target = kwargs['target'] new_self = super(Program, self).copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( - new_self.program_callables_info.items()): + new_self.callables_table.items()): if isinstance(in_knl_callable, CallableKernel): subkernel = in_knl_callable.subkernel new_resolved_functions[func_id] = in_knl_callable.copy( @@ -206,11 +271,11 @@ class Program(ImmutableRecord): else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = new_self.program_callables_info.copy( + callables_table = new_self.callables_table.copy( resolved_functions=new_resolved_functions) return super(Program, new_self).copy( - program_callables_info=program_callables_info) + callables_table=callables_table) else: return super(Program, self).copy(**kwargs) @@ -221,7 +286,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
""" return self.root_kernel.get_grid_size_upper_bounds( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): @@ -231,7 +296,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :mod:`pymbolic` expressions """ return self.root_kernel.get_grid_size_upper_bounds_as_exprs( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) # {{{ implementation arguments @@ -266,27 +331,53 @@ class Program(ImmutableRecord): @property def root_kernel(self): - return self.program_callables_info[self.name].subkernel + """ + Returns an instance of :class:`loopy.LoopKernel` denoting the topmost + level kernel. + + .. note:: + + Syntactic sugar. + """ + return self.callables_table[self.name].subkernel @property def arg_dict(self): + """ + Returns ``arg_dict`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ return self.root_kernel.arg_dict + @property + def args(self): + """ + Returns ``args`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ + return self.root_kernel.args[:] + def with_root_kernel(self, root_kernel): - new_in_knl_callable = self.program_callables_info[ + """ + Returns a copy of *self* with the topmost level kernel as + *root_kernel*. 
+ """ + new_in_knl_callable = self.callables_table[ self.name].copy(subkernel=root_kernel) new_resolved_functions = ( - self.program_callables_info.resolved_functions.copy()) + self.callables_table.resolved_functions.copy()) new_resolved_functions[self.name] = new_in_knl_callable return self.copy( - program_callables_info=self.program_callables_info.copy( + callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) - @property - def args(self): - return self.root_kernel.args[:] - def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: @@ -298,51 +389,46 @@ class Program(ImmutableRecord): return pex(*args, **kwargs) def __str__(self): - # FIXME: make this better - print(self.program_callables_info.num_times_callables_called) - return ( - (self.program_callables_info[ - self.name].subkernel).__str__() + - '\nResolved Functions: ' + - (self.program_callables_info.resolved_functions.keys()).__str__() + - '\n' + 75*'-' + '\n') + return self.root_kernel.__str__() # }}} -def next_indexed_function_identifier(function): +def next_indexed_function_identifier(function_id): """ Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + *Example:* ``'sin_0'`` will return ``'sin_1'``. - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. + :arg function_id: An instance of :class:`str`. 
""" - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): - return function.copy() - elif isinstance(function, str): - function = Variable(function) - assert isinstance(function, Variable) + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$") - match = func_name.match(function.name) + match = func_name.match(function_id) if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) + if function_id[-1] == '_': + return "{old_name}0".format(old_name=function_id) else: - return "{old_name}_0".format(old_name=function.name) + return "{old_name}_0".format(old_name=function_id) return "{alpha}_{num}".format(alpha=match.group('alpha'), num=int(match.group('num'))+1) class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. + """ def __init__(self, rule_mapping_context, renaming_dict): super(ResolvedFunctionRenamer, self).__init__( rule_mapping_context) @@ -358,6 +444,10 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): def rename_resolved_functions_in_a_single_kernel(kernel, renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renamed according to *renaming_dict*. + """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) @@ -368,188 +458,421 @@ def rename_resolved_functions_in_a_single_kernel(kernel, resolved_function_renamer.map_kernel(kernel))) +# {{{ counting helpers + +class CallablesCountingMapper(CombineMapper): + """ + Returns an instance of :class:`collections.Counter` with the count of + callables registered in *callables_table*. + + .. attribute:: callables_table + + An instance of :class:`loopy.program.CallablesTable`. 
+ """ + def __init__(self, callables_table): + self.callables_table = callables_table + + def combine(self, values): + return sum(values, Counter()) + + def map_call(self, expr): + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + if isinstance(expr.function, (ResolvedFunction)): + in_knl_callable = self.callables_table[expr.function.name] + if isinstance(in_knl_callable, ScalarCallable): + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + elif isinstance(in_knl_callable, CallableKernel): + + # callable kernels have more callables in them. + callables_count_in_subkernel = ( + count_callables_in_kernel( + in_knl_callable.subkernel, + self.callables_table)) + + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + ( + callables_count_in_subkernel) + else: + raise NotImplementedError("Unknown callable type %s." % ( + type)) + else: + return ( + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call + + def map_reduction(self, expr): + return Counter(expr.operation.get_scalar_callables()) + ( + super(CallablesCountingMapper, self).map_reduction(expr)) + + def map_constant(self, expr): + return Counter() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +@memoize_method +def count_callables_in_kernel(kernel, callables_table): + """ + Returns an instance of :class:`collections.Counter` representing the number + of callables in the *kernel* that are registered in + *callables_table*. 
+ """ + assert isinstance(kernel, LoopKernel) + callables_count = Counter() + callables_counting_mapper = CallablesCountingMapper( + callables_table) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_count += ( + callables_counting_mapper(subst_expander( + insn.expression))) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction type %s." % ( + type(insn))) + + return callables_count + +# }}} + + # {{{ program callables info -class ProgramCallablesInfo(ImmutableRecord): - # FIXME: dont evalutate num_times_called, rahter compute it from the - # resolved_functions - # FIXME: make the edit callables thing a ContextManager. - def __init__(self, resolved_functions, num_times_callables_called=None, - history=None, is_being_edited=False, - num_times_hit_during_editing={}, - renames_needed_after_editing={}): - - if num_times_callables_called is None: - num_times_callables_called = dict((func_id, 1) for func_id in - resolved_functions) +class CallablesTable(ImmutableRecord): + # FIXME: is CallablesTable a better name?(similar to symbol table in + # compilers.) + """ + Records the information of all the callables called in a :class:`loopy.Program`. + + .. attribute:: resolved_functions + + An instance of :class:`dict` that contains a mapping from function + identifier to instances of + :class:`loopy.kernel.function_interface.InKernelCallable` + + .. attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to an instance of :class:`list` that would contain all the + names taken by a function before the current name. (For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) + + .. 
attribute:: is_being_edited + + An instance of :class:`bool` which is intended to aid the working of + :meth:`with_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + + .. automethod:: __init__ + .. automethod:: callables_count + .. automethod:: with_added_callable + .. automethod:: with_edit_callables_mode + .. automethod:: with_callable + .. automethod:: with_exit_edit_callables_mode + """ + def __init__(self, resolved_functions, + history=None, is_being_edited=False): + if history is None: - history = dict((func_id, set([func_id])) for func_id in + history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) - super(ProgramCallablesInfo, self).__init__( + super(CallablesTable, self).__init__( resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, history=history, - is_being_edited=is_being_edited, - num_times_hit_during_editing=num_times_hit_during_editing, - renames_needed_after_editing=renames_needed_after_editing) + is_being_edited=is_being_edited) hash_fields = ( "resolved_functions", - "num_times_callables_called", "is_being_edited", - "num_times_hit_during_editing", - "renames_needed_after_editing", "history") - update_persistent_hash = LoopKernel.update_persistent_hash + def __hash__(self): + return hash(( + frozenset(six.iteritems(self.resolved_functions)), + frozenset(six.iteritems(self.history)), + self.is_being_edited + )) + + update_persistent_hash = update_persistent_hash + + @property + @memoize_method + def callables_count(self): + """ + Returns an instance of :class:`collections.Counter` representing the number + of times each callable is called in callables_table. 
+ """ + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(self[ + root_kernel_name].subkernel, self)) + + return callables_count + + # {{{ interface to perform edits on callables + + def with_added_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` of ``(new_self, new_function)``. + ``new_self`` is a copy of *self* with the *function* associated with the + *in_kernel_callable*. ``new_function`` is the function identifier that + should be noted in the expression node so that it could be associated + with an instance of :class:`InKernelCallable`. + + .. note:: + + - Always checks whether the + :attr:``loopy.CallablesTable.resolved_functions` has + *in_kernel_callable*, does not introduce copies. + + - The difference between + :meth:`loopy.CallablesTable.with_added_callable` + and :meth:`CallablesTable.with_callable` being that + the former has no support for renaming the callable back i.e. + ``with_callable`` supports renaming from ``sin_0`` to ``sin``, + if possible, through the member method + ``loopy.CallablesTable.with_exit_edit_callables_mode`` + + This subtle difference makes -- + + - :meth:`loopy.CallablesTable.with_added_callable` suitable + for usage while resolving the functions first time, where no + renaming is needed. + + - :meth:`loopy.CallablesTable.with_callable` suitable for + implementing edits in callables during inference-walks. 
+ """ + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresponding to that callable. + for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | frozenset([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = frozenset( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) def with_edit_callables_mode(self): - return self.copy(is_being_edited=True, - num_times_hit_during_editing=dict((func_id, 0) for func_id in - 
self.resolved_functions)) + """ + Returns a copy of *self* for a walk traversal through all the callables. + """ + return self.copy( + is_being_edited=True) - def with_callable(self, function, in_kernel_callable, - resolved_for_the_first_time=False): + def with_callable(self, function, in_kernel_callable): """ + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Also refer -- :meth:`loopy.CallablesTable.with_added_callable` + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. - :arg in_kernel_callables: An instance of + :arg in_kernel_callable: An instance of :class:`loopy.InKernelCallable`. .. note:: - Assumes that each callable is touched atmost once, the internal - working of this function fails if that is violated. + - Use :meth:`with_added_callable` if a callable is being resolved for the + first time. """ - # FIXME: add a note about using enter and exit. ~KK - # FIXME: think about a better idea of "with_added_callable" this would - # be more convenient for developer-faced usage. ~KK + + # {{{ non-edit mode if not self.is_being_edited: if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): + # if not being edited, check that the given function is + # equal to the old version of the callable. 
return self, function else: print('Old: ', self.resolved_functions[function.name]) print('New: ', in_kernel_callable) - raise LoopyError("Use 'enter_edit_callables_mode' first.") + raise LoopyError("Use 'with_enter_edit_callables_mode' first.") - from loopy.library.reduction import ArgExtOp, SegmentedOp + # }}} # {{{ sanity checks if isinstance(function, str): function = Variable(function) - assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) + assert isinstance(function, (Variable, ReductionOpFunction)) # }}} - renames_needed_after_editing = self.renames_needed_after_editing.copy() - num_times_hit_during_editing = self.num_times_hit_during_editing.copy() - num_times_callables_called = self.num_times_callables_called.copy() history = self.history.copy() - if not resolved_for_the_first_time: - if isinstance(function, (ArgExtOp, SegmentedOp)): - num_times_hit_during_editing[function] += 1 - else: - num_times_hit_during_editing[function.name] += 1 - - if isinstance(function, (ArgExtOp, SegmentedOp)): - unique_function_identifier = function.copy() - if not resolved_for_the_first_time: - num_times_callables_called[function] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - - return ( - self.copy( - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=( - num_times_hit_during_editing), - renames_needed_after_editing=( - renames_needed_after_editing)), - unique_function_identifier) - if in_kernel_callable in self.resolved_functions.values(): - # the callable already exists, implies return the function - # identifier corresposing to that callable. + + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - num_times_callables_called[func_id] += 1 - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - if num_times_callables_called[function.name] == 0: - renames_needed_after_editing[func_id] = function.name - - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( - history=history, - num_times_hit_during_editing=( - num_times_hit_during_editing), - num_times_callables_called=( - num_times_callables_called), - renames_needed_after_editing=( - renames_needed_after_editing)), + history=history), func_id) else: - # FIXME: maybe deal with the history over here? - # FIXME: once the code logic is running beautify this part. - # many "ifs" can be avoided + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} unique_function_identifier = function.name - if (resolved_for_the_first_time or - self.num_times_callables_called[function.name] > 1): + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # do not rename root kernel + pass + else: while unique_function_identifier in self.resolved_functions: unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( 
in_kernel_callable) - if not resolved_for_the_first_time: - history[unique_function_identifier] = ( - history[function.name] | set([unique_function_identifier])) - else: - history[unique_function_identifier] = set( - [unique_function_identifier]) + history[unique_function_identifier] = ( + history[function.name] | frozenset([unique_function_identifier])) return ( self.copy( history=history, - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=num_times_hit_during_editing, - renames_needed_after_editing=renames_needed_after_editing), + resolved_functions=updated_resolved_functions), Variable(unique_function_identifier)) - def with_exit_edit_callables_mode(self): + def with_exit_edit_callables_mode(self, old_callables_count): + """ + Returns a copy of *self* with renaming of the callables done whenever + possible. + + *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, + then all the renaming is done such that one of flavors of the callable + is renamed back to ``sin``. + """ + assert self.is_being_edited - num_times_callables_called = {} - resolved_functions = {} - history = self.history.copy() + new_callables_count = self.callables_count - for func_id, in_knl_callable in self.resolved_functions.items(): + # {{{ calculate the renames needed + + renames_needed = {} + for old_func_id in old_callables_count-new_callables_count: + # this implies that all the function instances having the name + # "func_id" have been renamed to something else. 
+ for new_func_id in ( + six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + if old_func_id in self.history[new_func_id]: + renames_needed[new_func_id] = old_func_id + break + # }}} + + new_resolved_functions = {} + new_history = {} + + for func_id in new_callables_count: + in_knl_callable = self.resolved_functions[func_id] if isinstance(in_knl_callable, CallableKernel): + # if callable kernel, perform renames inside its expressions. old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, self.renames_needed_after_editing) + old_subkernel, renames_needed) in_knl_callable = ( in_knl_callable.copy(subkernel=new_subkernel)) elif isinstance(in_knl_callable, ScalarCallable): @@ -558,45 +881,23 @@ class ProgramCallablesInfo(ImmutableRecord): raise NotImplementedError("Unknown callable type %s." % type(in_knl_callable).__name__) - if func_id in self.renames_needed_after_editing: - history.pop(func_id) - - new_func_id = self.renames_needed_after_editing[func_id] - resolved_functions[new_func_id] = ( + if func_id in renames_needed: + new_func_id = renames_needed[func_id] + new_resolved_functions[new_func_id] = ( in_knl_callable) - num_times_callables_called[new_func_id] = ( - self.num_times_callables_called[func_id]) - + new_history[new_func_id] = self.history[func_id] else: - resolved_functions[func_id] = in_knl_callable - num_times_callables_called[func_id] = ( - self.num_times_callables_called[func_id]) + new_resolved_functions[func_id] = in_knl_callable + new_history[func_id] = self.history[func_id] return self.copy( is_being_edited=False, - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing={}, - renames_needed_after_editing={}) - - def with_deleted_callable(self, func_id, instances=1): - num_times_callables_called = self.num_times_callables_called.copy() - history = self.history.copy() - resolved_functions = 
self.resolved_functions.copy() - - assert instances <= num_times_callables_called[func_id] + resolved_functions=new_resolved_functions, + history=new_history) - num_times_callables_called[func_id] -= instances + # }}} - if num_times_callables_called[func_id] == 0: - num_times_callables_called.pop(func_id) - history.pop(func_id) - resolved_functions.pop(func_id) - - return self.copy( - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - history=history) + # {{{ behave like a dict(syntactic sugar) def __getitem__(self, item): return self.resolved_functions[item] @@ -605,44 +906,53 @@ class ProgramCallablesInfo(ImmutableRecord): return item in self.resolved_functions def items(self): - return self.resolved_functions.items() + return six.iteritems(self.resolved_functions) def values(self): - return self.resolved_functions.values() - + return six.itervalues(self.resolved_functions) -# }}} + def keys(self): + return six.iterkeys(self.resolved_functions) + # }}} -def default_func_id_to_kernel_callable_mappers(target): +# }}} - from loopy.library.function import loopy_specific_callable_scopers - return ( - [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers())) +# {{{ helper functions -def make_program_from_kernel(kernel): +def make_program(kernel): + """ + Returns an instance of :class:`loopy.Program` with the *kernel* as the root + kernel. 
+ """ - program_callables_info = initialize_program_callables_info_from_kernel(kernel, - default_func_id_to_kernel_callable_mappers(kernel.target)) + # get the program callables info + callables_table = initialize_callables_table_from_kernel(kernel) + # get the program from program callables info program = Program( name=kernel.name, - program_callables_info=program_callables_info, + callables_table=callables_table, func_id_to_in_knl_callable_mappers=( - default_func_id_to_kernel_callable_mappers(kernel.target)), + _default_func_id_to_kernel_callable_mappers(kernel.target)), target=kernel.target) return program def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. + """ def _collective_transform(program_or_kernel, *args, **kwargs): if isinstance(program_or_kernel, Program): program = program_or_kernel new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = transform_for_single_kernel( in_knl_callable.subkernel, *args, **kwargs) @@ -657,9 +967,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) else: assert isinstance(program_or_kernel, LoopKernel) kernel = program_or_kernel @@ -667,17 +977,6 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): 
return wraps(transform_for_single_kernel)(_collective_transform) - -# {{{ ingoring this for now - -# if False and isinstance(function, (ArgExtOp, SegmentedOp)): -# FIXME: ignoring this casse for now -# FIXME: If a kernel has two flavors of ArgExtOp then they are -# overwritten and hence not supported.(for now). -# updated_resolved_functions = self.scoped_functions.copy() -# updated_resolved_functions[function] = in_kernel_callable -# return self.copy(updated_resolved_functions), function.copy() - # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 201bcc2562754a30d20d3903ecf01d0d8984b11e..3dc1c0bbe71e43c134360650c134e5d6d9940cc7 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -794,9 +794,13 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: - print("instruction '%s' is missing insn depedencies '%s'" % ( - format_insn(kernel, insn.id), ",".join( - insn.depends_on - sched_state.scheduled_insn_ids))) + # These are not that interesting when understanding scheduler + # failures. + + # print("instruction '%s' is missing insn depedencies '%s'" % ( + # format_insn(kernel, insn.id), ",".join( + # insn.depends_on - sched_state.scheduled_insn_ids))) + pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames @@ -1832,7 +1836,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. 
warning:: @@ -1846,18 +1850,18 @@ def generate_loop_schedules(kernel, program_callables_info, debug_args={}): with MinRecursionLimitForScheduling(kernel): for sched in generate_loop_schedules_inner(kernel, - program_callables_info, debug_args=debug_args): + callables_table, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel, program_callables_info) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -1971,7 +1975,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}) kernel, gen_sched) gsize, lsize = ( - kernel.get_grid_size_upper_bounds(program_callables_info)) + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2028,7 +2032,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel, program_callables_info): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2038,10 +2042,10 @@ def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. 
- return next(iter(generate_loop_schedules(kernel, program_callables_info))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel, program_callables_info): +def get_one_scheduled_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2060,7 +2064,7 @@ def get_one_scheduled_kernel(kernel, program_callables_info): with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): result = _get_one_scheduled_kernel_inner(kernel, - program_callables_info) + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 08b7f89e91d1997a5583923571fbfa0140bb591d..c621ea727a0c25348370e78ccac08935ee2d18e6 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -32,8 +32,10 @@ from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError -from pytools import Record +from pytools import Record, memoize_method from loopy.kernel.function_interface import ScalarCallable, CallableKernel +from loopy.kernel import LoopKernel +from loopy.program import make_program __doc__ = """ @@ -61,12 +63,12 @@ __doc__ = """ # FIXME: this is broken for the callable kernel design. -# Qns: -# - The variable name, what if multiple kernels use the same name? +# - The variable name, what if multiple kernels use the same name?(needs a +# different MemAccessInfo) # - We should also add the cumulative effect on the arguments of callee kernels -# into the caller kernel. -# FIXME: add an error that there is only one callable kernel. disable for -# multiple callable kernels. +# into the caller kernel +# - Make changes to MemAccessInfo to include the effect of several kernels. +# - Renovate `count`. 
# {{{ GuardedPwQPolynomial @@ -648,11 +650,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, program_callables_info) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -707,11 +709,12 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table, count_within_subscripts=True): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table + self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, program_callables_info) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -725,7 +728,7 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): from loopy.symbolic import ResolvedFunction if isinstance(expr.function, ResolvedFunction): - function_identifier = self.program_callables_info[ + function_identifier = self.callables_table[ expr.function.name].name else: function_identifier = expr.function.name @@ -733,18 +736,21 @@ class ExpressionOpCounter(CounterBase): return ToCountMap( {Op(dtype=self.type_inf(expr), name='func:'+function_identifier, - count_granularity=CountGranularity.WORKITEM): 1} + count_granularity=CountGranularity.SUBGROUP): 1} ) + self.rec(expr.parameters) def map_subscript(self, expr): - return self.rec(expr.index) + if self.count_within_subscripts: + return self.rec(expr.index) + else: + return ToCountMap() def 
map_sum(self, expr): assert expr.children return ToCountMap( {Op(dtype=self.type_inf(expr), name='add', - count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1} ) + sum(self.rec(child) for child in expr.children) @@ -753,18 +759,18 @@ class ExpressionOpCounter(CounterBase): assert expr.children return sum(ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.WORKITEM): 1}) + count_granularity=CountGranularity.SUBGROUP): 1}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ ToCountMap({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.WORKITEM): -1}) + count_granularity=CountGranularity.SUBGROUP): -1}) def map_quotient(self, expr, *args): return ToCountMap({Op(dtype=self.type_inf(expr), name='div', - count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -774,14 +780,14 @@ class ExpressionOpCounter(CounterBase): def map_power(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='pow', - count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='shift', - count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) @@ -790,13 +796,13 @@ class ExpressionOpCounter(CounterBase): def map_bitwise_not(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.WORKITEM): 1}) \ + count_granularity=CountGranularity.SUBGROUP): 1}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='bw', - 
count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -820,7 +826,7 @@ class ExpressionOpCounter(CounterBase): def map_min(self, expr): return ToCountMap({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity=CountGranularity.WORKITEM): + count_granularity=CountGranularity.SUBGROUP): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -936,7 +942,7 @@ class LocalMemAccessCounter(MemAccessCounter): sub_map[MemAccess( mtype='local', dtype=dtype, - count_granularity=CountGranularity.WORKITEM) + count_granularity=CountGranularity.SUBGROUP) ] = 1 return sub_map @@ -956,7 +962,7 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, - count_granularity=CountGranularity.WORKITEM)] = 1 + count_granularity=CountGranularity.SUBGROUP)] = 1 return sub_map @@ -1111,7 +1117,7 @@ def count(kernel, set, space=None): from loopy.program import Program if isinstance(kernel, Program): if len([in_knl_callable for in_knl_callable in - kernel.program_callables_info.values() if isinstance(in_knl_callable, + kernel.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1216,10 +1222,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, program_callables_info, insn, +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1257,7 +1263,7 @@ def get_unused_hw_axes_factor(knl, program_callables_info, 
insn, return add_assumptions_guard(knl, result) -def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, +def count_insn_runs(knl, callables_table, insn, count_redundant_work, disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1278,39 +1284,100 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + unused_fac = get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: return c + +@memoize_method +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): + insn = knl.id_to_insn[insn_id] + + if count_granularity is None: + warn_with_kernel(knl, "get_insn_count_assumes_granularity", + "get_insn_count: No count granularity passed, " + "assuming %s granularity." 
+ % (CountGranularity.WORKITEM)) + count_granularity == CountGranularity.WORKITEM + + if count_granularity == CountGranularity.WORKITEM: + return count_insn_runs( + knl, callables_table, insn, + count_redundant_work=count_redundant_work, + disregard_local_axes=False) + + ct_disregard_local = count_insn_runs( + knl, callables_table, insn, disregard_local_axes=True, + count_redundant_work=count_redundant_work) + + if count_granularity == CountGranularity.WORKGROUP: + return ct_disregard_local + elif count_granularity == CountGranularity.SUBGROUP: + # get the group size + from loopy.symbolic import aff_to_expr + _, local_size = knl.get_grid_size_upper_bounds(callables_table) + workgroup_size = 1 + if local_size: + for size in local_size: + s = aff_to_expr(size) + if not isinstance(s, int): + raise LoopyError("Cannot count insn with %s granularity, " + "work-group size is not integer: %s" + % (CountGranularity.SUBGROUP, local_size)) + workgroup_size *= s + + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", + "get_insn_count: when counting instruction %s with " + "count_granularity=%s, using upper bound for work-group size " + "(%d work-items) to compute sub-groups per work-group. When " + "multiple device programs present, actual sub-group count may be" + "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size)) + + from pytools import div_ceil + return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: + # this should not happen since this is enforced in Op/MemAccess + raise ValueError("get_insn_count: count_granularity '%s' is" + "not allowed. 
count_granularity options: %s" + % (count_granularity, CountGranularity.ALL+[None])) + # }}} # {{{ get_op_map - -def get_op_map_for_single_kernel(knl, program_callables_info, +def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + op_map = ToCountMap() + op_counter = ExpressionOpCounter(knl, callables_table, + count_within_subscripts) + from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, NoOpInstruction, BarrierInstruction) - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, - program_callables_info=program_callables_info) for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs( - knl, program_callables_info, insn, - count_redundant_work=count_redundant_work) + for key, val in six.iteritems(ops.count_map): + op_map = ( + op_map + + ToCountMap({key: val}) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass else: @@ -1333,7 +1400,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info, def get_op_map(program, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1349,6 +1416,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) 
+ :arg count_within_subscripts: A :class:`bool` specifying whether to + count operations inside array indices. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within @@ -1390,23 +1460,26 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, """ + if isinstance(program, LoopKernel): + program = make_program(program) + from loopy.preprocess import preprocess_program, infer_unknown_types program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) op_map = ToCountMap() - for func_id, in_knl_callable in program.program_callables_info.items(): + callables_count = ( + program.callables_table.callables_count) + + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.program_callables_info, numpy_types, - count_redundant_work, subgroup_size) + program.callables_table, numpy_types, count_redundant_work, + count_within_subscripts, subgroup_size) - for i in range(num_times_called): + for i in range(callables_count[func_id]): op_map += knl_op_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1433,21 +1506,16 @@ def _find_subgroup_size_for_knl(knl): return None -# {{{ get_mem_access_map - - -def get_access_map_for_single_kernel(knl, program_callables_info, - numpy_types=True, count_redundant_work=False, subgroup_size=None): +@memoize_method +def _process_subgroup_size(knl, subgroup_size_requested): - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." 
% knl.name) - - if not isinstance(subgroup_size, int): + if isinstance(subgroup_size_requested, int): + return subgroup_size_requested + else: # try to find subgroup_size subgroup_size_guess = _find_subgroup_size_for_knl(knl) - if subgroup_size is None: + if subgroup_size_requested is None: if subgroup_size_guess is None: # 'guess' was not passed and either no target device found # or get_simd_group_size returned None @@ -1457,84 +1525,39 @@ def get_access_map_for_single_kernel(knl, program_callables_info, "and kernel.target.device is set, or (3) pass " "subgroup_size='guess' and hope for the best.") else: - subgroup_size = subgroup_size_guess + return subgroup_size_guess - elif subgroup_size == 'guess': + elif subgroup_size_requested == 'guess': if subgroup_size_guess is None: # unable to get subgroup_size from device, so guess - subgroup_size = 32 - warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size", - "get_mem_access_map: 'guess' sub-group size " - "passed, no target device found, wildly guessing " - "that sub-group size is %d." % (subgroup_size)) + subgroup_size_guess = 32 + warn_with_kernel(knl, "get_x_map_guessing_subgroup_size", + "'guess' sub-group size passed, no target device " + "found, wildly guessing that sub-group size is %d." + % (subgroup_size_guess)) + return subgroup_size_guess else: - subgroup_size = subgroup_size_guess + return subgroup_size_guess else: raise ValueError("Invalid value for subgroup_size: %s. subgroup_size " "must be integer, 'guess', or, if you're feeling " - "lucky, None." % (subgroup_size)) + "lucky, None." 
% (subgroup_size_requested)) - class CacheHolder(object): - pass - cache_holder = CacheHolder() - from pytools import memoize_in - - @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM): - insn = knl.id_to_insn[insn_id] - - if count_granularity is None: - warn_with_kernel(knl, "get_insn_count_assumes_granularity", - "get_insn_count: No count granularity passed for " - "MemAccess, assuming %s granularity." - % (CountGranularity.WORKITEM)) - count_granularity == CountGranularity.WORKITEM - - if count_granularity == CountGranularity.WORKITEM: - return count_insn_runs( - knl, program_callables_info, insn, - count_redundant_work=count_redundant_work, - disregard_local_axes=False) - - ct_disregard_local = count_insn_runs( - knl, program_callables_info, insn, disregard_local_axes=True, - count_redundant_work=count_redundant_work) - - if count_granularity == CountGranularity.WORKGROUP: - return ct_disregard_local - elif count_granularity == CountGranularity.SUBGROUP: - # get the group size - from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) - workgroup_size = 1 - if local_size: - for size in local_size: - s = aff_to_expr(size) - if not isinstance(s, int): - raise LoopyError("Cannot count insn with %s granularity, " - "work-group size is not integer: %s" - % (CountGranularity.SUBGROUP, local_size)) - workgroup_size *= s - - warn_with_kernel(knl, "insn_count_subgroups_upper_bound", - "get_insn_count: when counting instruction %s with " - "count_granularity=%s, using upper bound for work-group size " - "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" - "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) - - from pytools import div_ceil - return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) - else: - # this should not happen since this is enforced in MemAccess - raise ValueError("get_insn_count: count_granularity '%s' is" - "not allowed. count_granularity options: %s" - % (count_granularity, CountGranularity.ALL+[None])) +# {{{ get_mem_access_map + +def get_mem_access_map_for_single_kernel(knl, callables_table, + numpy_types=True, count_redundant_work=False, subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + + subgroup_size = _process_subgroup_size(knl, subgroup_size) access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) - access_counter_l = LocalMemAccessCounter(knl, program_callables_info) + access_counter_g = GlobalMemAccessCounter(knl, callables_table) + access_counter_l = LocalMemAccessCounter(knl, callables_table) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1557,14 +1580,18 @@ def get_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, key.count_granularity)) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) for key, val in six.iteritems(access_assignee.count_map): access_map = ( access_map + ToCountMap({key: val}) - * get_insn_count(knl, insn.id, key.count_granularity)) + * _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1684,18 +1711,17 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() - for func_id, in_knl_callable in 
program.program_callables_info.items(): + callables_count = program.callables_table.callables_count + + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel - knl_access_map = get_access_map_for_single_kernel(knl, - program.program_callables_info, numpy_types, + knl_access_map = get_mem_access_map_for_single_kernel(knl, + program.callables_table, numpy_types, count_redundant_work, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): access_map += knl_access_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1705,13 +1731,12 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, return access_map - # }}} # {{{ get_synchronization_map -def get_synchronization_map_for_single_kernel(knl, program_callables_info, +def get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): """Count the number of synchronization events each work-item encounters in @@ -1757,7 +1782,7 @@ def get_synchronization_map_for_single_kernel(knl, program_callables_info, from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = lp.get_one_scheduled_kernel(knl, program_callables_info) + knl = lp.get_one_scheduled_kernel(knl, callables_table) iname_list = [] result = ToCountMap() @@ -1809,18 +1834,16 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( 
- program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_sync_map = get_synchronization_map_for_single_kernel(knl, - program.program_callables_info, subgroup_size) + program.callables_table, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): sync_map += knl_sync_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1874,7 +1897,7 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False) def gather_access_footprints(program, ignore_uncountable=False): # FIMXE: works only for one callable kernel till now. if len([in_knl_callable for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1887,18 +1910,17 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] - for func_id, in_knl_callable in program.program_callables_info.items(): + callables_count = program.callables_table.callables_count + + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_write_footprints, knl_read_footprints = ( gather_access_footprints_for_single_kernel(knl, ignore_uncountable)) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): write_footprints.extend(knl_write_footprints) read_footprints.extend(knl_read_footprints) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 54dd61966050de1149bff41f8845531b6c3687f8..5721c58effa6addd7cb79d1f373a9b27562cbd16 100644 --- 
a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -706,16 +706,16 @@ class ResolvedFunction(p.Expression): def __init__(self, function): if isinstance(function, str): function = p.Variable(function) - from loopy.library.reduction import ArgExtOp, SegmentedOp - assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) self.function = function @property def name(self): - from loopy.library.reduction import ArgExtOp, SegmentedOp + from loopy.library.reduction import ReductionOpFunction if isinstance(self.function, p.Variable): return self.function.name - elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + elif isinstance(self.function, ReductionOpFunction): return self.function else: raise LoopyError("Unexpected function type %s in ResolvedFunction." % @@ -1902,7 +1902,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, if shape is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) - except ExpressionToAffineConversionError as sub_err: + except ExpressionToAffineConversionError: pass if shape_aff is None: diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e3b4853c3878106896a736d9f49a23665b5ff81b..f27ee4e96f11f686250bddf57ec87422c717373e 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel, program_callables_info): + def pre_codegen_check(self, kernel, callables_table): pass # }}} @@ -150,7 +150,7 @@ class ASTBuilderBase(object): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): """ Returns an instance of list of the functions of signature ``(target, identifiers)`` returning either an instance of diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 
52437ef8024fbd08261522362494924cd0e23d1b..58051e42f8898d374ded6b1e7dcd70eb3989f100 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -364,7 +364,7 @@ class CMathCallable(ScalarCallable): C-Target. """ - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["abs", "min", "max"]: @@ -383,7 +383,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype @@ -411,7 +411,7 @@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) # binary functions if name in ["fmax", "fmin", "pow", "atan2"]: @@ -426,7 +426,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f" and name in ["fmax", "fmin"]: + elif dtype.kind == "f": from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: @@ -451,11 +451,11 @@ class CMathCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_c_math_functions(target, identifier): @@ -488,9 +488,9 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) - def function_scopers(self): + def 
function_id_in_knl_callable_mapper(self): return ( - super(CASTBuilder, self).function_scopers() + [ + super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} @@ -897,7 +897,7 @@ class CASTBuilder(ASTBuilderBase): ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.program_callables_info[func_id] + in_knl_callable = codegen_state.callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( in_knl_callable.name_in_target == 'loopy_make_tuple'): diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index b3c304d585f7282f5d8c6446e9503fecdb4b5a7e..d443739ac455d6a6d3f203286b84c2fe618e20ab 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -403,7 +403,7 @@ class CKernelExecutor(KernelExecutorBase): if self.program.root_kernel.options.write_cl: output = all_code if self.program.root_kernel.options.highlight_cl: - output = get_highlighted_code(code=output) + output = get_highlighted_code(output) if self.program.root_kernel.options.write_cl is True: print(output) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 2908c4efa4638ec8e35db781eb1c07997a795ace..97b6b43bbfc5b68ffa5bbb4af706d2f2caaf8121 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -55,7 +55,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): if type_inf_mapper is None: type_inf_mapper = TypeInferenceMapper(self.kernel, - self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -393,7 +393,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec identifier_name = ( - self.codegen_state.program_callables_info[expr.function.name].name) + 
self.codegen_state.callables_table[expr.function.name].name) if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -436,11 +436,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.codegen_state.program_callables_info[expr.function.name], + if isinstance(self.codegen_state.callables_table[expr.function.name], ManglerCallable): from loopy.codegen import SeenFunction in_knl_callable = ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name]) mangle_result = in_knl_callable.mangle_result(self.kernel) self.codegen_state.seen_functions.add( @@ -449,7 +449,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): mangle_result.arg_dtypes)) return ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name].emit_call( expression_to_code_mapper=self, expression=expr, diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 89cbfd034e6dfdddebd8811f3e2620c90cc285ad..6b4385bff2a580979eb1c551606760ee00b5e49a 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -123,7 +123,7 @@ _CUDA_SPECIFIC_FUNCTIONS = { class CudaCallable(ScalarCallable): def cuda_with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): name = self.name @@ -138,7 +138,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] @@ -146,7 +146,7 @@ class CudaCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CUDA_SPECIFIC_FUNCTIONS: num_args = 
_CUDA_SPECIFIC_FUNCTIONS[name] @@ -161,7 +161,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -177,11 +177,11 @@ class CudaCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_cuda_functions(target, identifier): @@ -274,9 +274,9 @@ class CUDACASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return [scope_cuda_functions] + ( - super(CUDACASTBuilder, self).function_scopers()) + super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) # }}} @@ -303,7 +303,7 @@ class CUDACASTBuilder(CASTBuilder): codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder): _VEC_AXES = "xyzw" def add_vector_access(self, access_expr, index): - return access_expr.a(self._VEC_AXES[index]) + return access_expr.attr(self._VEC_AXES[index]) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 43963ddb2f47fb80475b7060b7929cf21d34142e..c067bc4b901ef3922236ac3dc1a8245aaf28c8aa 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -763,7 +763,7 @@ class KernelExecutorBase(object): from loopy.schedule import get_one_scheduled_kernel program = program.with_root_kernel( get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info)) + 
program.callables_table)) return program diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 5396318338e4b36148d4397390a33d024283150d..94a81a65a15bcc0da7530b51d5943d4b94bf74ac 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,9 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel, program_callables_info): + def pre_codegen_check(self, kernel, callables_table): gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info) + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: @@ -445,9 +445,9 @@ class ISPCASTBuilder(CASTBuilder): else: for dep in get_dependencies(term): if filter_iname_tags_by_type( - kernel.iname_to_tags[dep], LocalIndexTag): + kernel.iname_to_tags.get(dep, []), LocalIndexTag): tag, = filter_iname_tags_by_type( - kernel.iname_to_tags[dep], LocalIndexTag, 1) + kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1) if tag.axis == 0: raise LoopyError( "streaming store must have stride 1 in " diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 44f782a726d7daa1b9371e50e2e9cbefc62eb7d9..36e59c3c8cffc6b8031cdacf5de82b2eeb1e96ec 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable): :class:`loopy.target.c.CMathCallable`. 
""" - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["max", "min"]: @@ -182,8 +182,7 @@ class OpenCLCallable(ScalarCallable): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) - + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) @@ -195,7 +194,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) else: # Unsupported type. raise LoopyError("%s function not supported for the types %s" % @@ -212,14 +211,14 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] @@ -234,7 +233,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -250,7 +249,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) if name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] @@ -266,7 +265,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + 
callables_table) updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(count)) @@ -276,13 +275,13 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="(%s%d) " % (base_tp_name, count), arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) # does not satisfy any of the conditions needed for specialization. # hence just returning a copy of the callable. return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_opencl_functions(target, identifier): @@ -442,10 +441,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return ( [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_scopers()) + OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( @@ -484,7 +483,7 @@ class OpenCLCASTBuilder(CASTBuilder): _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 03ba26930b677dc8f407b5ec73fa02722ebffec8..5ef56457223f8a89de4c37aed2f5a3e0ea5aa87b 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape) + new_temp_vars[temp_var.name] = temp_var.copy( + storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) @@ -134,7 +135,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, 
program_callables_info, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -152,7 +153,7 @@ def check_sizes(kernel, program_callables_info, device): parameters[arg.name] = arg.approximately glens, llens = ( - kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -207,7 +208,7 @@ class PyOpenCLCallable(ScalarCallable): Records information about the callables which are not covered by :class:`loopy.target.opencl.OpenCLCallable` """ - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name @@ -221,7 +222,7 @@ class PyOpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] @@ -238,7 +239,7 @@ class PyOpenCLCallable(ScalarCallable): self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: NumpyType( np.dtype(dtype.numpy_dtype.type(0).real))}), - program_callables_info) + callables_table) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", @@ -256,7 +257,7 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) else: # function calls for floating parameters. 
numpy_dtype = dtype.numpy_dtype @@ -267,14 +268,14 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) -def pyopencl_function_scoper(target, identifier): +def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj", "real", "imag", "abs"]: return PyOpenCLCallable(name=identifier) @@ -397,8 +398,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel, program_callables_info): - check_sizes(kernel, program_callables_info, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -792,11 +793,13 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_scopers(self): - from loopy.library.random123 import random123_function_scoper + def function_id_in_knl_callable_mapper(self): + from loopy.library.random123 import ( + random123_function_id_to_in_knl_callable_mapper) return ( - [pyopencl_function_scoper, random123_function_scoper] + super( - PyOpenCLCASTBuilder, self).function_scopers()) + [pyopencl_function_id_to_in_knl_callable_mapper, + random123_function_id_to_in_knl_callable_mapper] + super( + PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): return ([ diff --git a/loopy/target/python.py b/loopy/target/python.py index cd6e611673754034238fbc6e8775c43eb3c4c2f4..1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -45,7 +45,7 @@ class ExpressionToPythonMapper(StringifyMapper): if type_inf_mapper is None: 
type_inf_mapper = TypeInferenceMapper(self.kernel, - self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -85,7 +85,7 @@ class ExpressionToPythonMapper(StringifyMapper): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - identifier_name = self.codegen_state.program_callables_info[ + identifier_name = self.codegen_state.callables_table[ expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: @@ -93,7 +93,7 @@ class ExpressionToPythonMapper(StringifyMapper): "indexof, indexof_vec not yet supported in Python") from loopy.kernel.function_interface import ManglerCallable - in_knl_callable = self.codegen_state.program_callables_info[ + in_knl_callable = self.codegen_state.callables_table[ expr.function.name] if isinstance(in_knl_callable, ManglerCallable): from loopy.codegen import SeenFunction @@ -180,10 +180,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_scopers() + + super(PythonASTBuilderBase, + self).function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): diff --git a/loopy/tools.py b/loopy/tools.py index b243a79492dacc17f70e5afc7626c17a6ee03774..52fc7d3ce8c12d6c8491cd363d42836f5b2aca5a 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -43,6 +43,17 @@ else: return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
+ """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -79,6 +90,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 57c4397f998a74222bc482127076dee809de2bac..2519b6a14765400c9aaba249c1010d07d820355e 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -133,7 +133,7 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, +def buffer_array_for_single_kernel(kernel, callables_table, var_name, buffer_inames, init_expression=None, store_expression=None, within=None, default_tag="l.auto", temporary_scope=None, temporary_is_local=None, fetch_bounding_box=False): @@ -534,7 +534,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -548,10 +548,10 @@ def buffer_array(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = 
buffer_array_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -564,8 +564,8 @@ def buffer_array(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index dbda5d74f1b0465344288641ea70d6cf70c63ad6..749817bad2a07c57f3ab8086e6d45d92c559184d 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -31,7 +31,7 @@ from loopy.kernel import LoopKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) + Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, @@ -49,14 +49,22 @@ __doc__ = """ # {{{ register function lookup -def resolved_callables_from_function_lookup(program, - func_id_to_kernel_callable_mapper): - program_callables_info = program.program_callables_info - program_callables_info = program_callables_info.with_edit_callables_mode() +def _resolved_callables_from_function_lookup(program, + func_id_to_in_kernel_callable_mapper): + """ + Returns a copy of *program* with the expression nodes marked "Resolved" + if any match is found through the given + *func_id_to_in_kernel_callable_mapper*. 
+ + :arg func_id_to_in_kernel_callable_mapper: A function with signature + ``(target, identifier)`` that returns either an instance of + :class:`loopy.InKernelCallable` or *None*. + """ + callables_table = program.callables_table callable_knls = dict( (func_id, in_knl_callable) for func_id, in_knl_callable in - program_callables_info.items() if isinstance(in_knl_callable, + callables_table.items() if isinstance(in_knl_callable, CallableKernel)) edited_callable_knls = {} @@ -68,38 +76,34 @@ def resolved_callables_from_function_lookup(program, kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, - [func_id_to_kernel_callable_mapper]) + rule_mapping_context, kernel, callables_table, + [func_id_to_in_kernel_callable_mapper]) - # scoping fucntions and collecting the scoped functions new_subkernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table edited_callable_knls[func_id] = in_knl_callable.copy( subkernel=new_subkernel) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in edited_callable_knls: new_resolved_functions[func_id] = edited_callable_knls[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) def register_function_id_to_in_knl_callable_mapper(program, func_id_to_in_knl_callable_mapper): """ - Returns a copy of 
*kernel* with the *function_lookup* registered. + Returns a copy of *program* with the *function_lookup* registered. :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, identifier)`` returning a @@ -119,7 +123,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = resolved_callables_from_function_lookup(program, + program = _resolved_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -169,7 +173,7 @@ def register_callable_kernel(program, callee_kernel): expected_num_assignees = len([arg for arg in callee_kernel.args if arg.is_output_only]) expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel for insn in caller_kernel.instructions: @@ -207,25 +211,20 @@ def register_callable_kernel(program, callee_kernel): # take the function resolvers from the Program and resolve the functions in # the callee kernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( callee_kernel.substitutions, callee_kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program_callables_info, + rule_mapping_context, callee_kernel, program.callables_table, program.func_id_to_in_knl_callable_mappers) callee_kernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(callee_kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table.copy() - 
program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - program = program.copy(program_callables_info=program_callables_info) + program = program.copy(callables_table=callables_table) # making the target of the child kernel to be same as the target of parent # kernel. @@ -456,15 +455,25 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) for atomicity in insn.atomicity) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on, - tags=insn.tags | instruction.tags, - atomicity=new_atomicity - ) + if isinstance(insn, Assignment): + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + atomicity=new_atomicity + ) + else: + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + ) inner_insns.append(insn) inner_insns.append(noop_end) @@ -488,27 +497,22 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # {{{ inline callable kernel def _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info): + callables_table): old_insns = caller_kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): # FIXME This seems to use identifiers across namespaces. Why not # check whether the function is a scoped function first? 
~AK - if insn.expression.function.name in program_callables_info: - history_of_identifier = program_callables_info.history[ + if insn.expression.function.name in callables_table: + history_of_identifier = callables_table.history[ insn.expression.function.name] if function_name in history_of_identifier: - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] assert isinstance(in_knl_callable, CallableKernel) caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) - program_callables_info = ( - program_callables_info.with_deleted_callable( - insn.expression.function.name, - program_callables_info.num_times_callables_called[ - caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass @@ -517,7 +521,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name, "Unknown instruction type %s" % type(insn).__name__) - return caller_kernel, program_callables_info + return caller_kernel, callables_table # FIXME This should take a 'within' parameter to be able to only inline @@ -529,33 +533,33 @@ def inline_callable_kernel(program, function_name): """ from loopy.preprocess import infer_arg_descr program = infer_arg_descr(program) - program_callables_info = program.program_callables_info - old_program_callables_info = program_callables_info.copy() + callables_table = program.callables_table + old_callables_table = callables_table.copy() edited_callable_kernels = {} - for func_id, in_knl_callable in old_program_callables_info.items(): - if function_name not in old_program_callables_info.history[func_id] and ( + for func_id, in_knl_callable in old_callables_table.items(): + if function_name not in old_callables_table.history[func_id] and ( isinstance(in_knl_callable, CallableKernel)): caller_kernel = in_knl_callable.subkernel - caller_kernel, program_callables_info = ( + caller_kernel, callables_table = ( 
_inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info)) + callables_table)) edited_callable_kernels[func_id] = in_knl_callable.copy( subkernel=caller_kernel) new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in edited_callable_kernels: new_resolved_functions[func_id] = edited_callable_kernels[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -725,20 +729,20 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): callee_function_name).map_kernel caller_knl, = [in_knl_callable.subkernel for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel) and is_invoking_callee(in_knl_callable.subkernel)] - old_callee_knl = program.program_callables_info[ + old_callee_knl = program.callables_table[ callee_function_name].subkernel new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( caller_knl, old_callee_knl) - new_program_callables_info = program.program_callables_info.copy() - new_program_callables_info.resolved_functions[callee_function_name] = ( - new_program_callables_info[callee_function_name].copy( + new_callables_table = program.callables_table.copy() + new_callables_table.resolved_functions[callee_function_name] = ( + new_callables_table[callee_function_name].copy( subkernel=new_callee_kernel)) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/transform/data.py 
b/loopy/transform/data.py index 5f4f2f2a77b927e4a4352077ed94492249ef75a0..888bedc1de4ba1993d505a2c660390d41e286a99 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -143,7 +143,7 @@ class _not_provided: # noqa: N801 pass -def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. @@ -334,7 +334,7 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, # warning message. from loopy.transform.precompute import precompute_for_single_kernel - new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + new_kernel = precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, @@ -373,10 +373,10 @@ def add_prefetch(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -389,9 +389,9 @@ def add_prefetch(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/transform/fusion.py 
b/loopy/transform/fusion.py index d43ce025b3b9a232ebf14a35a47da2dad645a872..9b83f242bde7923a3932a00b42f442954cf9a7db 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -292,50 +292,6 @@ def _fuse_two_kernels(knla, knlb): def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. - - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. 
versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ - assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) @@ -419,29 +375,80 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): def fuse_kernels(programs, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. + Use :func:`loopy.rename_iname` if this is not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. + + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. + + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. 
versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + + # all the resolved functions in programs must be registered in + # main_callables_table main_prog_callables_info = ( - programs[0].program_callables_info.with_edit_callables_mode()) + programs[0].callables_table) old_root_kernel_callable = ( - programs[0].program_callables_info[programs[0].name]) + programs[0].callables_table[programs[0].name]) kernels = [programs[0].root_kernel] # removing the callable collisions that maybe present for prog in programs[1:]: root_kernel = prog.root_kernel renames_needed = {} - for old_func_id, in_knl_callable in prog.program_callables_info.items(): + for old_func_id, in_knl_callable in prog.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - if in_knl_callable.name != prog.name: + # Fusing programs with multiple callable kernels is tough. + # Reason: Need to first figure out the order in which the + # callable kernels must be resolved into + # main_callables_table, because of renaming is + # needed to be done in the callable kernels before registering. + # Hence disabling it until required. + if in_knl_callable.subkernel.name != prog.name: raise LoopyError("fuse_kernels cannot fuse programs with " "multiple callable kernels.") + + # root kernel are dealt at the end after performing all the + # renaming. 
continue - num_times_called = ( - prog.program_callables_info.num_times_callables_called[ - old_func_id]) - for i in range(num_times_called): - main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_callables(var(old_func_id), - in_knl_callable, True)) + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_added_callable(var(old_func_id), + in_knl_callable)) if old_func_id != new_func_id: renames_needed[old_func_id] = new_func_id @@ -456,13 +463,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): new_root_kernel_callable = old_root_kernel_callable.copy( subkernel=new_root_kernel.copy(name=programs[0].name)) - main_prog_callables_info, _ = main_prog_callables_info.with_callable( + # TODO: change the name of the final root kernel. + main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( var(programs[0].name), new_root_kernel_callable) - main_prog_callables_info = ( - main_prog_callables_info.with_exit_edit_callables_mode()) - return programs[0].copy( - program_callables_info=main_prog_callables_info) + callables_table=main_prog_callables_info) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 93f6c53e84513c8e703c9cc51c66f5eb9f491084..fb6682f4866317c0852c3814268ada904b2391b8 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -146,8 +146,7 @@ class _InameSplitter(RuleAwareIdentityMapper): and self.split_iname not in expn_state.arg_context and self.within( expn_state.kernel, - expn_state.instruction, - expn_state.stack)): + expn_state.instruction)): new_inames = list(expr.inames) new_inames.remove(self.split_iname) new_inames.extend([self.outer_iname, self.inner_iname]) @@ -164,8 +163,7 @@ class _InameSplitter(RuleAwareIdentityMapper): and self.split_iname not in expn_state.arg_context and self.within( expn_state.kernel, - expn_state.instruction, - expn_state.stack)): + expn_state.instruction)): return self.replacement_index else: 
return super(_InameSplitter, self).map_variable(expr, expn_state) @@ -184,6 +182,22 @@ def _split_iname_backend(kernel, split_iname, for syntax. """ + from loopy.match import parse_match + within = parse_match(within) + + # {{{ return the same kernel if no kernel matches + + def _do_not_transform_if_no_within_matches(): + for insn in kernel.instructions: + if within(kernel, insn): + return + + return kernel + + _do_not_transform_if_no_within_matches() + + # }}} + existing_tags = kernel.iname_tags(split_iname) from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type if (do_tagged_check and existing_tags @@ -237,10 +251,15 @@ def _split_iname_backend(kernel, split_iname, name_dim_type, name_idx = space.get_var_dict()[split_iname] s = s.intersect(fixed_constraint_set) - if within is None: - s = s.project_out(name_dim_type, name_idx, 1) + def _project_out_only_if_all_instructions_in_within(): + for insn in kernel.instructions: + if split_iname in insn.within_inames and ( + not within(kernel, insn)): + return s - return s + return s.project_out(name_dim_type, name_idx, 1) + + return _project_out_only_if_all_instructions_in_within() new_domains = [process_set(dom) for dom in kernel.domains] @@ -256,7 +275,8 @@ def _split_iname_backend(kernel, split_iname, new_insns = [] for insn in kernel.instructions: - if split_iname in insn.within_inames: + if split_iname in insn.within_inames and ( + within(kernel, insn)): new_within_inames = ( (insn.within_inames.copy() - frozenset([split_iname])) @@ -291,9 +311,6 @@ def _split_iname_backend(kernel, split_iname, applied_iname_rewrites=applied_iname_rewrites, loop_priority=frozenset(new_priorities)) - from loopy.match import parse_stack_match - within = parse_stack_match(within) - rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) ins = _InameSplitter(rule_mapping_context, within, @@ -338,7 +355,7 @@ def split_iname(kernel, split_iname, inner_length, :arg 
inner_tag: The iname tag (see :ref:`iname-tags`) to apply to *inner_iname*. :arg within: a stack match as understood by - :func:`loopy.match.parse_stack_match`. + :func:`loopy.match.parse_match`. """ assert isinstance(kernel, LoopKernel) @@ -1078,7 +1095,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): for option in get_iname_duplication_options_for_single_kernel( in_knl_callable.subkernel, use_boostable_into): @@ -1104,7 +1121,7 @@ def has_schedulable_iname_nesting_for_single_kernel(knl): def has_schedulable_iname_nesting(program): return all(has_schedulable_iname_nesting_for_single_kernel( in_knl_callable.subkernel) for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)) # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 93cf932b1bf555b64d502b2c7a8c2fcc658d6a25..f73110ecdff79d7c029c0dd0d895ef71ea68326b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -42,7 +42,7 @@ def find_instructions_in_single_kernel(kernel, insn_match): def find_instructions(program, insn_match): assert isinstance(program, Program) insns = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): insns += (find_instructions_in_single_kernel( in_knl_callable.subkernel, insn_match)) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 734072574cf970cd6093fef56105578b95e76c5f..e5ed850c655df726e79bd487c4ea6c603d94520b 100644 --- 
a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -37,7 +37,7 @@ __doc__ = """ def pack_and_unpack_args_for_call_for_single_kernel(kernel, - program_callables_info, call_name, args_to_pack=None, + callables_table, call_name, args_to_pack=None, args_to_unpack=None): """ Returns a a copy of *kernel* with instructions appended to copy the @@ -63,10 +63,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. continue - if insn.expression.function.name not in program_callables_info: + if insn.expression.function.name not in callables_table: continue - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] if in_knl_callable.name != call_name: @@ -324,10 +324,10 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -340,8 +340,8 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 
66c7114aee2a987c3a7810252ba3cc4f4d293c83..71b11fa24c49437032537d22aa7275b7e9e3d7f4 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -261,7 +261,7 @@ class _not_provided(object): # noqa: N801 pass -def precompute_for_single_kernel(kernel, program_callables_info, subst_use, +def precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -1047,7 +1047,7 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use, if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) return kernel @@ -1056,10 +1056,10 @@ def precompute(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = precompute_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1072,8 +1072,8 @@ def precompute(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 
4b957b033fd4659732970b12ee3986b4028811cf..e463353ef9ea0860188b49fcb8f2f06bb96b0f41 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -235,9 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel, program_callables_info): + def __init__(self, kernel, callables_table): self.kernel = kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -441,7 +441,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, - self.program_callables_info)) + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -630,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel, self.program_callables_info) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -754,12 +754,12 @@ def save_and_reload_temporaries(program): program = lp.preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel knl = get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info) + program.callables_table) assert knl.schedule is not None liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl, program.program_callables_info) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 6a93e0bd99bc779f66fa3fb0aea67b55ea246740..56ae123b78fde7bbf51c27317d74a50ecad0dd5b 100644 
--- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -511,7 +511,7 @@ def find_rules_matching(knl, pattern): def find_one_rule_matching(program, pattern): rules = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel rules.extend(find_rules_matching(knl, pattern)) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b434f03b146a7eb76ced59ddd5fa670fea4e7f09..5047dcc2743b8d47a16beaf67c69e54ac9afb554 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,8 +35,11 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) from loopy.kernel.instruction import _DataObliviousInstruction -from loopy.program import ProgramCallablesInfo -from loopy.symbolic import SubArrayRef, LinearSubscript +from loopy.program import CallablesTable +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext, SubArrayRef) from pymbolic.primitives import Variable, Subscript, Lookup import logging @@ -52,7 +55,7 @@ def _debug(kernel, s, *args): def get_return_types_as_tuple(arg_id_to_dtype): """Returns the types of arguments in a tuple format. - :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" return_arg_id_to_dtype = dict((id, dtype) for id, dtype in @@ -62,10 +65,139 @@ def get_return_types_as_tuple(arg_id_to_dtype): return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. 
+ return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. 
code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, program_callables_info, new_assignments=None): + def __init__(self, kernel, callables_table, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -74,12 +206,12 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() - self.program_callables_info = program_callables_info + self.callables_table = callables_table 
self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): @@ -113,16 +245,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). - def copy(self, program_callables_info=None): - if program_callables_info is None: - program_callables_info = self.program_callables_info - return type(self)(self.kernel, program_callables_info, + def copy(self, callables_table=None): + if callables_table is None: + callables_table = self.callables_table + return type(self)(self.kernel, callables_table, self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, self.program_callables_info, new_ass) + return type(self)(self.kernel, self.callables_table, new_ass) @staticmethod def combine(dtype_sets): @@ -276,7 +408,6 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): from pymbolic.primitives import Variable, CallWithKwargs, Call - from loopy.symbolic import ResolvedFunction if isinstance(expr, CallWithKwargs): kw_parameters = expr.kw_parameters @@ -300,7 +431,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.program_callables_info[expr.function.name] + in_knl_callable = self.callables_table[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable @@ -326,8 +457,11 @@ class TypeInferenceMapper(CombineMapper): np.int64): continue + if np.can_cast(arg_id_to_dtype[id].dtype.type, + in_knl_callable.arg_id_to_dtype[id].dtype.type): + continue + # }}} - continue raise LoopyError("Overwriting a specialized function " "is illegal--maybe start with new instance of " @@ -335,17 +469,17 @@ class TypeInferenceMapper(CombineMapper): # }}} - 
in_knl_callable, self.program_callables_info = ( + in_knl_callable, self.callables_table = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel, - self.program_callables_info)) + self.callables_table)) in_knl_callable = in_knl_callable.with_target(self.kernel.target) # storing the type specialized function so that it can be used for # later use - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( + self.callables_table, new_function_id = ( + self.callables_table.with_callable( expr.function.function, in_knl_callable)) @@ -408,9 +542,9 @@ class TypeInferenceMapper(CombineMapper): in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( - expr.function, in_knl_callable, True)) + self.callables_table, new_function_id = ( + self.callables_table.with_added_callable( + expr.function, in_knl_callable)) if isinstance(expr, Call): self.old_calls_to_new_calls[expr] = new_function_id @@ -566,7 +700,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return [kernel.index_dtype], [], {}, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) from functools import partial debug = partial(_debug, kernel) @@ -613,13 +747,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not dtype_sets: return ( None, type_inf_mapper.symbols_with_unknown_types, None, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) result = type_inf_mapper.combine(dtype_sets) return (result, type_inf_mapper.symbols_with_unknown_types, type_inf_mapper.old_calls_to_new_calls, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} @@ -646,7 +780,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def 
infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, +def infer_unknown_types_for_a_single_kernel(kernel, callables_table, expect_completion=False): """Infer types on temporaries and arguments.""" @@ -709,7 +843,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + type_inf_mapper = TypeInferenceMapper(kernel, callables_table, item_lookup) from loopy.symbolic import SubstitutionRuleExpander @@ -745,11 +879,11 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, debug("inferring type for %s %s", type(item).__name__, item.name) (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, program_callables_info) = ( + new_old_calls_to_new_calls, callables_table) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) type_inf_mapper = type_inf_mapper.copy( - program_callables_info=program_callables_info) + callables_table=callables_table) failed = not result if not failed: @@ -768,9 +902,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") - # TODO: I dont like in-place updates. Change this to something - # else. Perhaps add a function for doing this, which does it - # using a bunch of copies? 
old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -859,8 +990,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if isinstance(insn, lp.MultiAssignmentBase): # just a dummy run over the expression, to pass over all the # functions - # FIXME: need a check over here which checks the instruction for - # unseen cases if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, return_tuple=isinstance(insn, lp.CallInstruction), return_dtype_set=True) @@ -871,7 +1000,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, raise NotImplementedError("Unknown instructions type %s." % ( type(insn).__name__)) - program_callables_info = type_inf_mapper.program_callables_info + callables_table = type_inf_mapper.callables_table old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) end_time = time.time() @@ -883,9 +1012,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, args=[new_arg_dict[arg.name] for arg in kernel.args], ) - # this has to be subsitutition - from loopy.kernel.function_interface import ( - change_names_of_pymbolic_calls) type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) @@ -895,47 +1021,42 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if expect_completion: # if completion is expected, then it is important that all the # callables are scoped. 
- from loopy.check import check_functions_are_scoped - check_functions_are_scoped(type_specialized_kernel) + from loopy.check import check_functions_are_resolved + check_functions_are_resolved(type_specialized_kernel) - return type_specialized_kernel, program_callables_info + return type_specialized_kernel, callables_table def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - from loopy.kernel import LoopKernel - if isinstance(program, LoopKernel): - # FIXME: deprecate warning needed here - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(program) - program_callables_info = program.program_callables_info + callables_table = program.callables_table type_uninferred_knl_callable = ( - program_callables_info[program.name]) + callables_table[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = ( + old_callables_count = callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( type_uninferred_root_kernel, - program_callables_info, expect_completion)) + callables_table, expect_completion)) type_inferred_knl_callable = type_uninferred_knl_callable.copy( subkernel=root_kernel) - program_callables_info, _ = ( - program_callables_info.with_callable( + callables_table, _ = ( + callables_table.with_callable( program.name, type_inferred_knl_callable)) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + callables_table = ( + callables_table.with_exit_edit_callables_mode( + old_callables_count)) - # FIXME: maybe put all of this in a function? 
- # need to infer functions that were left out during inference - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -943,8 +1064,8 @@ def infer_unknown_types(program, expect_completion=False): # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, program_callables_info, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -976,7 +1097,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( for dt in reduction_dtypes) return tuple(arg_dtypes), reduction_dtypes, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} diff --git a/requirements.txt b/requirements.txt index a3e88cfea99e7413211c35d11464932f98e23758..97c2024764715d0a715520800e2e1dd467183479 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 - -# This is needed for the pyinstaller executable to be usable. 
-packaging +ply>=3.6 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b939ce0cf8b680bb1eb3501ed6d7f563e9c1c7b6..eec3dfd1f52ed97c58f5281716eac8fc18980094 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, diff --git a/test/test_diff.py b/test/test_diff.py index 7e14a7ab5274b8e40fbc5d792a7303daf171dc17..d001233c0eced5ecaf9342b90da0487faefb21f3 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -66,7 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") - dknl = lp.make_program_from_kernel(dknl) + dknl = lp.make_program(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_loopy.py b/test/test_loopy.py index 9dc74b94f72347e3b4287e244f06292ce60527b4..2c10b11820ef88a6984a505f42eb0e2b1a95f707 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -408,11 +408,14 @@ def test_ilp_write_race_detection_global(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) + knl = lp.preprocess_kernel(knl) + with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - lp.generate_code_v2(knl) + list(lp.generate_loop_schedules(knl.root_kernel, + knl.callables_table)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -1267,7 +1270,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False): from loopy.transform.save import save_and_reload_temporaries prog = save_and_reload_temporaries(prog) prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info)) + prog.callables_table)) if debug: 
print(prog) @@ -2220,7 +2223,7 @@ def test_unscheduled_insn_detection(): "...") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) prog = prog.with_root_kernel(knl) insn1, = lp.find_instructions(prog, "id:insn1") insns = prog.root_kernel.instructions[:] @@ -2390,7 +2393,7 @@ def test_barrier_insertion_near_top_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2418,7 +2421,7 @@ def test_barrier_insertion_near_bottom_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2477,7 +2480,7 @@ def test_multi_argument_reduction_type_inference(): allow_simultaneous=True) t_inf_mapper = TypeInferenceMapper(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert ( t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) @@ -2834,7 +2837,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): prog = lp.preprocess_kernel(prog) knl = lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert barrier_between(knl, "first", "second") == expect_barrier @@ -2888,6 +2891,25 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl).device_code()) +def test_backwards_dep_printing_and_error(): + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py 
b/test/test_numa_diff.py index 4f802f8bff3ba00763825bc09dbc6051ff1ac527..1ba44e77e13a88ecbc05f4eecc6b9c7e397eb656 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -47,8 +47,8 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) +@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() @@ -231,7 +231,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa if 1: print("OPS") - op_map = lp.get_op_map(hsv) + op_map = lp.get_op_map(hsv, subgroup_size=32) print(lp.stringify_stats_mapping(op_map)) print("MEM") diff --git a/test/test_reduction.py b/test/test_reduction.py index 96dab405a98bc710eb7a50ec3f276e248cf44cf9..aaf11ee296341b8d8f4653e41bf8640c4e34583c 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size): def test_global_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - prog = lp.make_kernel( + knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. 
- z[0] = sum(i, i/13) + z[0] = sum(i, a[i]) """) - ref_prog = prog + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + ref_knl = knl gsize = 128 - prog = lp.split_iname(prog, "i", gsize * 20) - prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0") - prog = lp.split_reduction_inward(prog, "i_inner_inner") - prog = lp.split_reduction_inward(prog, "i_inner_outer") + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "i_outer") + knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule - prog = reduction_arg_to_subst_rule(prog, "i_outer") - prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", + knl = reduction_arg_to_subst_rule(knl, "i_outer") + + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - prog = lp.realize_reduction(prog) - prog = lp.add_dependency( - prog, "writes:acc_i_outer", + knl = lp.realize_reduction(knl) + knl = lp.tag_inames(knl, "i_outer_0:g.0") + + # Keep the i_outer accumulator on the correct (lower) side of the barrier, + # otherwise there will be useless save/reload code generated. 
+ knl = lp.add_dependency( + knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") lp.auto_test_vs_ref( - ref_prog, ctx, prog, parameters={"n": size}, + ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) diff --git a/test/test_statistics.py b/test/test_statistics.py index 79c5ec7da0971b534588be3bfcd58a9f5fc8405a..41b44b5a7e9bbfe8f371e6a605ccfa8068a563b6 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -39,6 +39,9 @@ from pymbolic.primitives import Variable from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa +SGS = 32 # Subgroup size + + def test_op_counter_basic(): knl = lp.make_kernel( @@ -54,21 +57,27 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - assert f32add == f32mul == f32div == n*m*ell - 
assert f64mul == n*m - assert i32add == n*m*2 + # (count-per-sub-group)*n_subgroups + assert f32add == f32mul == f32div == n*m*ell*n_subgroups + assert f64mul == n*m*n_subgroups + assert i32add == n*m*2*n_subgroups def test_op_counter_reduction(): @@ -81,15 +90,20 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - assert f32add == f32mul == n*m*ell + # (count-per-sub-group)*n_subgroups + assert f32add == f32mul == n*m*ell*n_subgroups op_map_dtype = op_map.group_by('dtype') f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params) @@ -111,21 +125,26 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 
'div', CG.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - assert f32mul == n*m - assert f64div == 2*n*m # TODO why? - assert f64add == n*m - assert i32add == n*m + # (count-per-sub-group)*n_subgroups + assert f32mul == n*m*n_subgroups + assert f64div == 2*n*m*n_subgroups # TODO why? + assert f64add == n*m*n_subgroups + assert i32add == n*m*n_subgroups def test_op_counter_specialops(): @@ -143,27 +162,33 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) + f64add = 
op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP) ].eval_with_dict(params) - assert f32div == 2*n*m*ell - assert f32mul == f32add == n*m*ell - assert f64add == 3*n*m - assert f64pow == i32add == f64rsq == f64sin == n*m + # (count-per-sub-group)*n_subgroups + assert f32div == 2*n*m*ell*n_subgroups + assert f32mul == f32add == n*m*ell*n_subgroups + assert f64add == 3*n*m*n_subgroups + assert f64pow == i32add == f64rsq == f64sin == n*m*n_subgroups def test_op_counter_bitwise(): @@ -183,26 +208,32 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=False) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group n = 512 m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', CG.WORKITEM)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM) + i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params) + i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params) + i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP) ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM) + i64mul = 
op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM) + i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP) ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM) + i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) - assert i32add == n*m+n*m*ell - assert i32bw == 2*n*m*ell - assert i64bw == 2*n*m - assert i64add == i64mul == n*m - assert i64shift == 2*n*m + # (count-per-sub-group)*n_subgroups + assert i32add == n*m*ell*n_subgroups + assert i32bw == 2*n*m*ell*n_subgroups + assert i64bw == 2*n*m*n_subgroups + assert i64add == i64mul == n*m*n_subgroups + assert i64shift == 2*n*m*n_subgroups def test_op_counter_triangular_domain(): @@ -228,15 +259,21 @@ def test_op_counter_triangular_domain(): op_map = lp.get_op_map( knl, + subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.WORKITEM)] + )[lp.Op(np.float64, 'mul', CG.SUBGROUP)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) + n_workgroups = 1 + group_size = 1 + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group + if expect_fallback: - assert flops == 144 + assert flops == 144*n_subgroups else: - assert flops == 78 + assert flops == 78*n_subgroups def test_mem_access_counter_basic(): @@ -254,10 +291,8 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 @@ -266,7 +301,8 @@ def test_mem_access_counter_basic(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32l = 
mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -289,9 +325,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group - assert f64l == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32l == (3*n*m*ell)*n_subgroups + assert f64l == (2*n*m)*n_subgroups f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, @@ -304,9 +340,9 @@ def test_mem_access_counter_basic(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group - assert f64s == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32s == (n*m*ell)*n_subgroups + assert f64s == (n*m)*n_subgroups def test_mem_access_counter_reduction(): @@ -320,10 +356,8 @@ def test_mem_access_counter_reduction(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -331,7 +365,8 @@ def test_mem_access_counter_reduction(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -344,8 +379,8 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32l == 
(2*n*m*ell)*n_subgroups f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, @@ -353,8 +388,8 @@ def test_mem_access_counter_reduction(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32s == (n*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32s == (n*ell)*n_subgroups ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'] ).to_bytes().eval_and_sum(params) @@ -379,10 +414,8 @@ def test_mem_access_counter_logic(): knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -390,7 +423,8 @@ def test_mem_access_counter_logic(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -404,10 +438,10 @@ def test_mem_access_counter_logic(): direction='store') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group - assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group - assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32_g_l == (2*n*m)*n_subgroups + assert f64_g_l == (n*m)*n_subgroups + assert f64_g_s == (n*m)*n_subgroups def test_mem_access_counter_specialops(): @@ -425,10 +459,8 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) 
n = 512 m = 256 ell = 128 @@ -436,7 +468,8 @@ def test_mem_access_counter_specialops(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -459,9 +492,9 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group - assert f64 == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32 == (2*n*m*ell)*n_subgroups + assert f64 == (2*n*m)*n_subgroups f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, @@ -474,16 +507,16 @@ def test_mem_access_counter_specialops(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group - assert f64 == (n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32 == (n*m*ell)*n_subgroups + assert f64 == (n*m)*n_subgroups filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'], count_granularity=CG.SUBGROUP) tot = filtered_map.eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert tot == (n*m*ell + n*m)*n_subgroups def test_mem_access_counter_bitwise(): @@ -503,10 +536,8 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - subgroup_size = 32 - mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) n = 512 m = 256 ell = 128 @@ -514,7 +545,8 @@ def 
test_mem_access_counter_bitwise(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, @@ -537,8 +569,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert i32 == (4*n*m+2*n*m*ell)*n_subgroups i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, @@ -551,8 +583,8 @@ def test_mem_access_counter_bitwise(): count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert i32 == (n*m+n*m*ell)*n_subgroups def test_mem_access_counter_mixed(): @@ -571,7 +603,6 @@ def test_mem_access_counter_mixed(): x=np.float32)) group_size_0 = 65 - subgroup_size = 32 knl = lp.split_iname(knl, "j", group_size_0) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) @@ -583,10 +614,11 @@ def test_mem_access_counter_mixed(): n_workgroups = div_ceil(ell, group_size_0) group_size = group_size_0 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', @@ -617,9 +649,9 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: 
(count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group - assert f32uniform == (m*n)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64uniform == (2*n*m)*n_subgroups + assert f32uniform == (m*n)*n_subgroups expect_fallback = False import islpy as isl @@ -651,8 +683,8 @@ def test_mem_access_counter_mixed(): count_granularity=CG.WORKITEM) ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64uniform == m*n*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64uniform == m*n*n_subgroups if expect_fallback: if ell < group_size_0: @@ -681,7 +713,7 @@ def test_mem_access_counter_nonconsec(): knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32) # noqa + subgroup_size=SGS) # noqa n = 512 m = 256 ell = 128 @@ -939,30 +971,35 @@ def test_all_counters_parallel_matmul(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} + group_size = bsize*bsize + n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.WORKITEM) + lp.Op(np.float32, 'mul', CG.SUBGROUP) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 'add', CG.WORKITEM) + lp.Op(np.float32, 'add', CG.SUBGROUP) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.WORKITEM) + lp.Op(np.int32, 'add', CG.SUBGROUP) ].eval_with_dict(params) i32ops 
+= op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM) + lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) ].eval_with_dict(params) - assert f32mul+f32add == n*m*ell*2 + # (count-per-sub-group)*n_subgroups + assert f32mul+f32add == m*2*n_subgroups mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32) + subgroup_size=SGS) f32s1lb = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('ell')}, @@ -991,33 +1028,36 @@ def test_all_counters_parallel_matmul(): local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=32).filter_by(mtype=['local']) + subgroup_size=SGS).filter_by(mtype=['local']) local_mem_l = local_mem_map.filter_by(direction=['load'] ).eval_and_sum(params) - assert local_mem_l == n*m*ell*2 + # (count-per-sub-group)*n_subgroups + assert local_mem_l == m*2*n_subgroups local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={1: 16}, gid_strides={}, variable='a_fetch', - count_granularity=CG.WORKITEM) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={0: 1}, gid_strides={}, variable='b_fetch', - count_granularity=CG.WORKITEM) + count_granularity=CG.SUBGROUP) ].eval_with_dict(params) - assert local_mem_l_a == local_mem_l_b == n*m*ell + # (count-per-sub-group)*n_subgroups + assert local_mem_l_a == local_mem_l_b == m*n_subgroups local_mem_s = local_mem_map.filter_by(direction=['store'] ).eval_and_sum(params) - assert local_mem_s == n*m*ell*2/bsize + # (count-per-sub-group)*n_subgroups + assert local_mem_s == m*2/bsize*n_subgroups def test_gather_access_footprint(): @@ -1067,8 +1107,6 @@ def test_summations_and_filters(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - subgroup_size = 32 - n = 512 m = 256 ell = 128 @@ -1076,24 +1114,25 @@ def 
test_summations_and_filters(): n_workgroups = 1 group_size = 1 - subgroups_per_group = div_ceil(group_size, subgroup_size) + subgroups_per_group = div_ceil(group_size, SGS) + n_subgroups = n_workgroups*subgroups_per_group mem_map = lp.get_mem_access_map(knl, count_redundant_work=True, - subgroup_size=subgroup_size) + subgroup_size=SGS) loads_a = mem_map.filter_by(direction=['load'], variable=['a'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert loads_a == (2*n*m*ell)*n_subgroups global_stores = mem_map.filter_by(mtype=['global'], direction=['store'], count_granularity=[CG.SUBGROUP] ).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert global_stores == (n*m*ell + n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert global_stores == (n*m*ell + n*m)*n_subgroups ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'], count_granularity=[CG.SUBGROUP] @@ -1102,9 +1141,9 @@ def test_summations_and_filters(): count_granularity=[CG.SUBGROUP] ).to_bytes().eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group - assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups + assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups # ignore stride and variable names in this map reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -1113,11 +1152,12 @@ def test_summations_and_filters(): f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load') ].eval_with_dict(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert 
f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group - assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f32lall == (3*n*m*ell)*n_subgroups + assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) @@ -1149,8 +1189,8 @@ def test_summations_and_filters(): key.direction == 'load' f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params) - # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group - assert f64l == (2*n*m)*n_workgroups*subgroups_per_group + # uniform: (count-per-sub-group)*n_subgroups + assert f64l == (2*n*m)*n_subgroups def test_strided_footprint(): diff --git a/test/test_target.py b/test/test_target.py index 7b9d4f40ae83dadc8e8bd20946cadfbbad6c2f70..095bf0939d7439be440f7300f70f1ad4491f0fb9 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -323,6 +323,50 @@ def test_target_invalid_type_cast(): lp.TypeCast(dtype, 1) +def test_ispc_streaming_stores(): + stream_dtype = np.float32 + index_dtype = np.int32 + + knl = lp.make_kernel( + "{[i]: 0<=i0") + knl = lp.split_iname( + knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1)) + knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0") + knl = lp.tag_instructions(knl, "!streaming_store") + + knl = lp.add_and_infer_dtypes(knl, { + var: stream_dtype + for var in vars + }) + + knl = lp.set_argument_order(knl, vars + ["n"]) + + lp.generate_code_v2(knl).all_code() + + +def test_cuda_short_vector(): + knl = lp.make_kernel( + "{ [i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index d54a820a8a7dbced4ed8b3aab9af3f53a92499bc..04162331ddc92a4ef13ef0086fee2866a5b9f0af 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -543,6 +543,23 @@ 
def test_uniquify_instruction_ids(): assert all(isinstance(id, str) for id in insn_ids) +def test_split_iname_only_if_in_within(): + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + c[i] = 3*d[i] {id=to_split} + a[i] = 2*b[i] {id=not_to_split} + """) + + prog = lp.split_iname(prog, "i", 4, within='id:to_split') + + for insn in prog.root_kernel.instructions: + if insn.id == 'to_split': + assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) + if insn.id == 'not_to_split': + assert insn.within_inames == frozenset({'i'}) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/testlib.py b/test/testlib.py index eebc792d00297adf4785f850582b415ca1d20439..853e2584a1e10732b3ec49cd737016734cdea5fa 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -9,9 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, callables_table, ignore_auto=True): gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, - program_callables_info, ignore_auto) + callables_table, ignore_auto) return gsize, (self.vecsize,) # }}} @@ -139,14 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator( class Log2Callable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0].numpy_dtype @@ -168,7 +168,7 @@ class Log2Callable(lp.ScalarCallable): self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) def register_log2_lookup(target, identifier):