diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 70017ee228c054b2818428e997257de638c5a2fb..a865134add02fd7527bf336d0c3596e2777aa232 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: uses: actions/setup-python@v1 with: # matches compat target in setup.py - python-version: '3.6' + python-version: '3.8' - name: "Main Script" run: | curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh @@ -35,6 +35,19 @@ jobs: curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py + mypy: + name: Mypy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: "Main Script" + run: | + curl -L -O https://tiker.net/ci-support-v0 + . ./ci-support-v0 + build_py_project_in_conda_env + python -m pip install mypy + ./run-mypy.sh + pytest: name: Conda Pytest runs-on: ubuntu-latest diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f8e08058c3e5d61f56655d88a5cebbab8a5f202c..5a8273849e03cab0dfd632f8f18f47bfa659314b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -168,6 +168,18 @@ Flake8: except: - tags +Mypy: + script: | + curl -L -O https://tiker.net/ci-support-v0 + . ./ci-support-v0 + build_py_project_in_venv + python -m pip install mypy + ./run-mypy.sh + tags: + - python3 + except: + - tags + Benchmarks: stage: test script: diff --git a/doc/conf.py b/doc/conf.py index 7cd9884fd5041a55643fba7e31afea54a8ab1b1f..ce500cbecd1d15ab97c6bfa95934b4034d0bf17c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -35,6 +35,25 @@ intersphinx_mapping = { "https://pyrsistent.readthedocs.io/en/latest/": None, } +# Some modules need to import things just so that sphinx can resolve symbols in +# type annotations. Often, we do not want these imports (e.g. of PyOpenCL) when +# in normal use (because they would introduce unintended side effects or hard +# dependencies). This flag exists so that these imports only occur during doc +# build. Since sphinx appears to resolve type hints lexically (as it should), +# this needs to be cross-module (since, e.g. an inherited arraycontext +# docstring can be read by sphinx when building meshmode, a dependent package), +# this needs a setting of the same name across all packages involved, that's +# why this name is as global-sounding as it is. +import sys +sys._BUILDING_SPHINX_DOCS = True + nitpick_ignore_regex = [ ["py:class", r"typing_extensions\.(.+)"], + ["py:class", r"numpy\.u?int[0-9]+"], + ["py:class", r"numpy\.float[0-9]+"], + ["py:class", r"numpy\.complex[0-9]+"], + + # As of 2022-06-22, it doesn't look like there's sphinx documentation + # available. + ["py:class", r"immutables\.(.+)"], ] diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 922315685579089e787389ec5c2f74c56b690092..c53c56530d43a2ec03b2126cc70cbaf3221caf33 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -515,24 +515,14 @@ Arguments ^^^^^^^^^ .. autoclass:: KernelArgument - :members: - :undoc-members: .. autoclass:: ValueArg - :members: - :undoc-members: .. autoclass:: ArrayArg - :members: - :undoc-members: .. autoclass:: ConstantArg - :members: - :undoc-members: .. autoclass:: ImageArg - :members: - :undoc-members: .. _temporaries: diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index b3cfbc5c485f9670f59de4edda80ffabf6e23076..9ef012d663e7a8eb32c2cb1fa236018200338600 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -52,6 +52,8 @@ Influencing data access .. 
automodule:: loopy.transform.privatize +.. autofunction:: allocate_temporaries_for_base_storage + Padding Data ------------ diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7fe11871b1fc91f9d64232f0e145d7ee8b6a6819..13a597e5b2c82b2ec6ea5c872b6edc6ee70e29d6 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -235,7 +235,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: if allocator is None: allocator = _lpy_cl_tools.DeferredAllocator(queue.context) <BLANKLINE> - # {{{ find integer arguments from shapes + # {{{ find integer arguments from array data <BLANKLINE> if n is None: if a is not None: @@ -1228,11 +1228,11 @@ should call :func:`loopy.get_one_linearized_kernel`: ... --------------------------------------------------------------------------- LINEARIZATION: - 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) + 0: CALL KERNEL rotate_v2 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 3: ... gbarrier - 4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[]) + 4: CALL KERNEL rotate_v2_0 5: arr[(1 + i_inner + i_outer*16) % n] = tmp {id=rotate} 6: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1260,18 +1260,18 @@ put those instructions into the schedule. ... --------------------------------------------------------------------------- TEMPORARIES: - tmp: type: np:dtype('int32'), shape: () aspace:private - tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace:global + tmp: type: np:dtype('int32'), shape: () aspace: private + tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) aspace: global --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- LINEARIZATION: - 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) + 0: CALL KERNEL rotate_v2 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} 3: RETURN FROM KERNEL rotate_v2 4: ... gbarrier - 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) + 5: CALL KERNEL rotate_v2_0 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload} 7: arr[(1 + i_inner + i_outer*16) % n] = tmp {id=rotate} 8: RETURN FROM KERNEL rotate_v2_0 @@ -1297,7 +1297,7 @@ The kernel translates into two OpenCL kernels. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int const *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) { int tmp; <BLANKLINE> @@ -1305,7 +1305,7 @@ The kernel translates into two OpenCL kernels. 
tmp_save_slot[16 * gid(0) + lid(0)] = tmp; } <BLANKLINE> - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int const *__restrict__ tmp_save_slot) { int tmp; <BLANKLINE> diff --git a/loopy/__init__.py b/loopy/__init__.py index 7bebb98bd6804520d7187cd3d4ad302848633fe7..ce3ba1439659b3dd118dc011e94ecb09c2f4f343 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -95,7 +95,8 @@ from loopy.transform.data import ( alias_temporaries, set_argument_order, rename_argument, set_temporary_scope, - set_temporary_address_space) + set_temporary_address_space, + allocate_temporaries_for_base_storage) from loopy.transform.subst import (extract_subst, assignment_to_subst, expand_subst, find_rules_matching, @@ -157,7 +158,6 @@ from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.ispc import ISPCTarget -from loopy.target.numba import NumbaTarget, NumbaCudaTarget from loopy.tools import Optional, t_unit_to_python, memoize_on_disk @@ -216,6 +216,7 @@ __all__ = [ "remove_unused_arguments", "alias_temporaries", "set_argument_order", "rename_argument", "set_temporary_scope", "set_temporary_address_space", + "allocate_temporaries_for_base_storage", "find_instructions", "map_instructions", "set_instruction_priority", "add_dependency", @@ -302,7 +303,6 @@ __all__ = [ "CWithGNULibcTarget", "ExecutableCWithGNULibcTarget", "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", - "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", "Optional", "memoize_on_disk", @@ -366,7 +366,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration @for_each_kernel -def register_preamble_generators(kernel, preamble_generators): +def register_preamble_generators(kernel: LoopKernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` generating tuples ``(sortable_str_identifier, code)``, @@ -376,7 +376,8 @@ def register_preamble_generators(kernel, preamble_generators): """ from loopy.tools import unpickles_equally - new_pgens = kernel.preamble_generators[:] + new_pgens = tuple(kernel.preamble_generators) + for pgen in preamble_generators: if pgen not in new_pgens: if not unpickles_equally(pgen): @@ -385,7 +386,7 @@ def register_preamble_generators(kernel, preamble_generators): "and would thus disrupt loopy's caches" % pgen) - new_pgens.insert(0, pgen) + new_pgens = (pgen,) + new_pgens return kernel.copy(preamble_generators=new_pgens) @@ -394,7 +395,7 @@ def register_preamble_generators(kernel, preamble_generators): def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally - new_manglers = kernel.symbol_manglers[:] + new_manglers = kernel.symbol_manglers for m in manglers: if m not in new_manglers: if not unpickles_equally(m): @@ -403,7 +404,7 @@ def register_symbol_manglers(kernel, manglers): "and would disrupt loopy's caches" % m) - new_manglers.insert(0, m) + new_manglers = (m,) + new_manglers return kernel.copy(symbol_manglers=new_manglers) @@ -484,7 +485,8 @@ def make_copy_kernel(new_dim_tags, old_dim_tags=None): result = make_kernel(set_str, "output[%s] = input[%s]" % (commad_indices, commad_indices), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + lang_version=MOST_RECENT_LANGUAGE_VERSION, + 
default_offset=auto) result = tag_array_axes(result, "input", old_dim_tags) result = tag_array_axes(result, "output", new_dim_tags) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 8b89391e29b1d1f1d770aab24434a94ed4562a7b..2b5ccb54d5a18ed2f4ceaf762619aea73fa4d86d 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -20,16 +20,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from typing import TYPE_CHECKING, Tuple, Optional +from dataclasses import dataclass from warnings import warn -from pytools import Record - import numpy as np import loopy as lp +from loopy.kernel.array import get_strides from loopy.diagnostic import LoopyError, AutomaticTestFailure +if TYPE_CHECKING: + import pyopencl.array as cla + AUTO_TEST_SKIP_RUN = False @@ -68,13 +72,34 @@ def fill_rand(ary): fill_rand(ary) -class TestArgInfo(Record): - pass +@dataclass +class TestArgInfo: + name: str + ref_array: "cla.Array" + ref_storage_array: "cla.Array" + + ref_pre_run_array: "cla.Array" + ref_pre_run_storage_array: "cla.Array" + + ref_shape: Tuple[int, ...] + ref_strides: Tuple[int, ...] + ref_alloc_size: int + ref_numpy_strides: Tuple[int, ...] + needs_checking: bool + + # The attributes below are being modified in make_args, hence this dataclass + # cannot be frozen. + test_storage_array: Optional["cla.Array"] = None + test_array: Optional["cla.Array"] = None + test_shape: Optional[Tuple[int, ...]] = None + test_strides: Optional[Tuple[int, ...]] = None + test_numpy_strides: Optional[Tuple[int, ...]] = None + test_alloc_size: Optional[Tuple[int, ...]] = None # {{{ "reference" arguments -def make_ref_args(kernel, impl_arg_info, queue, parameters): +def make_ref_args(kernel, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -86,13 +111,8 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_args = {} ref_arg_data = [] - for arg in impl_arg_info: - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - - if arg.arg_class is ValueArg: - if arg.offset_for_name: - continue - + for arg in kernel.args: + if isinstance(arg, ValueArg): arg_value = parameters[arg.name] try: @@ -107,25 +127,24 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_arg_data.append(None) - elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \ - or arg.arg_class is ConstantArg: + elif isinstance(arg, (ArrayArg, ImageArg, ConstantArg)): if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " "testing" % arg.name) - shape = evaluate_shape(arg.unvec_shape, parameters) - dtype = kernel_arg.dtype + shape = evaluate_shape(arg.shape, parameters) + dtype = arg.dtype - is_output = kernel_arg.is_output + is_output = arg.is_output - if arg.arg_class is ImageArg: + if isinstance(arg, ImageArg): storage_array = ary = cl_array.empty( queue, shape, dtype, order="C") numpy_strides = None alloc_size = None strides = None else: - strides = evaluate(arg.unvec_strides, parameters) + strides = evaluate(get_strides(arg), parameters) alloc_size = sum(astrd*(alen-1) if astrd != 0 else alen-1 for alen, astrd in zip(shape, strides)) + 1 @@ -142,13 +161,13 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): storage_array = cl_array.empty(queue, alloc_size, dtype) - if is_output and arg.arg_class is ImageArg: + if is_output and isinstance(arg, ImageArg): raise LoopyError("write-mode images not supported in " "automatic testing") fill_rand(storage_array) - if 
arg.arg_class is ImageArg: + if isinstance(arg, ImageArg): # must be contiguous pre_run_ary = pre_run_storage_array = storage_array.copy() @@ -191,20 +210,17 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): # {{{ "full-scale" arguments -def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): +def make_args(kernel, queue, ref_arg_data, parameters): import pyopencl as cl import pyopencl.array as cl_array - from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\ - TemporaryVariable, ConstantArg + from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, ConstantArg from pymbolic import evaluate args = {} - for arg, arg_desc in zip(impl_arg_info, ref_arg_data): - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - - if arg.arg_class is ValueArg: + for arg, arg_desc in zip(kernel.args, ref_arg_data): + if isinstance(arg, ValueArg): arg_value = parameters[arg.name] try: @@ -217,24 +233,23 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): args[arg.name] = arg_value - elif arg.arg_class is ImageArg: + elif isinstance(arg, ImageArg): if arg.name in kernel.get_written_variables(): raise NotImplementedError("write-mode images not supported in " "automatic testing") - shape = evaluate_shape(arg.unvec_shape, parameters) + shape = evaluate_shape(arg.shape, parameters) assert shape == arg_desc.ref_shape # must be contiguous args[arg.name] = cl.image_from_array( queue.context, arg_desc.ref_pre_run_array.get()) - elif arg.arg_class is ArrayArg or\ - arg.arg_class is ConstantArg: - shape = evaluate(arg.unvec_shape, parameters) - strides = evaluate(arg.unvec_strides, parameters) + elif isinstance(arg, (ArrayArg, ConstantArg)): + shape = evaluate(arg.shape, parameters) + strides = evaluate(get_strides(arg), parameters) - dtype = kernel_arg.dtype + dtype = arg.dtype itemsize = dtype.itemsize numpy_strides = [itemsize*s for s in strides] @@ -280,10 +295,6 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): arg_desc.test_numpy_strides = numpy_strides arg_desc.test_alloc_size = alloc_size - elif arg.arg_class is TemporaryVariable: - # global temporary, handled by invocation logic - pass - else: raise LoopyError("arg type not understood") @@ -309,6 +320,7 @@ def _default_check_result(result, ref_result): / np.max(np.abs(ref_result-result))) # pylint: disable=bad-string-format-type return (False, + # pylint: disable=bad-string-format-type "results do not match -- (rel) l_2 err: %g, l_inf err: %g" % (l2_err, linf_err)) else: @@ -455,9 +467,6 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) ref_codegen_result = lp.generate_code_v2(ref_prog) - ref_implemented_data_info = ref_codegen_result.implemented_data_infos[ - ref_entrypoint] - logger.info("{} (ref): trying {} for the reference calculation".format( ref_entrypoint, dev)) @@ -471,9 +480,7 @@ def auto_test_vs_ref( try: ref_args, ref_arg_data = \ - make_ref_args(ref_prog[ref_entrypoint], - ref_implemented_data_info, - ref_queue, parameters) + make_ref_args(ref_prog[ref_entrypoint], ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: @@ -548,8 +555,6 @@ def auto_test_vs_ref( test_prog_codegen_result = lp.generate_code_v2(test_prog) args = make_args(test_prog[test_entrypoint], - test_prog_codegen_result.implemented_data_infos[ - test_entrypoint], queue, ref_arg_data, parameters) args["out_host"] = False diff --git a/loopy/check.py b/loopy/check.py index 
9d0c04f28c55c9f123a874913c13e56c33497e54..0f3faced863685c389d04ff2f9db3d588684e118 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -20,9 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from typing import Union, Tuple, Optional, List +import numpy as np from islpy import dim_type import islpy as isl + from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyIndexError) @@ -30,7 +33,13 @@ from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction, NoOpInstruction) +from loopy.kernel import LoopKernel +from loopy.kernel.array import ( + FixedStrideArrayDimTag, SeparateArrayArrayDimTag, ArrayBase) +from loopy.kernel.data import auto, ArrayArg, ArrayDimImplementationTag from loopy.translation_unit import for_each_kernel +from loopy.typing import ExpressionT + from pytools import memoize_method from collections import defaultdict @@ -146,6 +155,131 @@ def check_functions_are_resolved(kernel): else: raise NotImplementedError(type(insn)) + +@for_each_kernel +def check_separated_array_consistency(kernel: LoopKernel) -> None: + # Boo. This is (part of) the price of redundant representation. + for arg in kernel.args: + if isinstance(arg, ArrayArg) and arg._separation_info is not None: + sep_indices = arg._separation_info.sep_axis_indices_set + for subarg_name in arg._separation_info.subarray_names.values(): + sub_arg = kernel.arg_dict[subarg_name] + + from loopy.preprocess import _remove_at_indices + + assert arg.shape is None or isinstance(arg.shape, tuple) + if _remove_at_indices(sep_indices, arg.shape) != sub_arg.shape: + raise LoopyError( + f"Shapes of '{arg.name}' and associated sep array " + "'{sub_arg.name}' are not consistent.") + + assert arg.dim_tags is None or isinstance(arg.dim_tags, tuple) + if _remove_at_indices(sep_indices, arg.dim_tags) != sub_arg.dim_tags: + raise LoopyError( + f"Dim tags of '{arg.name}' and associated sep array " + "'{sub_arg.name}' are not consistent.") + + for attr_name in ["address_space", "is_input", "is_output"]: + if getattr(arg, attr_name) != getattr(sub_arg, attr_name): + raise LoopyError( + "Attribute '{attr_name}' of " + f"'{arg.name}' and associated sep array " + f"'{sub_arg.name}' is not consistent.") + + +@for_each_kernel +def check_offsets_and_dim_tags(kernel: LoopKernel) -> None: + from loopy.symbolic import DependencyMapper + from pymbolic.primitives import Variable, Expression + + arg_name_vars = {Variable(name) for name in kernel.arg_dict} + dep_mapper = DependencyMapper() + + def ensure_depends_only_on_arguments( + what: str, expr: Union[str, ExpressionT]) -> None: + if isinstance(expr, str): + expr = Variable(expr) + + deps = dep_mapper(expr) + if not deps <= arg_name_vars: + raise LoopyError( + f"not all names in {what} are arguments: " + + ", ".join(str(v) for v in deps - arg_name_vars)) + + # {{{ process arguments + + new_args = [] + for arg in kernel.args: + if isinstance(arg, ArrayArg): + what = f"offset of argument '{arg.name}'" + if arg.offset is None: + continue + if arg.offset is auto: + pass + elif isinstance(arg.offset, (int, np.integer, Expression, str)): + ensure_depends_only_on_arguments(what, arg.offset) + + else: + raise LoopyError(f"invalid value of offset for '{arg.name}'") + + if arg.dim_tags is None: + new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = 
\ + arg.dim_tags + else: + new_dim_tags = () + for iaxis, dim_tag in enumerate(arg.dim_tags): + if isinstance(dim_tag, FixedStrideArrayDimTag): + what = (f"stride for axis {iaxis+1} (1-based) of " + f"of argument '{arg.name}'") + if dim_tag.stride is auto: + pass + elif isinstance( + dim_tag.stride, (int, np.integer, Expression)): + ensure_depends_only_on_arguments(what, dim_tag.stride) + else: + raise LoopyError(f"invalid value of {what}") + + assert new_dim_tags is not None + new_dim_tags = new_dim_tags + (dim_tag,) + + arg = arg.copy(dim_tags=new_dim_tags) + + new_args.append(arg) + + # }}} + + # {{{ process temporary variables + + for tv in kernel.temporary_variables.values(): + what = f"offset of temporary '{tv.name}'" + if tv.offset is None: + pass + if tv.offset is auto: + pass + elif isinstance(tv.offset, (int, np.integer, Expression, str)): + ensure_depends_only_on_arguments(what, tv.offset) + else: + raise LoopyError(f"invalid value of offset for '{tv.name}'") + + if tv.dim_tags is not None: + for iaxis, dim_tag in enumerate(tv.dim_tags): + if isinstance(dim_tag, FixedStrideArrayDimTag): + what = ("axis stride for axis " + f"{iaxis+1} (1-based) of temporary '{tv.name}'") + if dim_tag.stride is auto: + raise LoopyError(f"The {what}" f" is 'auto', " + "which is not allowed.") + elif isinstance(dim_tag.stride, (int, np.integer, Expression)): + ensure_depends_only_on_arguments(what, dim_tag.stride) + else: + raise LoopyError(f"invalid value of {what}") + + elif isinstance(dim_tag, SeparateArrayArrayDimTag): + raise LoopyError(f"Axis {iaxis+1} of temporary " + f"'{tv.name} is tagged 'sep'. This is not allowed.") + + # }}} + # }}} @@ -465,24 +599,6 @@ def check_for_write_races(kernel): WriteRaceConditionWarning) -@for_each_kernel -def check_for_orphaned_user_hardware_axes(kernel): - from loopy.kernel.data import LocalInameTag - for axis in kernel.local_sizes: - found = False - for iname in kernel.inames.values(): - for tag in iname.tags: - if isinstance(tag, LocalInameTag) and tag.axis == axis: - found = True - break - if found: - break - - if not found: - raise LoopyError("user-requested local hardware axis %d " - "has no iname mapped to it" % axis) - - @for_each_kernel def check_for_data_dependent_parallel_bounds(kernel): """ @@ -1118,12 +1234,13 @@ def pre_schedule_checks(t_unit): check_for_integer_subscript_indices(t_unit) check_functions_are_resolved(t_unit) + check_separated_array_consistency(t_unit) + check_offsets_and_dim_tags(t_unit) # Ordering restriction: # check_sub_array_ref_inames_not_within_or_redn_inames should be done # before check_bounds. See: BatchedAccessMapMapper.map_sub_array_ref. 
check_sub_array_ref_inames_not_within_or_redn_inames(t_unit) check_for_duplicate_insn_ids(t_unit) - check_for_orphaned_user_hardware_axes(t_unit) check_for_double_use_of_hw_axes(t_unit) check_insn_attributes(t_unit) check_loop_priority_inames_known(t_unit) @@ -1151,6 +1268,37 @@ def pre_schedule_checks(t_unit): # {{{ post-schedule / pre-code-generation checks +# {{{ check_for_nested_base_storage + +def check_for_nested_base_storage(kernel: LoopKernel) -> None: + # must run after preprocessing has created variables for base_storage + + from loopy.kernel.data import ArrayArg + arrays: List[ArrayBase] = [ + arg for arg in kernel.args if isinstance(arg, ArrayArg) + ] + arrays = arrays + list(kernel.temporary_variables.values()) + + name_to_array = {ary.name: ary for ary in arrays} + + for ary in kernel.temporary_variables.values(): + if ary.base_storage: + storage_array = name_to_array.get(ary.base_storage, None) + + if storage_array is None: + raise ValueError("nothing known about storage array " + f"'{ary.base_storage}' serving as base_storage of " + f"'{ary.name}'") + + if storage_array.base_storage: + raise ValueError("storage array " + f"'{storage_array.name}' serving as base_storage of " + f"'{ary.name}' may not itself use base_storage " + "(currently given as '{storage_array.base_storage}'") + +# }}} + + # {{{ check for unused hw axes def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, @@ -1540,8 +1688,6 @@ def _validate_kernel_call_sites_inner(kernel, callables): def validate_kernel_call_sites(translation_unit): - from loopy import LoopKernel - for name in translation_unit.callables_table: clbl = translation_unit[name] if isinstance(clbl, LoopKernel): @@ -1562,6 +1708,7 @@ def pre_codegen_entrypoint_checks(kernel, callables_table): def pre_codegen_callable_checks(kernel, callables_table): logger.debug("pre-codegen callable check %s: start" % kernel.name) + check_for_nested_base_storage(kernel) check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) diff --git a/loopy/cli.py b/loopy/cli.py index d7e6d148c8de44bbdefbb40b32190faf42e4e34a..426fb2e3e2a04b7be33778ffd2e9a0776cbb827d 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -77,7 +77,7 @@ def main(): target = ISPCTarget elif args.target == "ispc-occa": from loopy.target.ispc import ISPCTarget - target = lambda: ISPCTarget(occa_mode=True) # noqa: E731 + target = lambda: ISPCTarget() # noqa: E731 elif args.target == "c": from loopy.target.c import CTarget target = CTarget diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index edf59d62a17b4f1ddf8c92be927f8c1f367cb9d4..a700b153ab4b62ce9ea66cecd6ae2de70f4b5664 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -20,17 +20,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import sys +from immutables import Map +from typing import (Set, Mapping, Sequence, Any, FrozenSet, Union, + Optional, Tuple, TYPE_CHECKING) +from dataclasses import dataclass, replace import logging logger = logging.getLogger(__name__) import islpy as isl from loopy.diagnostic import LoopyError, warn -from pytools import ImmutableRecord +from pytools import UniqueNameGenerator from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from loopy.types import LoopyType +from loopy.typing import ExpressionT +from loopy.kernel import LoopKernel +from loopy.target import TargetBase +from loopy.kernel.function_interface import InKernelCallable from loopy.symbolic import CombineMapper @@ -40,11 +50,19 @@ from loopy.kernel.function_interface import CallableKernel from pytools import ProcessLogger +if TYPE_CHECKING: + from loopy.codegen.tools import CodegenOperationCacheManager + from loopy.codegen.result import GeneratedProgram + + +if getattr(sys, "_BUILDING_SPHINX_DOCS", False): + from loopy.codegen.tools import CodegenOperationCacheManager # noqa: F811 + from loopy.codegen.result import GeneratedProgram # noqa: F811 + + __doc__ = """ .. currentmodule:: loopy.codegen -.. autoclass:: ImplementedDataInfo - .. autoclass:: PreambleInfo .. autoclass:: VectorizationInfo @@ -61,84 +79,13 @@ __doc__ = """ """ -# {{{ implemented data info - -class ImplementedDataInfo(ImmutableRecord): - """ - .. attribute:: name - - The expanded name of the array. Note that, for example - in the case of separate-array-tagged axes, multiple - implemented arrays may correspond to one user-facing - array. - - .. attribute:: dtype - - .. attribute:: arg_class - - .. attribute:: base_name - - The user-facing name of the underlying array. - May be *None* for non-array arguments. - - .. attribute:: shape - .. attribute:: strides - - Strides in multiples of ``dtype.itemsize``. - - .. attribute:: unvec_shape - .. attribute:: unvec_strides - - Strides in multiples of ``dtype.itemsize`` that accounts for - :class:`loopy.kernel.array.VectorArrayDimTag` in a scalar - manner - - - .. attribute:: offset_for_name - .. attribute:: stride_for_name_and_axis - - A tuple *(name, axis)* indicating the (implementation-facing) - name of the array and axis number for which this argument provides - the strides. - - .. attribute:: allows_offset - .. attribute:: is_written - """ - - def __init__(self, target, name, dtype, arg_class, - base_name=None, - shape=None, strides=None, - unvec_shape=None, unvec_strides=None, - offset_for_name=None, stride_for_name_and_axis=None, - allows_offset=None, - is_written=None): - - from loopy.types import LoopyType - assert isinstance(dtype, LoopyType) - - ImmutableRecord.__init__(self, - name=name, - dtype=dtype, - arg_class=arg_class, - base_name=base_name, - shape=shape, - strides=strides, - unvec_shape=unvec_shape, - unvec_strides=unvec_strides, - offset_for_name=offset_for_name, - stride_for_name_and_axis=stride_for_name_and_axis, - allows_offset=allows_offset, - is_written=is_written) - -# }}} - - # {{{ code generation state class UnvectorizableError(Exception): pass +@dataclass(frozen=True) class VectorizationInfo: """ .. attribute:: iname @@ -146,13 +93,14 @@ class VectorizationInfo: .. attribute:: space """ - def __init__(self, iname, length, space): - self.iname = iname - self.length = length - self.space = space + iname: str + length: int + # FIXME why is this here? 
+ space: isl.Space -class SeenFunction(ImmutableRecord): +@dataclass(frozen=True) +class SeenFunction: """This is used to track functions that emerge late during code generation, e.g. C functions to realize arithmetic. No connection with :class:`~loopy.kernel.function_interface.InKernelCallable`. @@ -167,23 +115,17 @@ class SeenFunction(ImmutableRecord): a tuple of result dtypes """ - - def __init__(self, name, c_name, arg_dtypes, result_dtypes): - ImmutableRecord.__init__(self, - name=name, - c_name=c_name, - arg_dtypes=arg_dtypes, - result_dtypes=result_dtypes) + name: str + c_name: str + arg_dtypes: Tuple[LoopyType, ...] + result_dtypes: Tuple[LoopyType, ...] +@dataclass(frozen=True) class CodeGenerationState: """ .. attribute:: kernel .. attribute:: target - .. attribute:: implemented_data_info - - a list of :class:`ImplementedDataInfo` objects. - .. attribute:: implemented_domain The entire implemented domain (as an :class:`islpy.Set`) @@ -210,7 +152,8 @@ class CodeGenerationState: .. attribute:: vectorization_info - None or an instance of :class:`VectorizationInfo` + *None* (to mean vectorization has not yet been applied), or an instance of + :class:`VectorizationInfo`. .. attribute:: is_generating_device_code @@ -237,105 +180,48 @@ class CodeGenerationState: An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`. """ - def __init__(self, kernel, target, - implemented_data_info, implemented_domain, implemented_predicates, - seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, - allow_complex, - callables_table, - is_entrypoint, - vectorization_info=None, var_name_generator=None, - is_generating_device_code=None, - gen_program_name=None, - schedule_index_end=None, - codegen_cachemanager=None): - self.kernel = kernel - self.target = target - self.implemented_data_info = implemented_data_info - self.implemented_domain = implemented_domain - self.implemented_predicates = implemented_predicates - self.seen_dtypes = seen_dtypes - self.seen_functions = seen_functions - self.seen_atomic_dtypes = seen_atomic_dtypes - self.var_subst_map = var_subst_map.copy() - self.allow_complex = allow_complex - self.callables_table = callables_table - self.is_entrypoint = is_entrypoint - self.vectorization_info = vectorization_info - self.var_name_generator = var_name_generator - self.is_generating_device_code = is_generating_device_code - self.gen_program_name = gen_program_name - self.schedule_index_end = schedule_index_end - self.codegen_cachemanager = codegen_cachemanager + kernel: LoopKernel + target: TargetBase + implemented_domain: isl.Set + implemented_predicates: FrozenSet[Union[str, ExpressionT]] + + # /!\ mutable + seen_dtypes: Set[LoopyType] + seen_functions: Set[SeenFunction] + seen_atomic_dtypes: Set[LoopyType] + + var_subst_map: Map[str, ExpressionT] + allow_complex: bool + callables_table: Mapping[str, InKernelCallable] + is_entrypoint: bool + var_name_generator: UniqueNameGenerator + is_generating_device_code: bool + gen_program_name: str + schedule_index_end: int + codegen_cachemanager: "CodegenOperationCacheManager" + vectorization_info: Optional[VectorizationInfo] = None + + def __post_init__(self): + # FIXME: If this doesn't bomb during testing, we can get rid of target. 
+ assert self.target == self.kernel.target + + assert self.vectorization_info is None or isinstance( + self.vectorization_info, VectorizationInfo) # {{{ copy helpers - def copy(self, kernel=None, target=None, implemented_data_info=None, - implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, is_entrypoint=None, vectorization_info=None, - is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): - - if kernel is None: - kernel = self.kernel - - if target is None: - target = self.target - - if implemented_data_info is None: - implemented_data_info = self.implemented_data_info - - if is_entrypoint is None: - is_entrypoint = self.is_entrypoint - - if vectorization_info is False: - vectorization_info = None - - elif vectorization_info is None: - vectorization_info = self.vectorization_info - - if is_generating_device_code is None: - is_generating_device_code = self.is_generating_device_code - - if gen_program_name is None: - gen_program_name = self.gen_program_name - - if schedule_index_end is None: - schedule_index_end = self.schedule_index_end - - return CodeGenerationState( - kernel=kernel, - target=target, - implemented_data_info=implemented_data_info, - implemented_domain=implemented_domain or self.implemented_domain, - implemented_predicates=( - implemented_predicates or self.implemented_predicates), - seen_dtypes=self.seen_dtypes, - seen_functions=self.seen_functions, - seen_atomic_dtypes=self.seen_atomic_dtypes, - var_subst_map=var_subst_map or self.var_subst_map, - allow_complex=self.allow_complex, - callables_table=self.callables_table, - is_entrypoint=is_entrypoint, - vectorization_info=vectorization_info, - var_name_generator=self.var_name_generator, - is_generating_device_code=is_generating_device_code, - gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end, - codegen_cachemanager=self.codegen_cachemanager.with_kernel(kernel), - ) + def copy(self, **kwargs: Any) -> "CodeGenerationState": + return replace(self, **kwargs) - def copy_and_assign(self, name, value): + def copy_and_assign( + self, name: str, value: ExpressionT) -> "CodeGenerationState": """Make a copy of self with variable *name* fixed to *value*.""" - var_subst_map = self.var_subst_map.copy() - var_subst_map[name] = value - return self.copy(var_subst_map=var_subst_map) + return self.copy(var_subst_map=self.var_subst_map.set(name, value)) - def copy_and_assign_many(self, assignments): + def copy_and_assign_many(self, assignments) -> "CodeGenerationState": """Make a copy of self with *assignments* included.""" - var_subst_map = self.var_subst_map.copy() - var_subst_map.update(assignments) - return self.copy(var_subst_map=var_subst_map) + return self.copy(var_subst_map=self.var_subst_map.update(assignments)) # }}} @@ -396,8 +282,10 @@ class CodeGenerationState: def unvectorize(self, func): vinf = self.vectorization_info + assert vinf is not None + result = [] - novec_self = self.copy(vectorization_info=False) + novec_self = self.copy(vectorization_info=None) for i in range(vinf.length): idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i @@ -453,14 +341,15 @@ class InKernelCallablesCollector(CombineMapper): map_type_cast = map_constant -class PreambleInfo(ImmutableRecord): - """ - .. attribute:: kernel - .. attribute:: seen_dtypes - .. attribute:: seen_functions - .. attribute:: seen_atomic_dtypes - .. 
attribute:: codegen_state - """ +@dataclass(frozen=True) +class PreambleInfo: + kernel: LoopKernel + seen_dtypes: Set[LoopyType] + seen_functions: Set[SeenFunction] + seen_atomic_dtypes: Set[LoopyType] + + # FIXME: This makes all the above redundant. It probably shouldn't be here. + codegen_state: CodeGenerationState # {{{ main code generation entrypoint @@ -480,47 +369,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") - # {{{ pre-codegen-process of non-entrypoint kernel - - if not is_entrypoint: - from loopy.kernel.array import ArrayBase - from loopy.kernel.data import auto - - new_args = [arg.copy(offset=0 if arg.offset is auto else arg.offset) - if isinstance(arg, ArrayBase) - else arg - for arg in kernel.args] - kernel = kernel.copy(args=new_args) - - # }}} - # {{{ examine arg list - from loopy.kernel.data import ValueArg - from loopy.kernel.array import ArrayBase - - implemented_data_info = [] - - for arg in kernel.args: - is_written = arg.name in kernel.get_written_variables() - if isinstance(arg, ArrayBase): - implemented_data_info.extend( - arg.decl_info( - target, - is_written=is_written, - index_dtype=kernel.index_dtype)) - - elif isinstance(arg, ValueArg): - implemented_data_info.append(ImplementedDataInfo( - target=target, - name=arg.name, - dtype=arg.dtype, - arg_class=ValueArg, - is_written=is_written)) - - else: - raise ValueError("argument type not understood: '%s'" % type(arg)) - allow_complex = False for var in kernel.args + list(kernel.temporary_variables.values()): if var.dtype.involves_complex(): @@ -539,13 +389,12 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, codegen_state = CodeGenerationState( kernel=kernel, target=target, - implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), seen_dtypes=seen_dtypes, seen_functions=seen_functions, seen_atomic_dtypes=seen_atomic_dtypes, - var_subst_map={}, + var_subst_map=Map(), allow_complex=allow_complex, var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, @@ -573,17 +422,16 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, # {{{ handle preambles - for idi in codegen_state.implemented_data_info: - seen_dtypes.add(idi.dtype) + for arg in kernel.args: + seen_dtypes.add(arg.dtype) for tv in kernel.temporary_variables.values(): - for idi in tv.decl_info(kernel.target, index_dtype=kernel.index_dtype): - seen_dtypes.add(idi.dtype) + seen_dtypes.add(tv.dtype) if kernel.all_inames(): seen_dtypes.add(kernel.index_dtype) - preambles = kernel.preambles[:] + preambles = list(kernel.preambles) preamble_info = PreambleInfo( kernel=kernel, @@ -594,8 +442,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, codegen_state=codegen_state ) - preamble_generators = (kernel.preamble_generators - + target.get_device_ast_builder().preamble_generators()) + preamble_generators = (list(kernel.preamble_generators) + + list(target.get_device_ast_builder().preamble_generators())) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -648,7 +496,8 @@ def diverge_callee_entrypoints(program): return program.copy(callables_table=new_callables) -class TranslationUnitCodeGenerationResult(ImmutableRecord): +@dataclass(frozen=True) +class TranslationUnitCodeGenerationResult: """ .. 
attribute:: host_program @@ -663,16 +512,16 @@ class TranslationUnitCodeGenerationResult(ImmutableRecord): .. attribute:: host_preambles .. attribute:: device_preambles - .. attribute:: implemented_data_infos - - A mapping from names of entrypoints to their - list of :class:`ImplementedDataInfo` objects. - .. automethod:: host_code .. automethod:: device_code .. automethod:: all_code """ + host_programs: Mapping[str, "GeneratedProgram"] + device_programs: Sequence["GeneratedProgram"] + host_preambles: Sequence[Tuple[int, str]] = () + device_preambles: Sequence[Tuple[int, str]] = () + def host_code(self): from loopy.codegen.result import process_preambles preamble_codes = process_preambles(getattr(self, "host_preambles", [])) @@ -695,9 +544,9 @@ class TranslationUnitCodeGenerationResult(ImmutableRecord): def all_code(self): from loopy.codegen.result import process_preambles preamble_codes = process_preambles( - getattr(self, "host_preambles", []) + tuple(getattr(self, "host_preambles", ())) + - getattr(self, "device_preambles", []) + tuple(getattr(self, "device_preambles", ())) ) return ( @@ -768,7 +617,6 @@ def generate_code_v2(program): device_programs = [] device_preambles = [] callee_fdecls = [] - implemented_data_infos = {} # {{{ collect host/device programs @@ -780,7 +628,6 @@ def generate_code_v2(program): func_id in program.entrypoints) if func_id in program.entrypoints: host_programs[func_id] = cgr.host_program - implemented_data_infos[func_id] = cgr.implemented_data_info else: assert len(cgr.device_programs) == 1 callee_fdecls.append(cgr.device_programs[0].ast.fdecl) @@ -805,8 +652,7 @@ def generate_code_v2(program): cgr = TranslationUnitCodeGenerationResult( host_programs=host_programs, device_programs=device_programs, - device_preambles=device_preambles, - implemented_data_infos=implemented_data_infos) + device_preambles=device_preambles) if CACHING_ENABLED: code_gen_cache.store_if_not_present(input_program, cgr) @@ -820,6 +666,11 @@ def generate_code(kernel, device=None): warn("passing 'device' to generate_code() is deprecated", DeprecationWarning, stacklevel=2) + if device is not None: + from warnings import warn + warn("generate_code is deprecated and will stop working in 2023. " + "Call generate_code_v2 instead.", DeprecationWarning, stacklevel=2) + codegen_result = generate_code_v2(kernel) if len(codegen_result.device_programs) > 1: @@ -829,10 +680,7 @@ def generate_code(kernel, device=None): raise LoopyError("kernel passed to generate_code yielded multiple " "host programs. Use generate_code_v2.") - assert len(codegen_result.implemented_data_infos) == 1 - implemented_data_info, = codegen_result.implemented_data_infos.values() - - return codegen_result.device_code(), implemented_data_info + return codegen_result.device_code(), None # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 8d98196f2d74955867c44ac6d2e66ab259a07bb7..9f597f8478c1b0524112c8bd261a17e132e81fc3 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,47 +23,15 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl + +from loopy.codegen.result import merge_codegen_results, wrap_in_if from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, gather_schedule_block, generate_sub_sched_items) from loopy.diagnostic import LoopyError -def synthesize_idis_for_extra_args(kernel, schedule_index): - """ - :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` - """ - sched_item = kernel.linearization[schedule_index] - - from loopy.codegen import ImplementedDataInfo - from loopy.kernel.data import InameArg, AddressSpace - - assert isinstance(sched_item, CallKernel) - - idis = [] - - for arg in sched_item.extra_args: - temporary = kernel.temporary_variables[arg] - assert temporary.address_space == AddressSpace.GLOBAL - idis.extend( - temporary.decl_info( - kernel.target, - index_dtype=kernel.index_dtype)) - - for iname in sched_item.extra_inames: - idis.append( - ImplementedDataInfo( - target=kernel.target, - name=iname, - dtype=kernel.index_dtype, - arg_class=InameArg, - is_written=False)) - - return idis - - def generate_code_for_sched_index(codegen_state, sched_index): kernel = codegen_state.kernel sched_item = kernel.linearization[sched_index] @@ -75,14 +43,11 @@ def generate_code_for_sched_index(codegen_state, sched_index): _, past_end_i = gather_schedule_block(kernel.linearization, sched_index) assert past_end_i <= codegen_state.schedule_index_end - extra_args = synthesize_idis_for_extra_args(kernel, sched_index) - new_codegen_state = codegen_state.copy( is_generating_device_code=True, gen_program_name=sched_item.kernel_name, schedule_index_end=past_end_i-1, - implemented_data_info=(codegen_state.implemented_data_info - + extra_args)) + ) from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( @@ -98,8 +63,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): codegen_state.ast_builder.get_kernel_call( codegen_state, sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), + glob_grid, loc_grid) ]) else: # do not generate host code for non-entrypoint kernels @@ -157,8 +121,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): return CodeGenerationResult( host_program=None, device_programs=[], - implemented_domains={}, - implemented_data_info=codegen_state.implemented_data_info) + implemented_domains={}) else: raise LoopyError("do not know how to emit code for barrier " diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7523c11d7e4ca9764abbe9cec4a917ade8128974..e19686929d787a06cb562ebfc975ffe509e482c6 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -20,10 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from pytools import ImmutableRecord +from typing import Any, Sequence, Mapping, Tuple, Optional +from dataclasses import dataclass, replace +import islpy as isl -def process_preambles(preambles): + +def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]: seen_preamble_tags = set() dedup_preambles = [] @@ -55,7 +58,8 @@ __doc__ = """ # {{{ code generation result -class GeneratedProgram(ImmutableRecord): +@dataclass(frozen=True) +class GeneratedProgram: """ .. attribute:: name @@ -73,8 +77,17 @@ class GeneratedProgram(ImmutableRecord): the overall function definition. 
""" + name: str + is_device_program: bool + ast: Any + body_ast: Optional[Any] = None + + def copy(self, **kwargs: Any) -> "GeneratedProgram": + return replace(self, **kwargs) -class CodeGenerationResult(ImmutableRecord): + +@dataclass(frozen=True) +class CodeGenerationResult: """ .. attribute:: host_program .. attribute:: device_programs @@ -93,12 +106,15 @@ class CodeGenerationResult(ImmutableRecord): .. automethod:: host_code .. automethod:: device_code .. automethod:: all_code - - .. attribute:: implemented_data_info - - a list of :class:`loopy.codegen.ImplementedDataInfo` objects. - Only added at the very end of code generation. """ + host_program: Optional[GeneratedProgram] + device_programs: Sequence[GeneratedProgram] + implemented_domains: Mapping[str, isl.Set] + host_preambles: Sequence[Tuple[int, str]] = () + device_preambles: Sequence[Tuple[int, str]] = () + + def copy(self, **kwargs: Any) -> "CodeGenerationResult": + return replace(self, **kwargs) @staticmethod def new(codegen_state, insn_id, ast, implemented_domain): @@ -119,12 +135,12 @@ class CodeGenerationResult(ImmutableRecord): } return CodeGenerationResult( - implemented_data_info=codegen_state.implemented_data_info, implemented_domains={insn_id: [implemented_domain]}, **kwargs) def host_code(self): - preamble_codes = process_preambles(getattr(self, "host_preambles", [])) + assert self.host_program is not None + preamble_codes = process_preambles(self.host_preambles) return ( "".join(preamble_codes) @@ -132,7 +148,7 @@ class CodeGenerationResult(ImmutableRecord): str(self.host_program.ast)) def device_code(self): - preamble_codes = process_preambles(getattr(self, "device_preambles", [])) + preamble_codes = process_preambles(self.device_preambles) return ( "".join(preamble_codes) @@ -140,6 +156,7 @@ class CodeGenerationResult(ImmutableRecord): + "\n\n".join(str(dp.ast) for dp in self.device_programs)) def all_code(self): + assert self.host_program is not None preamble_codes = process_preambles( getattr(self, "host_preambles", []) + @@ -178,7 +195,7 @@ class CodeGenerationResult(ImmutableRecord): assert program.is_device_program return self.copy( device_programs=( - self.device_programs[:-1] + list(self.device_programs[:-1]) + [program])) else: @@ -207,8 +224,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): return CodeGenerationResult( host_program=None, device_programs=[], - implemented_domains={}, - implemented_data_info=codegen_state.implemented_data_info) + implemented_domains={}) ast_els = [] new_device_programs = [] @@ -260,7 +276,6 @@ def merge_codegen_results(codegen_state, elements, collapse=True): .with_new_ast(codegen_state, ast) .copy( implemented_domains=implemented_domains, - implemented_data_info=codegen_state.implemented_data_info, **kwargs)) @@ -330,3 +345,5 @@ def generate_host_or_device_program(codegen_state, schedule_index): return codegen_result # }}} + +# vim: foldmethod=marker diff --git a/loopy/codegen/tools.py b/loopy/codegen/tools.py index ba8e1cfc9270ad5a5b14f6b13eb5fa5fa907cb39..d9206ed4d93b5095c315433ba63f077245068349 100644 --- a/loopy/codegen/tools.py +++ b/loopy/codegen/tools.py @@ -20,7 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from functools import cached_property from pytools import memoize_method + from loopy.schedule import (EnterLoop, LeaveLoop, CallKernel, ReturnFromKernel, Barrier, BeginBlockItem, gather_schedule_block, ScheduleItem) @@ -40,7 +42,7 @@ __doc__ = """ """ -@dataclass +@dataclass(frozen=True) class KernelProxyForCodegenOperationCacheManager: """ Proxy to :class:`loopy.LoopKernel` to be used by @@ -50,8 +52,7 @@ class KernelProxyForCodegenOperationCacheManager: linearization: List[ScheduleItem] inames: Dict[str, Iname] - @property - @memoize_method + @cached_property def id_to_insn(self): return {insn.id: insn for insn in self.instructions} @@ -111,8 +112,7 @@ class CodegenOperationCacheManager: return self - @property - @memoize_method + @cached_property def active_inames(self): """ Returns an instance of :class:`list`, with the i-th entry being a @@ -139,8 +139,7 @@ class CodegenOperationCacheManager: return active_inames - @property - @memoize_method + @cached_property def callkernel_index(self): """ Returns an instance of :class:`list`, with the i-th entry being the index of @@ -165,8 +164,7 @@ class CodegenOperationCacheManager: return callkernel_index - @property - @memoize_method + @cached_property def has_barrier_within(self): """ Returns an instance of :class:`list`. The list's i-th entry is *True* if the diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index a2e631587361e76e258a39f95f00cba5d99e6f3a..02a82ab8a74d78b1342fc09b92c19c663d7f2209 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -22,78 +22,47 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from functools import cached_property +from enum import IntEnum from sys import intern +from typing import ( + Dict, Sequence, Tuple, Mapping, Optional, FrozenSet, Any, Union, + Callable, Iterator, List, Set, TYPE_CHECKING) +from dataclasses import dataclass, replace, field, fields +from warnings import warn from collections import defaultdict import numpy as np -from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method -from pytools.tag import Taggable +from pytools import (memoize_method, + UniqueNameGenerator, generate_unique_names, natsorted) +from pytools.tag import Taggable, Tag import islpy as isl from islpy import dim_type -import re - -from pytools import UniqueNameGenerator, generate_unique_names, natsorted +from immutables import Map from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError -from loopy.kernel.data import filter_iname_tags_by_type, Iname -from warnings import warn - - -# {{{ unique var names - -class _UniqueVarNameGenerator(UniqueNameGenerator): - - def __init__(self, existing_names=frozenset(), forced_prefix=""): - super().__init__(existing_names, forced_prefix) - array_prefix_pattern = re.compile("(.*)_s[0-9]+$") - - array_prefixes = set() - for name in existing_names: - match = array_prefix_pattern.match(name) - if match is None: - continue - - array_prefixes.add(match.group(1)) - - self.conflicting_array_prefixes = array_prefixes - self.array_prefix_pattern = array_prefix_pattern - - def _name_added(self, name): - match = self.array_prefix_pattern.match(name) - if match is None: - return - - self.conflicting_array_prefixes.add(match.group(1)) - - def is_name_conflicting(self, name): - if name in self.existing_names: - return True - - # Array dimensions implemented as separate arrays 
generate - # names by appending '_s<NUMBER>'. Make sure that no - # conflicts can arise from these names. - - # Case 1: a_s0 is already a name; we are trying to insert a - # Case 2: a is already a name; we are trying to insert a_s0 - - if name in self.conflicting_array_prefixes: - return True - - match = self.array_prefix_pattern.match(name) - if match is None: - return False - - return match.group(1) in self.existing_names - -# }}} - +from loopy.kernel.data import ( + _ArraySeparationInfo, + KernelArgument, + filter_iname_tags_by_type, Iname, + TemporaryVariable, ValueArg, ArrayArg, SubstitutionRule) +from loopy.kernel.instruction import InstructionBase +from loopy.types import LoopyType, NumpyType +from loopy.options import Options +from loopy.schedule import ScheduleItem +from loopy.typing import ExpressionT +from loopy.target import TargetBase + +if TYPE_CHECKING: + from loopy.kernel.function_interface import InKernelCallable + from loopy.codegen import PreambleInfo # {{{ loop kernel object -class KernelState: # noqa +class KernelState(IntEnum): # noqa INITIAL = 0 CALLS_RESOLVED = 1 PREPROCESSED = 2 @@ -105,11 +74,18 @@ def _get_inames_from_domains(domains): (frozenset(dom.get_var_names(dim_type.set)) for dom in domains)) -class _not_provided: # noqa: N801 - pass +@dataclass(frozen=True) +class _BoundsRecord: + lower_bound_pw_aff: isl.PwAff + upper_bound_pw_aff: isl.PwAff + size: isl.PwAff -class LoopKernel(ImmutableRecordWithoutPickling, Taggable): +PreambleGenerator = Callable[["PreambleInfo"], Iterator[Tuple[int, str]]] + + +@dataclass(frozen=True) +class LoopKernel(Taggable): """These correspond more or less directly to arguments of :func:`loopy.make_kernel`. @@ -144,7 +120,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): A :class:`islpy.BasicSet` parameter domain. - .. attribute:: local_sizes .. attribute:: temporary_variables A :class:`dict` of mapping variable names to @@ -179,7 +154,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): were applied to the kernel. These are stored so that they may be repeated on expressions the user specifies later. - .. attribute:: cache_manager .. attribute:: options An instance of :class:`loopy.Options` @@ -204,162 +178,55 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): .. automethod:: tagged .. automethod:: without_tags """ - - # {{{ constructor - - def __init__(self, domains, instructions, args=None, - schedule=None, - linearization=None, - name="loopy_kernel", - preambles=None, - preamble_generators=None, - assumptions=None, - local_sizes=None, - temporary_variables=None, - inames=None, - iname_to_tags=None, - substitutions=None, - symbol_manglers=None, - - iname_slab_increments=None, - loop_priority=frozenset(), - silenced_warnings=None, - - applied_iname_rewrites=None, - cache_manager=None, - index_dtype=None, - options=None, - - state=KernelState.INITIAL, - target=None, - - overridden_get_grid_sizes_for_insn_ids=None, - _cached_written_variables=None, - tags=frozenset()): - """ - :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get - intersected in slab decomposition, their grid sizes shouldn't - change. This provides a way to forward sub-kernel grid size requests. 
- """ - - # {{{ process constructor arguments - - if args is None: - args = [] - if preambles is None: - preambles = [] - if preamble_generators is None: - preamble_generators = [] - if local_sizes is None: - local_sizes = {} - if temporary_variables is None: - temporary_variables = {} - if substitutions is None: - substitutions = {} - if symbol_manglers is None: - symbol_manglers = [] - if iname_slab_increments is None: - iname_slab_increments = {} - - if silenced_warnings is None: - silenced_warnings = [] - if applied_iname_rewrites is None: - applied_iname_rewrites = [] - - if cache_manager is None: - from loopy.kernel.tools import SetOperationCacheManager - cache_manager = SetOperationCacheManager() - - if iname_to_tags is not None: - warn("Providing iname_to_tags is deprecated, pass inames instead. " - "Will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) - - if inames is not None: - raise LoopyError("Cannot provide both iname_to_tags and inames to " - "LoopKernel.__init__") - - inames = { - name: inames.get(name, Iname(name, frozenset())) - for name in _get_inames_from_domains(domains)} - - assert isinstance(inames, dict) - - if index_dtype is None: - index_dtype = np.int32 - - # }}} - - assert isinstance(assumptions, isl.BasicSet) - assert assumptions.is_params() - - from loopy.types import to_loopy_type - index_dtype = to_loopy_type(index_dtype) - if not index_dtype.is_integral(): + domains: Sequence[isl.BasicSet] + instructions: Sequence[InstructionBase] + args: Sequence[KernelArgument] + assumptions: isl.BasicSet + temporary_variables: Mapping[str, TemporaryVariable] + inames: Mapping[str, Iname] + substitutions: Mapping[str, SubstitutionRule] + options: Options + target: TargetBase + tags: FrozenSet[Tag] + state: KernelState = KernelState.INITIAL + name: str = "loopy_kernel" + + preambles: Sequence[Tuple[int, str]] = () + preamble_generators: Sequence[PreambleGenerator] = () + symbol_manglers: Sequence[ + Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = () + linearization: Optional[Sequence[ScheduleItem]] = None + iname_slab_increments: Mapping[str, Tuple[int, int]] = field( + default_factory=Map) + loop_priority: FrozenSet[Tuple[str]] = field( + default_factory=frozenset) + applied_iname_rewrites: Sequence[Dict[str, ExpressionT]] = () + index_dtype: NumpyType = NumpyType(np.dtype(np.int32)) + silenced_warnings: FrozenSet[str] = frozenset() + + # FIXME Yuck, this should go. + overridden_get_grid_sizes_for_insn_ids: Optional[ + Callable[ + [FrozenSet[str], + Dict[str, "InKernelCallable"], + bool], + Tuple[Tuple[int, ...], Tuple[int, ...]]]] = None + + def __post_init__(self): + assert isinstance(self.assumptions, isl.BasicSet) + assert self.assumptions.is_params() + + if not self.index_dtype.is_integral(): raise TypeError("index_dtype must be an integer") - if np.iinfo(index_dtype.numpy_dtype).min >= 0: + if np.iinfo(self.index_dtype.numpy_dtype).min >= 0: raise TypeError("index_dtype must be signed") - if state not in [ - KernelState.INITIAL, - KernelState.CALLS_RESOLVED, - KernelState.PREPROCESSED, - KernelState.LINEARIZED, - ]: - raise ValueError("invalid value for 'state'") - - if linearization is not None: - if schedule is not None: - # these should not both be present - raise ValueError( - "Received both 'schedule' and 'linearization' args. " - "'schedule' is deprecated and will be removed " - "in 2022. Pass 'linearization' only instead.") - elif schedule is not None: - warn( - "'schedule' is deprecated and will be removed in 2022. 
" - "Use 'linearization' instead.", - DeprecationWarning, stacklevel=2) - linearization = schedule - - assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT - - super().__init__( - domains=domains, - instructions=instructions, - args=args, - linearization=linearization, - name=name, - preambles=preambles, - preamble_generators=preamble_generators, - assumptions=assumptions, - iname_slab_increments=iname_slab_increments, - loop_priority=loop_priority, - silenced_warnings=silenced_warnings, - temporary_variables=temporary_variables, - local_sizes=local_sizes, - inames=inames, - substitutions=substitutions, - cache_manager=cache_manager, - applied_iname_rewrites=applied_iname_rewrites, - symbol_manglers=symbol_manglers, - index_dtype=index_dtype, - options=options, - state=state, - target=target, - overridden_get_grid_sizes_for_insn_ids=( - overridden_get_grid_sizes_for_insn_ids), - _cached_written_variables=_cached_written_variables, - tags=tags) - - self._kernel_executor_cache = {} - - # }}} + assert self.assumptions.get_ctx() == isl.DEFAULT_CONTEXT # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): - manglers = ast_builder.symbol_manglers() + self.symbol_manglers + manglers = ast_builder.symbol_manglers() + list(self.symbol_manglers) for mangler in manglers: result = mangler(self, identifier) @@ -378,18 +245,15 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): | set(self.temporary_variables.keys())) @memoize_method - def all_variable_names(self, include_temp_storage=True): + def all_variable_names(self): return ( set(self.temporary_variables.keys()) - | {tv.base_storage - for tv in self.temporary_variables.values() - if tv.base_storage is not None and include_temp_storage} | set(self.substitutions.keys()) | {arg.name for arg in self.args} | set(self.all_inames())) def get_var_name_generator(self): - return _UniqueVarNameGenerator(self.all_variable_names()) + return UniqueNameGenerator(self.all_variable_names()) def get_instruction_id_generator(self, based_on="insn"): used_ids = {insn.id for insn in self.instructions} @@ -416,9 +280,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return frozenset(result) def get_group_name_generator(self): - return _UniqueVarNameGenerator(set(self.all_group_names())) + return UniqueNameGenerator(set(self.all_group_names())) - def get_var_descriptor(self, name): + def get_var_descriptor( + self, name: str) -> Union[TemporaryVariable, KernelArgument]: try: return self.arg_dict[name] except KeyError: @@ -439,15 +304,13 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): try: dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), name) - from loopy import ValueArg return ValueArg(name, dtype) except TypeError: pass raise ValueError("nothing known about variable '%s'" % name) - @property - @memoize_method + @cached_property def id_to_insn(self): return {insn.id: insn for insn in self.instructions} @@ -456,7 +319,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # {{{ domain wrangling @memoize_method - def parents_per_domain(self): + def parents_per_domain(self) -> Sequence[Optional[int]]: """Return a list corresponding to self.domains (by index) containing domain indices which are nested around this domain. @@ -470,8 +333,8 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # determines the granularity of inames to be popped/decactivated # if we ascend a level. 
- iname_set_stack = [] - result = [] + iname_set_stack: List[Set[str]] = [] + result: List[Optional[int]] = [] from loopy.kernel.tools import is_domain_dependent_on_inames @@ -543,24 +406,24 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return result @memoize_method - def _get_home_domain_map(self): + def _get_home_domain_map(self) -> Mapping[str, int]: return { iname: i_domain for i_domain, dom in enumerate(self.domains) for iname in dom.get_var_names(dim_type.set)} - def get_home_domain_index(self, iname): + def get_home_domain_index(self, iname: str) -> int: return self._get_home_domain_map()[iname] @property - def isl_context(self): + def isl_context(self) -> isl.Context: for dom in self.domains: return dom.get_ctx() raise AssertionError() @memoize_method - def combine_domains(self, domains): + def combine_domains(self, domains: Sequence[int]) -> isl.BasicSet: """ :arg domains: domain indices of domains to be combined. More 'dominant' domains (those which get most say on the actual dim_type of an iname) @@ -584,7 +447,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return result - def get_inames_domain(self, inames): + def get_inames_domain(self, inames: FrozenSet[str]) -> isl.BasicSet: if not inames: return self.combine_domains(()) @@ -665,18 +528,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # {{{ iname wrangling - @property - @memoize_method - def iname_to_tags(self): - warn( - "LoopKernel.iname_to_tags is deprecated. " - "Call LoopKernel.inames instead, " - "will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) - return {name: iname.tags - for name, iname in self.inames.items() - if iname.tags} - def iname_tags(self, iname): return self.inames[iname].tags @@ -865,15 +716,20 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return result - @memoize_method def get_written_variables(self): - if self._cached_written_variables is not None: + try: return self._cached_written_variables + except AttributeError: + pass - return frozenset( + result = { var_name for insn in self.instructions - for var_name in insn.assignee_var_names()) + for var_name in insn.assignee_var_names()} + + object.__setattr__(self, "_cached_written_variables", result) + + return result @memoize_method def get_temporary_to_base_storage_map(self): @@ -888,7 +744,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): def get_unwritten_value_args(self): written_vars = self.get_written_variables() - from loopy.kernel.data import ValueArg return { arg.name for arg in self.args @@ -898,16 +753,12 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # {{{ argument wrangling - @property - @memoize_method - def arg_dict(self): + @cached_property + def arg_dict(self) -> Dict[str, KernelArgument]: return {arg.name: arg for arg in self.args} - @property - @memoize_method + @cached_property def scalar_loop_args(self): - from loopy.kernel.data import ValueArg - if self.args is None: return [] else: @@ -936,6 +787,18 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # {{{ bounds finding + @property + def cache_manager(self): + try: + return self._cache_manager + except AttributeError: + pass + + from loopy.kernel.tools import SetOperationCacheManager + cm = SetOperationCacheManager() + object.__setattr__(self, "_cache_manager", cm) + return cm + @memoize_method def get_iname_bounds(self, iname, constants_only=False): domain = self.get_inames_domain(frozenset([iname])) @@ -963,13 +826,10 @@ class 
LoopKernel(ImmutableRecordWithoutPickling, Taggable): dom_intersect_assumptions, iname_idx) .coalesce()) - class BoundsRecord(ImmutableRecord): - pass - size = (upper_bound_pw_aff - lower_bound_pw_aff + 1) size = size.gist(assumptions) - return BoundsRecord( + return _BoundsRecord( lower_bound_pw_aff=lower_bound_pw_aff, upper_bound_pw_aff=upper_bound_pw_aff, size=size) @@ -1074,19 +934,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): tgt_dict[tag.axis] = size - # {{{ override local_sizes with self.local_sizes - - for i_lsize, lsize in self.local_sizes.items(): - if i_lsize <= max(local_sizes.keys()): - local_sizes[i_lsize] = lsize - else: - from warnings import warn - warn(f"Forced local sizes '{i_lsize}: {lsize}' is unused" - f" because kernel '{self.name}' uses {max(local_sizes.keys())}" - " local hardware axes.") - - # }}} - return global_sizes, local_sizes @memoize_method @@ -1181,8 +1028,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): callables_table, ignore_auto=ignore_auto, return_dict=return_dict) - def get_grid_size_upper_bounds_as_exprs(self, callables_table, - ignore_auto=False, return_dict=False): + def get_grid_size_upper_bounds_as_exprs( + self, callables_table, + ignore_auto=False, return_dict=False + ) -> Tuple[Tuple[ExpressionT, ...], Tuple[ExpressionT, ...]]: """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1298,8 +1147,9 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): lines.extend(sep) if show_labels: lines.append("ARGUMENTS:") - for arg_name in natsorted(kernel.arg_dict): - lines.append(str(kernel.arg_dict[arg_name])) + # Arguments are ordered, do not be tempted to sort them. + for arg in kernel.args: + lines.append(str(arg)) if "domains" in what: lines.extend(sep) @@ -1378,36 +1228,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for _index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): @@ -1426,11 +1246,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): def __getstate__(self): result = { - key: getattr(self, key) - for key in self.__class__.fields - if hasattr(self, key)} - - result.pop("cache_manager", None) + fld.name: getattr(self, fld.name) + for fld in fields(self.__class__) + if hasattr(self, fld.name) + and not fld.name.startswith("_")} # Make the instructions lazily unpickling, to support faster # cache retrieval for execution. 
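Because LoopKernel is now a frozen dataclass, per-instance caches such as _cache_manager and _cached_written_variables above can no longer be filled in by ordinary attribute assignment; the change plants them with object.__setattr__ instead. The following standalone sketch shows that pattern in isolation (the names here are illustrative only, not part of loopy's API):

from dataclasses import dataclass
from typing import FrozenSet, Tuple


@dataclass(frozen=True)
class _CachedExample:
    # stand-in for LoopKernel.instructions, purely illustrative
    assignees: Tuple[str, ...]

    def get_written_variables(self) -> FrozenSet[str]:
        try:
            # cache hit: the attribute was stashed by an earlier call
            return self._written  # type: ignore[attr-defined]
        except AttributeError:
            pass

        result = frozenset(self.assignees)
        # frozen dataclasses forbid normal attribute assignment, so the
        # cache must be attached via object.__setattr__
        object.__setattr__(self, "_written", result)
        return result


assert _CachedExample(("a", "b")).get_written_variables() == {"a", "b"}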
@@ -1452,71 +1271,75 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): from loopy.tools import LoopyKeyBuilder LoopyKeyBuilder()(self) + # pylint: disable=no-member return (result, self._pytools_persistent_hash_digest) def __setstate__(self, state): attribs, p_hash_digest = state - new_fields = set() - - for k, v in attribs.items(): - setattr(self, k, v) - new_fields.add(k) - - self.register_fields(new_fields) + for name, val in attribs.items(): + object.__setattr__(self, name, val) if 0: # {{{ check that 'reconstituted' object has same hash from loopy.tools import LoopyKeyBuilder - assert p_hash_digest == LoopyKeyBuilder()(self) + hash_before = LoopyKeyBuilder()(self) - # }}} + object.__setattr__( + self, "_pytools_persistent_hash_digest", p_hash_digest) - self._pytools_persistent_hash_digest = p_hash_digest + assert hash_before == LoopyKeyBuilder()(self) + + # }}} + else: + object.__setattr__( + self, "_pytools_persistent_hash_digest", p_hash_digest) from loopy.kernel.tools import SetOperationCacheManager - self.cache_manager = SetOperationCacheManager() - self._kernel_executor_cache = {} + object.__setattr__(self, "_cache_manager", SetOperationCacheManager()) # }}} # {{{ persistent hash key generation / comparison - hash_fields = ( + hash_fields = [ "domains", "instructions", "args", - "linearization", - "name", - "preambles", "assumptions", - "local_sizes", "temporary_variables", "inames", "substitutions", - "iname_slab_increments", - "loop_priority", - "silenced_warnings", "options", - "state", "target", - ) + "tags", + "state", + "name", - comparison_fields = hash_fields + ( - # Contains pymbolic expressions, hence a (small) headache to hash. - # Likely not needed for hash uniqueness => headache avoided. - "applied_iname_rewrites", + "preambles", + # preamble_generators + # symbol_manglers + "linearization", + "iname_slab_increments", + "loop_priority", + # applied_iname_rewrites + "index_dtype", + "silenced_warnings", - # These are lists of functions. It's not clear how to - # hash these correctly, so let's not attempt it. We'll - # just assume that the rest of the hash is specific enough - # that we won't have to rely on differences in these to - # resolve hash conflicts. + # missing: + # - applied_iname_rewrites + # Contains pymbolic expressions, hence a (small) headache to hash. + # Likely not needed for hash uniqueness => headache avoided. - "preamble_generators", - "symbol_manglers", - ) + # - preamble_generators + # - symbol_manglers + # These are lists of functions. It's not clear how to + # hash these correctly, so let's not attempt it. We'll + # just assume that the rest of the hash is specific enough + # that we won't have to rely on differences in these to + # resolve hash conflicts. 
+ ] update_persistent_hash = update_persistent_hash @@ -1528,56 +1351,9 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): self.update_persistent_hash(key_hash, LoopyKeyBuilder()) return hash(key_hash.digest()) - def __eq__(self, other): - if self is other: - return True - - if not isinstance(other, LoopKernel): - return False - - for field_name in self.comparison_fields: - if field_name == "domains": - if len(self.domains) != len(other.domains): - return False - - for set_a, set_b in zip(self.domains, other.domains): - if not (set_a.plain_is_equal(set_b) or set_a.is_equal(set_b)): - return False - - elif field_name == "assumptions": - if not ( - self.assumptions.plain_is_equal(other.assumptions) - or self.assumptions.is_equal(other.assumptions)): - return False - - elif getattr(self, field_name) != getattr(other, field_name): - return False - - return True - - def __ne__(self, other): - return not self.__eq__(other) - # }}} - def get_copy_kwargs(self, **kwargs): - if "iname_to_tags" in kwargs: - if "inames" in kwargs: - raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to " - "LoopKernel.get_copy_kwargs") - - warn("Providing iname_to_tags is deprecated, pass inames instead. " - "Will be unsupported in 2022.", - DeprecationWarning, stacklevel=2) - - iname_to_tags = kwargs["iname_to_tags"] - domains = kwargs.get("domains", self.domains) - kwargs["inames"] = {name: Iname(name, - iname_to_tags.get(name, frozenset())) - for name in _get_inames_from_domains(domains) - } - del kwargs["iname_to_tags"] - + def get_copy_kwargs(self, **kwargs: Any) -> Dict[str, Any]: if "domains" in kwargs: inames = kwargs.get("inames", self.inames) domains = kwargs["domains"] @@ -1586,35 +1362,37 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) - if "instructions" in kwargs: - # Avoid carrying over an invalid cache when instructions are - # modified. - kwargs["_cached_written_variables"] = None + return kwargs - return super().get_copy_kwargs(**kwargs) + def copy(self, **kwargs: Any) -> "LoopKernel": + result = replace(self, **self.get_copy_kwargs(**kwargs)) - def copy(self, **kwargs): - if "iname_to_tags" in kwargs: - if "inames" in kwargs: - raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to " - "LoopKernel.copy") + object.__setattr__(result, "_cache_manager", self.cache_manager) - if "schedule" in kwargs: - if "linearization" in kwargs: - raise LoopyError("Cannot pass both `schedule` and " - "`linearization` to LoopKernel.copy") - - kwargs["linearization"] = None + if "instructions" not in kwargs: + # Avoid carrying over an invalid cache when instructions are + # modified. + try: + # The type system does not know about this attribute, and we're + # not about to tell it. It's an internal caching hack. 
+ cwv = self._cached_written_variables # type: ignore[attr-defined] + except AttributeError: + pass + else: + object.__setattr__(result, "_cached_written_variables", cwv) - from pytools.tag import normalize_tags, check_tag_uniqueness - tags = kwargs.pop("tags", _not_provided) - if tags is not _not_provided: - kwargs["tags"] = check_tag_uniqueness(normalize_tags(tags)) + return result - return super().copy(**kwargs) + def _with_new_tags(self, tags) -> "LoopKernel": + return replace(self, tags=tags) - def _with_new_tags(self, tags): - return self.copy(tags=tags) + @memoize_method + def _separation_info(self) -> Dict[str, _ArraySeparationInfo]: + return { + arg.name: arg._separation_info + for arg in self.args + if isinstance(arg, ArrayArg) and arg._separation_info is not None + } # }}} diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index d41a18152f0b3c35da1704e0071a6f8f9af7609c..3c444931aa47870763682ea59340070437fd0e78 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -1,5 +1,6 @@ """Implementation tagging of array axes.""" +from __future__ import annotations __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -23,16 +24,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import sys +from typing import (cast, Optional, Tuple, Union, FrozenSet, Type, Sequence, + List, Callable, ClassVar, TypeVar, TYPE_CHECKING) +from dataclasses import dataclass import re from warnings import warn -from pytools import ImmutableRecord, memoize_method -from pytools.tag import Taggable +from pytools import ImmutableRecord +from pytools.tag import Taggable, Tag import numpy as np # noqa from loopy.diagnostic import LoopyError from loopy.tools import is_integer +from loopy.typing import ExpressionT, ShapeType +from loopy.types import LoopyType + +if TYPE_CHECKING: + from loopy.target import TargetBase + from loopy.kernel import LoopKernel + from loopy.kernel.data import auto, TemporaryVariable, ArrayArg + from loopy.codegen import VectorizationInfo + +if getattr(sys, "_BUILDING_SPHINX_DOCS", False): + from loopy.target import TargetBase # noqa: F811 + + +T = TypeVar("T") __doc__ = """ @@ -680,11 +699,19 @@ class ArrayBase(ImmutableRecord, Taggable): (supports persistent hashing) """ + name: str + dtype: LoopyType + shape: Union[ShapeType, Type["auto"], None] + dim_tags: Optional[Sequence[ArrayDimImplementationTag]] + offset: Union[ExpressionT, str, None] + dim_names: Optional[Tuple[str, ...]] + alignment: Optional[int] + tags: FrozenSet[Tag] # Note that order may also wind up in attributes, if the # number of dimensions has not yet been determined. - allowed_extra_kwargs = [] + allowed_extra_kwargs: ClassVar[Tuple[str, ...]] = () def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, for_atomic=False, @@ -949,7 +976,7 @@ class ArrayBase(ImmutableRecord, Taggable): for i in self.dim_tags)) if self.offset: - info_entries.append("offset: %s" % self.offset) + info_entries.append(f"offset: {self.offset}") if self.tags: info_entries.append( @@ -1042,18 +1069,27 @@ class ArrayBase(ImmutableRecord, Taggable): else: return self - def vector_size(self, target): + def vector_size(self, target: TargetBase) -> int: """Return the size of the vector type used for the array divided by the basic data type. Note: For 3-vectors, this will be 4. 
""" - if self.dim_tags is None: + if self.dim_tags is None or self.shape is None: return 1 + assert isinstance(self.shape, tuple) + + saw_vec_tag = False + for i, dim_tag in enumerate(self.dim_tags): if isinstance(dim_tag, VectorArrayDimTag): + if saw_vec_tag: + raise LoopyError("more than one axis of '{self.name}' " + "is tagged 'vec'") + saw_vec_tag = True + shape_i = self.shape[i] if not is_integer(shape_i): raise LoopyError("shape of '%s' has non-constant-integer " @@ -1066,222 +1102,51 @@ class ArrayBase(ImmutableRecord, Taggable): return 1 - def decl_info(self, target, is_written, index_dtype, shape_override=None): - """Return a list of :class:`loopy.codegen.ImplementedDataInfo` - instances corresponding to the array. - """ - array_shape = self.shape - if shape_override is not None: - array_shape = shape_override - - from loopy.codegen import ImplementedDataInfo - from loopy.kernel.data import ValueArg - - def gen_decls(name_suffix, - shape, strides, - unvec_shape, unvec_strides, - stride_arg_axes, - dtype, user_index): - """ - :arg unvec_shape: shape tuple - that accounts for :class:`loopy.kernel.array.VectorArrayDimTag` - in a scalar manner - :arg unvec_strides: strides tuple - that accounts for :class:`loopy.kernel.array.VectorArrayDimTag` - in a scalar manner - :arg stride_arg_axes: a tuple *(user_axis, impl_axis, unvec_impl_axis)* - :arg user_index: A tuple representing a (user-facing) - multi-dimensional subscript. This is filled in with - concrete integers when known (such as for separate-array - dim tags), and with *None* where the index won't be - known until run time. - """ - - if dtype is None: - dtype = self.dtype - - user_axis = len(user_index) - - num_user_axes = self.num_user_axes(require_answer=False) - - if num_user_axes is None or user_axis >= num_user_axes: - # {{{ recursion base case - - full_name = self.name + name_suffix - - stride_args = [] - strides = list(strides) - unvec_strides = list(unvec_strides) - - # generate stride arguments, yielded later to keep array first - for stride_user_axis, stride_impl_axis, stride_unvec_impl_axis \ - in stride_arg_axes: - stride_name = full_name+"_stride%d" % stride_user_axis - - from pymbolic import var - strides[stride_impl_axis] = \ - unvec_strides[stride_unvec_impl_axis] = \ - var(stride_name) - - stride_args.append( - ImplementedDataInfo( - target=target, - name=stride_name, - dtype=index_dtype, - arg_class=ValueArg, - stride_for_name_and_axis=( - full_name, stride_impl_axis), - is_written=False)) - - yield ImplementedDataInfo( - target=target, - name=full_name, - base_name=self.name, - - arg_class=type(self), - dtype=dtype, - shape=shape, - strides=tuple(strides), - unvec_shape=unvec_shape, - unvec_strides=tuple(unvec_strides), - allows_offset=bool(self.offset), - - is_written=is_written) - - import loopy as lp - - if self.offset is lp.auto: - offset_name = full_name+"_offset" - yield ImplementedDataInfo( - target=target, - name=offset_name, - dtype=index_dtype, - arg_class=ValueArg, - offset_for_name=full_name, - is_written=False) - - yield from stride_args - - # }}} - - return - - dim_tag = self.dim_tags[user_axis] - - if isinstance(dim_tag, FixedStrideArrayDimTag): - if array_shape is None: - new_shape_axis = None - else: - new_shape_axis = array_shape[user_axis] - - import loopy as lp - if dim_tag.stride is lp.auto: - new_stride_arg_axes = stride_arg_axes \ - + ((user_axis, len(strides), len(unvec_strides)),) - - # repaired above when final array name is known - # (and stride argument is created) - new_stride_axis 
= None - else: - new_stride_arg_axes = stride_arg_axes - new_stride_axis = dim_tag.stride - - yield from gen_decls(name_suffix, - shape + (new_shape_axis,), strides + (new_stride_axis,), - unvec_shape + (new_shape_axis,), - unvec_strides + (new_stride_axis,), - new_stride_arg_axes, - dtype, user_index + (None,)) - - elif isinstance(dim_tag, SeparateArrayArrayDimTag): - shape_i = array_shape[user_axis] - if not is_integer(shape_i): - raise LoopyError("shape of '%s' has non-constant " - "integer axis %d (0-based)" % ( - self.name, user_axis)) - - for i in range(shape_i): - yield from gen_decls(name_suffix + "_s%d" % i, - shape, strides, unvec_shape, unvec_strides, - stride_arg_axes, dtype, - user_index + (i,)) - - elif isinstance(dim_tag, VectorArrayDimTag): - shape_i = array_shape[user_axis] - if not is_integer(shape_i): - raise LoopyError("shape of '%s' has non-constant " - "integer axis %d (0-based)" % ( - self.name, user_axis)) - - yield from gen_decls(name_suffix, - shape, strides, - unvec_shape + (shape_i,), - # vectors always have stride 1 - unvec_strides + (1,), - stride_arg_axes, - target.vector_dtype(dtype, shape_i), - user_index + (None,)) - - else: - raise LoopyError("unsupported array dim implementation tag '%s' " - "in array '%s'" % (dim_tag, self.name)) - - yield from gen_decls(name_suffix="", - shape=(), strides=(), - unvec_shape=(), unvec_strides=(), - stride_arg_axes=(), - dtype=self.dtype, user_index=()) - - @memoize_method - def sep_shape(self): - sep_shape = [] - for shape_i, dim_tag in zip(self.shape, self.dim_tags): - if isinstance(dim_tag, SeparateArrayArrayDimTag): - if not is_integer(shape_i): - raise TypeError("array '%s' has non-fixed-size " - "separate-array axis" % self.name) +# }}} - sep_shape.append(shape_i) +def drop_vec_dims( + dim_tags: Tuple[ArrayDimImplementationTag, ...], + t: Tuple[T, ...]) -> Tuple[T, ...]: + assert len(dim_tags) == len(t) + return tuple(t_i for dim_tag, t_i in zip(dim_tags, t) + if not isinstance(dim_tag, VectorArrayDimTag)) - return tuple(sep_shape) - @memoize_method - def subscripts_and_names(self): - sep_shape = self.sep_shape() +def get_strides(array: ArrayBase) -> Tuple[ExpressionT, ...]: + from pymbolic import var + result: List[ExpressionT] = [] - if not sep_shape: - return None + if array.dim_tags is None: + return () - def unwrap_1d_indices(idx): - # This allows these indices to work on Python sequences, too, not - # just numpy arrays. + for dim_tag in array.dim_tags: + if isinstance(dim_tag, VectorArrayDimTag): + result.append(1) - if len(idx) == 1: - return idx[0] + elif isinstance(dim_tag, FixedStrideArrayDimTag): + if isinstance(dim_tag.stride, str): + result.append(var(dim_tag.stride)) else: - return idx + result.append(dim_tag.stride) - return [ - (unwrap_1d_indices(i), - self.name + "".join("_s%d" % sub_i for sub_i in i)) - for i in np.ndindex(sep_shape)] + else: + raise ValueError("unexpected dim tag type during stride finding: " + f"'{type(dim_tag)}'") -# }}} + return tuple(result) # {{{ access code generation +@dataclass(frozen=True) class AccessInfo(ImmutableRecord): - """ - .. attribute:: array_name - .. attribute:: vector_index - .. attribute:: subscripts - List of expressions, one for each target axis - """ + array_name: str + vector_index: Optional[int] + subscripts: Tuple[ExpressionT, ...] -def _apply_offset(sub, array_name, ary): +def _apply_offset(sub: ExpressionT, ary: ArrayBase) -> ExpressionT: """ Helper for :func:`get_access_info`. Augments *ary*'s subscript index expression (*sub*) with its offset info. 
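To make the rules in get_strides above concrete, here is a rough standalone rendering of the same logic. It deliberately uses made-up stand-ins for the dim-tag classes rather than loopy's own, so it is only a sketch: vector axes always contribute stride 1, fixed-stride axes contribute their stride, and string-valued strides become symbolic pymbolic variables.

from typing import Sequence, Tuple, Union

from pymbolic import var
from pymbolic.primitives import Expression

# ("vec", None) stands in for VectorArrayDimTag, ("stride", s) for
# FixedStrideArrayDimTag -- illustrative stand-ins only.
DimTagSketch = Tuple[str, Union[int, str, None]]


def strides_sketch(
        dim_tags: Sequence[DimTagSketch]) -> Tuple[Union[int, Expression], ...]:
    result: list = []
    for kind, stride in dim_tags:
        if kind == "vec":
            result.append(1)  # vector axes are always unit-stride
        elif kind == "stride":
            # symbolic stride names turn into pymbolic variables
            result.append(var(stride) if isinstance(stride, str) else stride)
        else:
            raise ValueError(f"unexpected dim tag kind: '{kind}'")
    return tuple(result)


assert strides_sketch([("stride", "a_stride0"), ("vec", None), ("stride", 1)]) \
        == (var("a_stride0"), 1, 1)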
@@ -1293,18 +1158,33 @@ def _apply_offset(sub, array_name, ary): from pymbolic import var if ary.offset: + from loopy.kernel.data import TemporaryVariable + if isinstance(ary, TemporaryVariable): + # offsets for base_storage are added when the temporary + # is declared. + return sub + if ary.offset is lp.auto: - return var(array_name+"_offset") + sub + raise AssertionError( + f"Offset for '{ary.name}' should have been replaced " + "with an actual argument by " + "make_temporaries_for_offsets_and_strides " + "during preprocessing.") elif isinstance(ary.offset, str): return var(ary.offset) + sub else: # assume it's an expression - return ary.offset + sub + # FIXME: mypy can't figure out that ExpressionT + ExpressionT works + return ary.offset + sub # type: ignore[call-overload, arg-type, operator] # noqa: E501 else: return sub -def get_access_info(target, ary, index, eval_expr, vectorization_info): +def get_access_info(kernel: "LoopKernel", + ary: Union["ArrayArg", "TemporaryVariable"], + index: Union[ExpressionT, Tuple[ExpressionT, ...]], + eval_expr: Callable[[ExpressionT], int], + vectorization_info: "VectorizationInfo") -> AccessInfo: """ :arg ary: an object of type :class:`ArrayBase` :arg index: a tuple of indices representing a subscript into ary @@ -1313,7 +1193,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): """ import loopy as lp - from pymbolic import var def eval_expr_assert_integer_constant(i, expr): from pymbolic.mapper.evaluator import UnknownVariableError @@ -1336,8 +1215,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): if not isinstance(index, tuple): index = (index,) - array_name = ary.name - if ary.dim_tags is None: if len(index) != 1: raise LoopyError("Array '%s' has no known axis implementation " @@ -1347,8 +1224,8 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): % ary.name) return AccessInfo( - array_name=array_name, - subscripts=(_apply_offset(index[0], array_name, ary),), + array_name=ary.name, + subscripts=(_apply_offset(index[0], ary),), vector_index=None) if len(ary.dim_tags) != len(index): @@ -1359,21 +1236,33 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): num_target_axes = ary.num_target_axes() vector_index = None - subscripts = [0] * num_target_axes + subscripts: List[ExpressionT] = [0] * num_target_axes - vector_size = ary.vector_size(target) + vector_size = ary.vector_size(kernel.target) # {{{ process separate-array dim tags first, to find array name - for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): - if isinstance(dim_tag, SeparateArrayArrayDimTag): - idx = eval_expr_assert_integer_constant(i, idx) - array_name += "_s%d" % idx + from loopy.kernel.data import ArrayArg + if isinstance(ary, ArrayArg) and ary._separation_info: + sep_index = [] + remaining_index = [] + for iaxis, (index_i, dim_tag) in enumerate(zip(index, ary.dim_tags)): + if iaxis in ary._separation_info.sep_axis_indices_set: + sep_index.append(eval_expr_assert_integer_constant(iaxis, index_i)) + assert isinstance(dim_tag, SeparateArrayArrayDimTag) + else: + remaining_index.append(index_i) + + index = tuple(remaining_index) + # only arguments (not temporaries) may be sep-tagged + ary = cast(ArrayArg, + kernel.arg_dict[ary._separation_info.subarray_names[tuple(sep_index)]]) # }}} # {{{ process remaining dim tags + assert ary.dim_tags is not None for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)): if isinstance(dim_tag, FixedStrideArrayDimTag): stride = 
dim_tag.stride @@ -1386,18 +1275,24 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): % (ary.name, i, dim_tag.stride, vector_size)) elif stride is lp.auto: - stride = var(array_name + "_stride%d" % i) + raise AssertionError( + f"Stride for axis {i+1} (1-based) of " + "'{array_name}' should have been replaced " + "with an actual argument by " + "make_temporaries_for_offsets_and_strides " + "during preprocessing.") subscripts[dim_tag.target_axis] += (stride // vector_size)*idx elif isinstance(dim_tag, SeparateArrayArrayDimTag): - pass + raise AssertionError() elif isinstance(dim_tag, VectorArrayDimTag): from pymbolic.primitives import Variable + index_i = index[i] if (vectorization_info is not None - and isinstance(index[i], Variable) - and index[i].name == vectorization_info.iname): + and isinstance(index_i, Variable) + and index_i.name == vectorization_info.iname): # We'll do absolutely nothing here, which will result # in the vector being returned. pass @@ -1414,18 +1309,17 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): # }}} - from pymbolic import var import loopy as lp if ary.offset: if num_target_axes > 1: raise NotImplementedError("offsets for multiple image axes") - subscripts[0] = _apply_offset(subscripts[0], array_name, ary) + subscripts[0] = _apply_offset(subscripts[0], ary) return AccessInfo( - array_name=array_name, + array_name=ary.name, vector_index=vector_index, - subscripts=subscripts) + subscripts=tuple(subscripts)) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index cccfee1228403fa82bd6fcb83fb059084dbbf35c..e61063246d1d66eb47217f2f9391a911fc42e2cd 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2447,6 +2447,25 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): from pytools.tag import normalize_tags, check_tag_uniqueness tags = check_tag_uniqueness(normalize_tags(kwargs.pop("tags", frozenset()))) + index_dtype = kwargs.pop("index_dtype", None) + if index_dtype is None: + index_dtype = np.int32 + + from loopy.types import to_loopy_type + index_dtype = to_loopy_type(index_dtype) + + preambles = kwargs.pop("preambles", None) + if preambles is None: + preambles = () + elif not isinstance(preambles, tuple): + preambles = tuple(preambles) + + preamble_generators = kwargs.pop("preamble_generators", None) + if preamble_generators is None: + preamble_generators = () + elif not isinstance(preamble_generators, tuple): + preamble_generators = tuple(preamble_generators) + from loopy.kernel import LoopKernel knl = LoopKernel(domains, instructions, kernel_args, temporary_variables=temporary_variables, @@ -2456,6 +2475,9 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): tags=tags, inames=inames, assumptions=assumptions, + index_dtype=index_dtype, + preambles=preambles, + preamble_generators=preamble_generators, **kwargs) from loopy.transform.instruction import uniquify_instruction_ids diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 8024d8ddd917407e6ab16ffd984df2dc5bc44c69..f0e4c823a8a8c0aceef9e497067ada4396753998 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -24,13 +24,23 @@ THE SOFTWARE. 
""" +from typing import (Type, Union, FrozenSet, Tuple, Optional, Sequence, Any, ClassVar, + cast) from sys import intern +from dataclasses import dataclass, replace +from enum import IntEnum +from warnings import warn + +from immutables import Map import numpy as np # noqa from pytools import ImmutableRecord from pytools.tag import Taggable -from pytools.tag import UniqueTag as UniqueTagBase -from loopy.kernel.array import ArrayBase +from pytools.tag import UniqueTag as UniqueTagBase, Tag + +from loopy.kernel.array import ArrayBase, ArrayDimImplementationTag from loopy.diagnostic import LoopyError +from loopy.typing import ExpressionT +from loopy.types import LoopyType from loopy.kernel.instruction import ( # noqa InstructionBase, MemoryOrdering, @@ -44,6 +54,7 @@ from loopy.kernel.instruction import ( # noqa make_assignment, CInstruction) + __doc__ = """ .. autofunction:: filter_iname_tags_by_type @@ -64,10 +75,47 @@ __doc__ = """ .. autoclass:: UnrollTag .. autoclass:: Iname - -.. autoclass:: KernelArgument """ +# This docstring is included in ref_internals. Do not include parts of the public +# interface, e.g. TemporaryVariable, KernelArgument, ArrayArg. + + +# {{{ utilities + +def _names_from_expr(expr: Union[None, ExpressionT, str]) -> FrozenSet[str]: + from numbers import Number + from loopy.symbolic import DependencyMapper + dep_mapper = DependencyMapper() + + from pymbolic.primitives import Expression + if isinstance(expr, str): + return frozenset({expr}) + elif isinstance(expr, Expression): + return frozenset(v.name for v in dep_mapper(expr)) + elif expr is None: + return frozenset() + elif isinstance(expr, Number): + return frozenset() + else: + raise ValueError(f"unexpected value of expression-like object: '{expr}'") + + +def _names_from_dim_tags( + dim_tags: Optional[Sequence[ArrayDimImplementationTag]]) -> FrozenSet[str]: + from loopy.kernel.array import FixedStrideArrayDimTag + if dim_tags is not None: + return frozenset({ + name + for dim_tag in dim_tags + if isinstance(dim_tag, FixedStrideArrayDimTag) + for name in _names_from_expr(dim_tag.stride) + }) + else: + return frozenset() + +# }}} + class auto: # noqa """A generic placeholder object for something that should be automatically @@ -109,7 +157,7 @@ def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None): class InameImplementationTag(ImmutableRecord, UniqueTagBase): - __slots__ = [] + __slots__: ClassVar[Tuple[str, ...]] = () def __hash__(self): return hash(self.key) @@ -262,7 +310,7 @@ def parse_tag(tag): # {{{ memory address space -class AddressSpace: +class AddressSpace(IntEnum): """Storage location of a variable. .. attribute:: PRIVATE @@ -278,7 +326,7 @@ class AddressSpace: GLOBAL = 2 @classmethod - def stringify(cls, val): + def stringify(cls, val: Union["AddressSpace", Type[auto]]) -> str: if val == cls.PRIVATE: return "private" elif val == cls.LOCAL: @@ -296,24 +344,36 @@ class AddressSpace: # {{{ arguments class KernelArgument(ImmutableRecord): - """Base class for all argument types""" + """Base class for all argument types. + + .. attribute:: name + .. attribute:: dtype + .. attribute:: is_output + .. attribute:: is_input + + .. automethod:: supporting_names + """ + name: str + dtype: LoopyType + is_output: bool + is_input: bool def __init__(self, **kwargs): kwargs["name"] = intern(kwargs.pop("name")) target = kwargs.pop("target", None) + if target is not None: + warn("Passing 'target' is deprecated and will stop working in 2023. 
" + "It is already being ignored.", + DeprecationWarning, stacklevel=2) dtype = kwargs.pop("dtype", None) - if "for_atomic" in kwargs: - for_atomic = kwargs["for_atomic"] - else: - for_atomic = False + for_atomic = kwargs.pop("for_atomic", False) from loopy.types import to_loopy_type dtype = to_loopy_type( - dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic, - target=target) + dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic) import loopy as lp if dtype is lp.auto: @@ -325,9 +385,27 @@ class KernelArgument(ImmutableRecord): ImmutableRecord.__init__(self, **kwargs) + def supporting_names(self) -> FrozenSet[str]: + """'Supporting' names are those that are likely to be required to be + present for any use of the argument. + """ + + return frozenset() + + +@dataclass(frozen=True) +class _ArraySeparationInfo: + """Not user-facing. If an array has been split because an axis + is tagged with :class:`~loopy.kernel.data.SeparateArrayArrayDimTag`, + this records the names of the actually present sub-arrays that + should be used to realize this array. + """ + sep_axis_indices_set: FrozenSet[int] + subarray_names: Map[Tuple[int, ...], str] + class ArrayArg(ArrayBase, KernelArgument): - __doc__ = ArrayBase.__doc__ + ( + __doc__ = cast(str, ArrayBase.__doc__) + ( """ .. attribute:: address_space @@ -347,11 +425,17 @@ class ArrayArg(ArrayBase, KernelArgument): at kernel entry. """) - allowed_extra_kwargs = [ + address_space: AddressSpace + + # _separation_info is not user-facing and hence not documented. + _separation_info: Optional[_ArraySeparationInfo] + + allowed_extra_kwargs = ( "address_space", "is_output", "is_input", - "tags"] + "tags", + "_separation_info") def __init__(self, *args, **kwargs): if "address_space" not in kwargs: @@ -359,29 +443,27 @@ class ArrayArg(ArrayBase, KernelArgument): kwargs["is_output"] = kwargs.pop("is_output", None) kwargs["is_input"] = kwargs.pop("is_input", None) + kwargs["_separation_info"] = kwargs.pop("_separation_info", None) super().__init__(*args, **kwargs) min_target_axes = 0 max_target_axes = 1 - def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - return ast_builder.get_array_arg_decl(self.name + name_suffix, - self.address_space, shape, dtype, is_written) - def __str__(self): - # dont mention the type name if shape is known + # Don't mention the type of array arg if shape is known + # FIXME: Why? include_typename = self.shape in (None, auto) aspace_str = AddressSpace.stringify(self.address_space) - assert self.is_input or self.is_output - inout = [] if self.is_input: inout.append("in") if self.is_output: inout.append("out") + if not (self.is_input or self.is_output): + inout.append("neither_in_nor_out?") return ( self.stringify(include_typename=include_typename) @@ -396,6 +478,15 @@ class ArrayArg(ArrayBase, KernelArgument): key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.is_output) key_builder.rec(key_hash, self.is_input) + key_builder.rec(key_hash, self._separation_info) + + def supporting_names(self) -> FrozenSet[str]: + # Do not consider separation info here: The subarrays don't support, they + # replace this array. + return ( + _names_from_expr(self.offset) + | _names_from_dim_tags(self.dim_tags) + ) # Making this a function prevents incorrect use in isinstance. 
@@ -426,10 +517,6 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 - def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, - dtype, is_written) - class ImageArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ @@ -449,12 +536,19 @@ class ImageArg(ArrayBase, KernelArgument): @property def dimensions(self): + assert self.dim_tags is not None return len(self.dim_tags) def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_image_arg_decl(self.name + name_suffix, shape, self.num_target_axes(), dtype, is_written) + def supporting_names(self) -> FrozenSet[str]: + return ( + _names_from_expr(self.offset) + | _names_from_dim_tags(self.dim_tags) + ) + """ :attribute tags: A (possibly empty) frozenset of instances of @@ -511,18 +605,13 @@ class ValueArg(KernelArgument, Taggable): return ast_builder.get_value_arg_decl(self.name, (), self.dtype, False) - -class InameArg(ValueArg): - pass - # }}} # {{{ temporary variable - class TemporaryVariable(ArrayBase): - __doc__ = ArrayBase.__doc__ + """ + __doc__ = cast(str, ArrayBase.__doc__) + """ .. attribute:: storage_shape .. attribute:: base_indices .. attribute:: address_space @@ -535,8 +624,9 @@ class TemporaryVariable(ArrayBase): .. attribute:: base_storage The name of a storage array that is to be used to actually - hold the data in this temporary. Note that this storage - array must not match any existing variable names. + hold the data in this temporary, or *None*. If not *None* or the name + of an existing variable, a variable of this name and appropriate size + will be created. .. attribute:: initializer @@ -560,7 +650,7 @@ class TemporaryVariable(ArrayBase): min_target_axes = 0 max_target_axes = 1 - allowed_extra_kwargs = [ + allowed_extra_kwargs = ( "storage_shape", "base_indices", "address_space", @@ -568,7 +658,7 @@ class TemporaryVariable(ArrayBase): "initializer", "read_only", "_base_storage_access_may_be_aliasing", - ] + ) def __init__(self, name, dtype=None, shape=auto, address_space=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, @@ -680,29 +770,20 @@ class TemporaryVariable(ArrayBase): from pytools import product return product(si for si in shape)*self.dtype.itemsize - def decl_info(self, target, index_dtype): - return super().decl_info( - target, is_written=True, index_dtype=index_dtype, - shape_override=self.storage_shape) - - def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - if self.address_space == AddressSpace.GLOBAL: - return ast_builder.get_array_arg_decl(self.name + name_suffix, - AddressSpace.GLOBAL, shape, dtype, is_written) - else: - raise LoopyError("unexpected request for argument declaration of " - "non-global temporary") - def __str__(self): if self.address_space is auto: aspace_str = "auto" else: aspace_str = AddressSpace.stringify(self.address_space) + if self.base_storage is None: + bs_str = "" + else: + bs_str = " base_storage: "+str(self.base_storage) + return ( self.stringify(include_typename=False) - + - " aspace:%s" % aspace_str) + + f" aspace: {aspace_str}{bs_str}") def __eq__(self, other): return ( @@ -739,6 +820,15 @@ class TemporaryVariable(ArrayBase): key_builder.rec(key_hash, self.read_only) key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing) + def supporting_names(self) -> FrozenSet[str]: + return ( + _names_from_expr(self.offset) + | 
_names_from_dim_tags(self.dim_tags) + | ( + frozenset({self.base_storage}) + if self.base_storage else frozenset()) + ) + # }}} @@ -809,6 +899,7 @@ class CallMangleInfo(ImmutableRecord): # {{{ Iname class +@dataclass(frozen=True) class Iname(Taggable): """ Records an iname in a :class:`~loopy.LoopKernel`. See :ref:`domain-tree` for @@ -824,41 +915,19 @@ class Iname(Taggable): An instance of :class:`str`, denoting the iname's name. - .. attribute:: tas + .. attribute:: tags An instance of :class:`frozenset` of :class:`pytools.tag.Tag`. """ - def __init__(self, name, tags=frozenset()): - super().__init__(tags=tags) - - assert isinstance(name, str) - self.name = name + name: str + tags: FrozenSet[Tag] - def copy(self, *, name=None, tags=None): - if name is None: - name = self.name - if tags is None: - tags = self.tags - - return type(self)(name=name, tags=tags) + def copy(self, **kwargs: Any) -> "Iname": + return replace(self, **kwargs) def _with_new_tags(self, tags): return self.copy(tags=tags) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - """ - key_builder.rec(key_hash, type(self).__name__.encode("utf-8")) - key_builder.rec(key_hash, self.name) - key_builder.rec(key_hash, self.tags) - - def __eq__(self, other): - return ( - type(self) == type(other) - and self.name == other.name - and self.tags == other.tags) - # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 427bcb621951a49146367878dbdd515cb369ff83..eb373a12df8426ba32ca3e24d6a9cf53323d617a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from typing import ClassVar, Tuple + from pytools import ImmutableRecord from loopy.diagnostic import LoopyError @@ -324,7 +326,8 @@ class InKernelCallable(ImmutableRecord): """ - hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields: ClassVar[Tuple[str, ...]] = ( + "name", "arg_id_to_dtype", "arg_id_to_descr") def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 19af767ac28734fa193beee64d23cf688d45fd51..9a2437bf63feba1fd9114539248ecf889cae5ad2 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -21,12 +21,15 @@ THE SOFTWARE. """ from sys import intern +from functools import cached_property + +from warnings import warn +import islpy as isl from pytools import ImmutableRecord, memoize_method from pytools.tag import Tag, tag_dataclass, Taggable + from loopy.diagnostic import LoopyError from loopy.tools import Optional -from warnings import warn -import islpy as isl # {{{ instruction tags @@ -412,8 +415,7 @@ class InstructionBase(ImmutableRecord, Taggable): # {{{ hashing and key building - @property - @memoize_method + @cached_property def _key_builder(self): from loopy.tools import LoopyEqKeyBuilder key_builder = LoopyEqKeyBuilder() diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ba2fb22562036154981b98dbda9c063381890a62..99f5f3503289a36a9c6fef17bba4f8f6ad29c892 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -23,6 +23,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from typing import Sequence, Mapping, FrozenSet, Dict, AbstractSet, Set, List + import sys from sys import intern @@ -638,7 +640,8 @@ def show_dependency_graph(*args, **kwargs): # {{{ is domain dependent on inames -def is_domain_dependent_on_inames(kernel, domain_index, inames): +def is_domain_dependent_on_inames(kernel: LoopKernel, + domain_index: int, inames: AbstractSet[str]) -> bool: dom = kernel.domains[domain_index] dom_parameters = set(dom.get_var_names(dim_type.param)) @@ -1763,7 +1766,26 @@ def find_most_recent_global_barrier(kernel, insn_id): # {{{ subkernel tools @memoize_on_first_arg -def get_subkernels(kernel): +def get_subkernel_start_indices(kernel: LoopKernel) -> Mapping[str, int]: + """Return a sequence of schedule indices of + + See also :class:`loopy.schedule.CallKernel`. + """ + from loopy.kernel import KernelState + if kernel.state != KernelState.LINEARIZED: + raise LoopyError("Kernel must be linearized") + + assert kernel.linearization is not None + + from loopy.schedule import CallKernel + + return {lin_item.kernel_name: i + for i, lin_item in enumerate(kernel.linearization) + if isinstance(lin_item, CallKernel)} + + +@memoize_on_first_arg +def get_subkernels(kernel) -> Sequence[str]: """Return a :class:`tuple` of the names of the subkernels in the kernel. The kernel must be scheduled. @@ -1773,15 +1795,14 @@ def get_subkernels(kernel): if kernel.state != KernelState.LINEARIZED: raise LoopyError("Kernel must be linearized") - from loopy.schedule import CallKernel + assert kernel.linearization is not None - return tuple(sched_item.kernel_name - for sched_item in kernel.linearization - if isinstance(sched_item, CallKernel)) + return tuple(kernel.linearization[i].kernel_name + for i in get_subkernel_start_indices(kernel).values()) @memoize_on_first_arg -def get_subkernel_to_insn_id_map(kernel): +def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]: """Return a :class:`dict` mapping subkernel names to a :class:`frozenset` consisting of the instruction ids scheduled within the subkernel. The kernel must be scheduled. 
@@ -1790,26 +1811,52 @@ def get_subkernel_to_insn_id_map(kernel): if kernel.state != KernelState.LINEARIZED: raise LoopyError("Kernel must be scheduled") + assert kernel.linearization is not None + from loopy.schedule import ( sched_item_to_insn_id, CallKernel, ReturnFromKernel) subkernel = None - result = {} + result: Dict[str, Set[str]] = {} - for sched_item in kernel.linearization: - if isinstance(sched_item, CallKernel): - subkernel = sched_item.kernel_name + for lin_item in kernel.linearization: + if isinstance(lin_item, CallKernel): + subkernel = lin_item.kernel_name result[subkernel] = set() - if isinstance(sched_item, ReturnFromKernel): + if isinstance(lin_item, ReturnFromKernel): subkernel = None if subkernel is not None: - for insn_id in sched_item_to_insn_id(sched_item): + for insn_id in sched_item_to_insn_id(lin_item): result[subkernel].add(insn_id) return {name: frozenset(insn_ids) for name, insn_ids in result.items()} + +@memoize_on_first_arg +def get_subkernel_extra_inames(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]: + from loopy.kernel import KernelState + if kernel.state != KernelState.LINEARIZED: + raise LoopyError("Kernel must be scheduled") + + assert kernel.linearization is not None + + result = {} + inames: List[str] = [] + + from loopy.schedule import CallKernel, EnterLoop, LeaveLoop + + for lin_item in kernel.linearization: + if isinstance(lin_item, CallKernel): + result[lin_item.kernel_name] = frozenset(inames) + elif isinstance(lin_item, EnterLoop): + inames.append(lin_item.iname) + elif isinstance(lin_item, LeaveLoop): + inames.pop() + + return result + # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index d7558960ab0c7e2c4f045655a068fc67d0785797..9c465653f4465524dfee8bcd7861b7e6dd052d6d 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -70,7 +70,7 @@ class IndexOfCallable(ScalarCallable): from loopy.kernel.array import get_access_info from pymbolic import evaluate - access_info = get_access_info(expression_to_code_mapper.kernel.target, + access_info = get_access_info(expression_to_code_mapper.kernel, ary, arg.index, lambda expr: evaluate(expr, expression_to_code_mapper.codegen_state.var_subst_map), expression_to_code_mapper.codegen_state.vectorization_info) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index ac749f6eff8b7e12b2376e916066218f76e4ebd2..190e6bd9abcd9e07e54dc13b607820201aa9d1fa 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -21,11 +21,13 @@ THE SOFTWARE. 
""" +from typing import ClassVar, Tuple + from pymbolic import var -from loopy.symbolic import ResolvedFunction -from loopy.kernel.function_interface import ScalarCallable import numpy as np +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType @@ -271,7 +273,7 @@ class MinReductionOperation(ScalarReductionOperation): # {{{ base class for symbolic reduction ops class ReductionOpFunction(FunctionIdentifier): - init_arg_names = ("reduction_op",) + init_arg_names: ClassVar[Tuple[str, ...]] = ("reduction_op",) def __init__(self, reduction_op): self.reduction_op = reduction_op diff --git a/loopy/match.py b/loopy/match.py index 1ec6b32b497672eacca354f0aab6bf1c45bbfb5c..624276dce486dad3e7f1b8f8af9a757262f62a2e 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -291,7 +291,7 @@ class Tagged(GlobMatchExpressionBase): class Writes(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) - for name in matchable.write_dependency_names()) + for name in matchable.assignee_var_names()) class Reads(GlobMatchExpressionBase): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 199efdace5a976b79141634e8deef95222c0668e..e26431c93f144b5781cb53960be56142fb21a51a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -20,15 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from typing import Tuple, TypeVar, Iterable, Optional, List, FrozenSet, cast import logging logger = logging.getLogger(__name__) +from immutables import Map +import numpy as np + from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) from loopy.tools import memoize_on_disk -from loopy.kernel.data import filter_iname_tags_by_type +from loopy.kernel.data import filter_iname_tags_by_type, ArrayArg, auto, ValueArg + +from loopy.kernel import LoopKernel # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types from loopy.symbolic import RuleAwareIdentityMapper @@ -37,6 +43,11 @@ from loopy.symbolic import RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.transform.data import allocate_temporaries_for_base_storage +from loopy.kernel.array import ArrayDimImplementationTag +from loopy.kernel.data import _ArraySeparationInfo, KernelArgument +from loopy.translation_unit import for_each_kernel +from loopy.typing import ExpressionT from pytools import ProcessLogger from functools import partial @@ -100,6 +111,195 @@ def check_reduction_iname_uniqueness(kernel): # }}} +# {{{ make_arrays_for_sep_arrays + +T = TypeVar("T") + + +def _remove_at_indices( + indices: FrozenSet[int], values: Optional[Iterable[T]] + ) -> Optional[Tuple[T, ...]]: + """ + Assumes *indices* is sorted. 
+ """ + if values is None: + return values + + return tuple(val for i, val in enumerate(values) if i not in indices) + + +@for_each_kernel +def make_arrays_for_sep_arrays(kernel: LoopKernel) -> LoopKernel: + from loopy.kernel.array import SeparateArrayArrayDimTag + new_args = [] + + vng = kernel.get_var_name_generator() + made_changes = False + + # {{{ rewrite arguments + + for arg in kernel.args: + if not isinstance(arg, ArrayArg) or arg.dim_tags is None: + new_args.append(arg) + continue + + sep_axis_indices = [ + i for i, dim_tag in enumerate(arg.dim_tags) + if isinstance(dim_tag, SeparateArrayArrayDimTag)] + + if not sep_axis_indices or arg._separation_info: + new_args.append(arg) + continue + + made_changes = True + + sep_axis_indices_set = frozenset(sep_axis_indices) + + assert isinstance(arg.shape, tuple) + new_shape: Optional[Tuple[ExpressionT, ...]] = \ + _remove_at_indices(sep_axis_indices_set, arg.shape) + new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = \ + _remove_at_indices(sep_axis_indices_set, arg.dim_tags) + new_dim_names: Optional[Tuple[Optional[str], ...]] = \ + _remove_at_indices(sep_axis_indices_set, arg.dim_names) + + sep_shape: List[ExpressionT] = [arg.shape[i] for i in sep_axis_indices] + for i, sep_shape_i in enumerate(sep_shape): + if not isinstance(sep_shape_i, (int, np.integer)): + raise LoopyError( + f"Axis {sep_axis_indices[i]+1} (1-based) of " + f"argument '{arg.name}' is tagged 'sep', but " + "does not have constant length.") + + sep_info = _ArraySeparationInfo( + sep_axis_indices_set=sep_axis_indices_set, + subarray_names=Map({ + ind: vng(f"{arg.name}_s{'_'.join(str(i) for i in ind)}") + for ind in np.ndindex(*cast(List[int], sep_shape))})) + + new_args.append(arg.copy(_separation_info=sep_info)) + + for san in sorted(sep_info.subarray_names.values()): + new_args.append( + arg.copy( + name=san, + shape=new_shape, + dim_tags=new_dim_tags, + dim_names=new_dim_names)) + + # }}} + + if not made_changes: + return kernel + + kernel = kernel.copy(args=new_args) + + return kernel + +# }}} + + +# {{{ make temporary variables for offsets and strides + +def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel: + additional_args: List[KernelArgument] = [] + + vng = kernel.get_var_name_generator() + + from pymbolic.primitives import Expression, Variable + from loopy.kernel.array import FixedStrideArrayDimTag + + # {{{ process arguments + + new_args = [] + for arg in kernel.args: + if isinstance(arg, ArrayArg) and not arg._separation_info: + what = f"offset for argument '{arg.name}'" + if arg.offset is None: + pass + if arg.offset is auto: + offset_name = vng(arg.name+"_offset") + additional_args.append(ValueArg( + offset_name, kernel.index_dtype)) + arg = arg.copy(offset=offset_name) + elif isinstance(arg.offset, (int, np.integer, Expression, str)): + pass + else: + raise LoopyError(f"invalid value of {what}") + + if arg.dim_tags is None: + new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] \ + = arg.dim_tags + else: + new_dim_tags = () + for iaxis, dim_tag in enumerate(arg.dim_tags): + if isinstance(dim_tag, FixedStrideArrayDimTag): + what = ("axis stride for axis " + f"{iaxis+1} (1-based) of '{arg.name}'") + if dim_tag.stride is auto: + stride_name = vng(f"{arg.name}_stride{iaxis}") + dim_tag = dim_tag.copy(stride=Variable(stride_name)) + additional_args.append(ValueArg( + stride_name, kernel.index_dtype)) + elif isinstance( + dim_tag.stride, (int, np.integer, Expression)): + pass + else: + raise LoopyError(f"invalid value of 
{what}") + + new_dim_tags = new_dim_tags + (dim_tag,) + + arg = arg.copy(dim_tags=new_dim_tags) + + new_args.append(arg) + + # }}} + + if not additional_args: + return kernel + else: + return kernel.copy(args=new_args + additional_args) + +# }}} + + +# {{{ zero_offsets + +def zero_offsets_and_strides(kernel: LoopKernel) -> LoopKernel: + made_changes = False + from pymbolic.primitives import Expression + + # {{{ process arguments + + new_args = [] + for arg in kernel.args: + if isinstance(arg, ArrayArg): + if arg.offset is None: + pass + if arg.offset is auto: + made_changes = True + arg = arg.copy(offset=0) + elif isinstance(arg.offset, (int, np.integer, Expression, str)): + from pymbolic.primitives import is_zero + if not is_zero(arg.offset): + raise LoopyError( + f"Non-zero offset on argument '{arg.name}' " + f"of callable kernel '{kernel.name}. This is not allowed.") + else: + raise LoopyError(f"invalid value of offset for '{arg.name}'") + + new_args.append(arg) + + # }}} + + if not made_changes: + return kernel + else: + return kernel.copy(args=new_args) + +# }}} + + # {{{ decide temporary address space def _get_compute_inames_tagged(kernel, insn, tag_base): @@ -129,16 +329,8 @@ def find_temporary_address_space(kernel): base_storage_to_aliases = {} - kernel_var_names = kernel.all_variable_names(include_temp_storage=False) - for temp_var in kernel.temporary_variables.values(): if temp_var.base_storage is not None: - # no nesting allowed - if temp_var.base_storage in kernel_var_names: - raise LoopyError("base_storage for temporary '%s' is '%s', " - "which is an existing variable name" - % (temp_var.name, temp_var.base_storage)) - base_storage_to_aliases.setdefault( temp_var.base_storage, []).append(temp_var.name) @@ -535,11 +727,19 @@ def filter_reachable_callables(t_unit): return t_unit.copy(callables_table=new_callables) -def _preprocess_single_kernel(kernel, callables_table): +def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKernel: from loopy.kernel import KernelState prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) + kernel = make_arrays_for_sep_arrays(kernel) + + if is_entrypoint: + kernel = make_args_for_offsets_and_strides(kernel) + else: + # No need for offsets internally, we can pass arbitrary pointers. 
+ kernel = zero_offsets_and_strides(kernel) + from loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -569,6 +769,10 @@ def _preprocess_single_kernel(kernel, callables_table): kernel = find_temporary_address_space(kernel) + # Ordering restriction: temporary address spaces need to be found before + # allocating base_storage + kernel = allocate_temporaries_for_base_storage(kernel, _implicitly_run=True) + # check for atomic loads, much easier to do here now that the dependencies # have been established kernel = check_atomic_loads(kernel) @@ -637,7 +841,8 @@ def preprocess_program(program): for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = _preprocess_single_kernel( - in_knl_callable.subkernel, program.callables_table) + in_knl_callable.subkernel, + is_entrypoint=func_id in program.entrypoints) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) elif isinstance(in_knl_callable, ScalarCallable): diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b46f04826d27cb7f2503f919c42bbadd3b4ecaf5..9dc3bdc365219ccad8e580b2250d379fe611942a 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -20,9 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import sys +from dataclasses import dataclass, replace +from typing import Any, TypeVar from pytools import ImmutableRecord -import sys import islpy as isl from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel @@ -50,17 +52,15 @@ __doc__ = """ """ -# {{{ schedule items +SelfT = TypeVar("SelfT") -class ScheduleItem(ImmutableRecord): - __slots__ = [] - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) +# {{{ schedule items + +@dataclass(frozen=True) +class ScheduleItem: + def copy(self: SelfT, **kwargs: Any) -> SelfT: + return replace(self, **kwargs) class BeginBlockItem(ScheduleItem): @@ -71,26 +71,32 @@ class EndBlockItem(ScheduleItem): pass +@dataclass(frozen=True) class EnterLoop(BeginBlockItem): - hash_fields = __slots__ = ["iname"] + iname: str +@dataclass(frozen=True) class LeaveLoop(EndBlockItem): - hash_fields = __slots__ = ["iname"] + iname: str +@dataclass(frozen=True) class RunInstruction(ScheduleItem): - hash_fields = __slots__ = ["insn_id"] + insn_id: str +@dataclass(frozen=True) class CallKernel(BeginBlockItem): - hash_fields = __slots__ = ["kernel_name", "extra_args", "extra_inames"] + kernel_name: str +@dataclass(frozen=True) class ReturnFromKernel(EndBlockItem): - hash_fields = __slots__ = ["kernel_name"] + kernel_name: str +@dataclass(frozen=True) class Barrier(ScheduleItem): """ .. attribute:: comment @@ -107,9 +113,10 @@ class Barrier(ScheduleItem): .. 
attribute:: originating_insn_id """ - - hash_fields = ["comment", "synchronization_kind", "mem_kind"] - __slots__ = hash_fields + ["originating_insn_id"] + comment: str + synchronization_kind: str + mem_kind: str + originating_insn_id: str # }}} @@ -450,11 +457,7 @@ def dump_schedule(kernel, schedule): indent = indent[:-4] lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): - lines.append(indent + - "CALL KERNEL {}(extra_args={}, extra_inames={})".format( - sched_item.kernel_name, - sched_item.extra_args, - sched_item.extra_inames)) + lines.append(indent + f"CALL KERNEL {sched_item.kernel_name}") indent += " " elif isinstance(sched_item, ReturnFromKernel): indent = indent[:-4] @@ -2102,8 +2105,6 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): # Device mapper only gets run once. new_kernel = map_schedule_onto_host_or_device(new_kernel) - from loopy.schedule.tools import add_extra_args_to_schedule - new_kernel = add_extra_args_to_schedule(new_kernel) yield new_kernel debug.start() diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 35c73b775395f488df4a2e46fdae35c8cc434948..38d03a2960487c023c8a6be3d8f493d59a3a0db1 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop, +from loopy.schedule import (Barrier, CallKernel, EnterLoop, ReturnFromKernel, RunInstruction) from loopy.schedule.tools import get_block_boundaries @@ -40,9 +40,7 @@ def map_schedule_onto_host_or_device(kernel): if not kernel.target.split_kernel_at_global_barriers(): new_schedule = ( - [CallKernel(kernel_name=device_prog_name_gen(), - extra_args=[], - extra_inames=[])] + + [CallKernel(kernel_name=device_prog_name_gen())] + list(kernel.linearization) + [ReturnFromKernel(kernel_name=kernel.name)]) kernel = kernel.copy(linearization=new_schedule) @@ -59,7 +57,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): # {{{ inner mapper function - dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) + dummy_call = CallKernel(kernel_name="") dummy_return = ReturnFromKernel(kernel_name="") def inner_mapper(start_idx, end_idx, new_schedule): @@ -143,22 +141,14 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): new_schedule + [dummy_return.copy()]) - # Assign names, extra_inames to CallKernel / ReturnFromKernel instructions - inames = [] + # Assign names to CallKernel / ReturnFromKernel instructions for idx, sched_item in enumerate(new_schedule): if isinstance(sched_item, CallKernel): last_kernel_name = device_prog_name_gen() - new_schedule[idx] = sched_item.copy( - kernel_name=last_kernel_name, - extra_inames=list(inames)) + new_schedule[idx] = sched_item.copy(kernel_name=last_kernel_name) elif isinstance(sched_item, ReturnFromKernel): - new_schedule[idx] = sched_item.copy( - kernel_name=last_kernel_name) - elif isinstance(sched_item, EnterLoop): - inames.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - inames.pop() + new_schedule[idx] = sched_item.copy(kernel_name=last_kernel_name) new_kernel = kernel.copy(linearization=new_schedule) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index d021b0aa43edf9c9f3373857b63b7cb89a83032d..f2164b6dd8c328b826376015d6653418e694ca9d 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -20,10 +20,16 @@ OUT 
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from loopy.kernel.data import AddressSpace +from functools import cached_property +import enum +from typing import Sequence, FrozenSet, Tuple, List, Set, Dict +from dataclasses import dataclass + from pytools import memoize_method import islpy as isl -import enum + +from loopy.kernel.data import AddressSpace, TemporaryVariable, ArrayArg +from loopy.kernel import LoopKernel # {{{ block boundary finder @@ -51,60 +57,244 @@ def get_block_boundaries(schedule): # {{{ subkernel tools -def temporaries_read_in_subkernel(kernel, subkernel): +def temporaries_read_in_subkernel( + kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]: from loopy.kernel.tools import get_subkernel_to_insn_id_map - insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name] + inames = frozenset().union(*(kernel.insn_inames(insn_id) + for insn_id in insn_ids)) + domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames} + params = frozenset().union(*( + kernel.domains[dom_idx].get_var_names(isl.dim_type.param) + for dom_idx in domain_idxs)) + + return (frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + | (params & frozenset(kernel.temporary_variables))) + + +def temporaries_written_in_subkernel( + kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]: + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name] return frozenset(tv - for insn_id in insn_ids - for tv in kernel.id_to_insn[insn_id].read_dependency_names() - if tv in kernel.temporary_variables) + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].assignee_var_names() + if tv in kernel.temporary_variables) -def temporaries_written_in_subkernel(kernel, subkernel): +def args_read_in_subkernel( + kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]: from loopy.kernel.tools import get_subkernel_to_insn_id_map - insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] - return frozenset(tv - for insn_id in insn_ids - for tv in kernel.id_to_insn[insn_id].write_dependency_names() - if tv in kernel.temporary_variables) + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name] + inames = frozenset().union(*(kernel.insn_inames(insn_id) + for insn_id in insn_ids)) + domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames} + params = frozenset().union(*( + kernel.domains[dom_idx].get_var_names(isl.dim_type.param) + for dom_idx in domain_idxs)) + return (frozenset(arg + for insn_id in insn_ids + for arg in kernel.id_to_insn[insn_id].read_dependency_names() + if arg in kernel.arg_dict) + | (params & frozenset(kernel.arg_dict))) + + +def args_written_in_subkernel( + kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]: + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name] + return frozenset(arg + for insn_id in insn_ids + for arg in kernel.id_to_insn[insn_id].assignee_var_names() + if arg in kernel.arg_dict) + + +def supporting_temporary_names( + kernel: LoopKernel, tv_names: FrozenSet[str]) -> FrozenSet[str]: + result: Set[str] = set() + + for name in tv_names: + tv = kernel.temporary_variables[name] + for supp_name in tv.supporting_names(): + if supp_name in kernel.temporary_variables: + result.add(supp_name) + + return 
frozenset(result) # }}} -# {{{ add extra args to schedule +# {{{ argument lists -def add_extra_args_to_schedule(kernel): - """ - Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` - instructions in the schedule with global temporaries. - """ - new_schedule = [] - from loopy.schedule import CallKernel +@dataclass(frozen=True) +class KernelArgInfo: + passed_arg_names: Sequence[str] + + written_names: FrozenSet[str] + + @property + def passed_names(self) -> Sequence[str]: + return self.passed_arg_names + + +@dataclass(frozen=True) +class SubKernelArgInfo(KernelArgInfo): + passed_inames: Sequence[str] + passed_temporaries: Sequence[str] + + @property + def passed_names(self) -> Sequence[str]: + return (list(self.passed_arg_names) + + list(self.passed_inames) + + list(self.passed_temporaries)) + + +def _should_temp_var_be_passed(tv: TemporaryVariable) -> bool: + return tv.address_space == AddressSpace.GLOBAL and tv.initializer is None + + +class _SupportingNameTracker: + def __init__(self, kernel: LoopKernel): + self.kernel = kernel + self.name_to_main_name: Dict[str, str] = {} + + def add_supporting_names_for(self, name): + var_descr = self.kernel.get_var_descriptor(name) + for supp_name in var_descr.supporting_names(): + self.name_to_main_name[supp_name] = ( + self.name_to_main_name.get(supp_name, frozenset()) + | {name}) + + def get_additional_args_and_tvs( + self, already_passed: Set[str] + ) -> Tuple[List[str], List[str]]: + additional_args = [] + additional_temporaries = [] + + for supporting_name in sorted(frozenset(self.name_to_main_name)): + if supporting_name not in already_passed: + already_passed.add(supporting_name) + var_descr = self.kernel.get_var_descriptor(supporting_name) + if isinstance(var_descr, TemporaryVariable): + if _should_temp_var_be_passed(var_descr): + additional_temporaries.append(supporting_name) + else: + additional_args.append(supporting_name) + + return additional_args, additional_temporaries + + +def _process_args_for_arg_info( + kernel: LoopKernel, args_read: Set[str], args_written: Set[str], + supp_name_tracker: _SupportingNameTracker, used_only: bool, + ) -> List[str]: + + args_expected: Set[str] = set() + + passed_arg_names = [] + for arg in kernel.args: + if used_only and not (arg.name in args_read or arg.name in args_written): + continue + + try: + args_expected.remove(arg.name) + except KeyError: + pass + + # Disregard the original array if it had a sep-tagged axis. + if isinstance(arg, ArrayArg): + if not arg._separation_info: + passed_arg_names.append(arg.name) + supp_name_tracker.add_supporting_names_for(arg.name) + else: + for sep_name in sorted(arg._separation_info.subarray_names.values()): + # Separated arrays occur later in the argument list. + # Mark them as accessed if the original array was, + # we'll stumble on them when it is their turn. + # Add them to args_expected to ensure they're not missed. 
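+                    # (make_arrays_for_sep_arrays appends the sub-arrays
+                    # after the original argument, so this loop over
+                    # kernel.args will reach them later; the assert at the
+                    # end of this function checks that none were skipped.)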
+ if arg.name in args_read: + args_read.add(sep_name) + args_expected.add(sep_name) + if arg.name in args_written: + args_written.add(sep_name) + args_expected.add(sep_name) - for sched_item in kernel.linearization: - if isinstance(sched_item, CallKernel): - subkernel = sched_item.kernel_name - - used_temporaries = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) - - more_args = {tv - for tv in used_temporaries - if - kernel.temporary_variables[tv].address_space - == AddressSpace.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args} - - new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) else: - new_schedule.append(sched_item) + passed_arg_names.append(arg.name) + supp_name_tracker.add_supporting_names_for(arg.name) + + assert not args_expected + + return passed_arg_names + + +def get_kernel_arg_info(kernel: LoopKernel) -> KernelArgInfo: + args_written = set(kernel.arg_dict) & kernel.get_written_variables() + + supp_name_tracker = _SupportingNameTracker(kernel) + + passed_arg_names = _process_args_for_arg_info(kernel, + args_read=set(), args_written=args_written, + supp_name_tracker=supp_name_tracker, + used_only=False) + + additional_args, additional_temporaries = \ + supp_name_tracker.get_additional_args_and_tvs( + already_passed=( + set(passed_arg_names))) + + assert not additional_temporaries + + return KernelArgInfo( + passed_arg_names=passed_arg_names + additional_args, + written_names=frozenset(args_written)) + + +def get_subkernel_arg_info( + kernel: LoopKernel, subkernel_name: str) -> SubKernelArgInfo: + assert kernel.linearization is not None - return kernel.copy(linearization=new_schedule) + args_read = set(args_read_in_subkernel(kernel, subkernel_name)) + args_written = set(args_written_in_subkernel(kernel, subkernel_name)) + + tvs_read = temporaries_read_in_subkernel(kernel, subkernel_name) + tvs_written = set(temporaries_written_in_subkernel(kernel, subkernel_name)) + + supp_name_tracker = _SupportingNameTracker(kernel) + + passed_arg_names = _process_args_for_arg_info(kernel, + args_read=args_read, args_written=args_written, + supp_name_tracker=supp_name_tracker, + used_only=True) + + passed_temporaries: List[str] = [] + for tv_name in sorted(tvs_read | tvs_written): + supp_name_tracker.add_supporting_names_for(tv_name) + tv = kernel.temporary_variables[tv_name] + + if _should_temp_var_be_passed(tv): + if tv.base_storage: + if tv_name in tvs_written: + if tv_name in tvs_written: + tvs_written.add(tv.base_storage) + else: + passed_temporaries.append(tv.name) + + additional_args, additional_temporaries = \ + supp_name_tracker.get_additional_args_and_tvs( + already_passed=( + set(passed_arg_names) | set(passed_temporaries))) + + from loopy.kernel.tools import get_subkernel_extra_inames + + return SubKernelArgInfo( + passed_arg_names=passed_arg_names + additional_args, + passed_inames=sorted(get_subkernel_extra_inames(kernel)[subkernel_name]), + passed_temporaries=passed_temporaries + additional_temporaries, + written_names=frozenset(args_written | tvs_written)) # }}} @@ -287,8 +477,7 @@ class WriteRaceChecker: self.kernel = kernel self.callables_table = callables_table - @property - @memoize_method + @cached_property def vars(self): return (self.kernel.get_written_variables() | self.kernel.get_read_variables()) @@ -364,3 +553,5 @@ class WriteRaceChecker: self.kernel, self.callables_table) # }}} + +# vim: foldmethod=marker 
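A minimal usage sketch for the argument-list helpers added above (not part of the patch; `knl` and the subkernel name are assumptions for illustration, and the kernel must already be linearized):

    from loopy.schedule.tools import get_subkernel_arg_info

    # knl: a linearized LoopKernel that contains a subkernel named
    # "rotate_v2" (illustrative name).
    skai = get_subkernel_arg_info(knl, "rotate_v2")

    # passed_names lists kernel arguments, then the inames enclosing the
    # subkernel launch, then the global temporaries the subkernel needs --
    # the order in which a code generator should emit parameters.
    for name in skai.passed_names:
        print(name, "rw" if name in skai.written_names else "ro")
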
diff --git a/loopy/statistics.py b/loopy/statistics.py index 913400eb34285beba63fd305b3e2524353610232..bdcdb0878a4b0249da38d80757cf540d29d8a597 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -25,10 +25,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import loopy as lp +from functools import partial, cached_property + from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper + +import loopy as lp from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError @@ -36,7 +39,6 @@ from loopy.symbolic import CoefficientCollector from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel from loopy.translation_unit import TranslationUnit -from functools import partial __doc__ = """ @@ -813,8 +815,7 @@ class CounterBase(CombineMapper): self.zero = get_kernel_zero_pwqpolynomial(self.knl) self.one = self.zero + 1 - @property - @memoize_method + @cached_property def param_space(self): return get_kernel_parameter_space(self.knl) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index c1708adfd772e5ca8f2282e7cbade29c07f07cc1..2226dc3e467173db454df9a8c7540a3b9af89982 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -24,12 +24,19 @@ THE SOFTWARE. """ -from functools import reduce +from typing import ClassVar, Tuple +from functools import reduce, cached_property from sys import intern +import re + +import numpy as np -from pytools import memoize, memoize_method, memoize_on_first_arg, ImmutableRecord +from pytools import (memoize, memoize_method, memoize_on_first_arg, + ImmutableRecord) import pytools.lex from pytools.tag import Taggable +import islpy as isl +from islpy import dim_type import pymbolic.primitives as p @@ -64,12 +71,6 @@ from loopy.diagnostic import (ExpressionToAffineConversionError, UnableToDetermineAccessRangeError) -import islpy as isl -from islpy import dim_type - -import re -import numpy as np - __doc__ = """ .. currentmodule:: loopy.symbolic @@ -592,7 +593,7 @@ class LocalHardwareAxisIndex(HardwareAxisIndex): class FunctionIdentifier(LoopyExpressionBase): """A base class for symbols representing functions.""" - init_arg_names = () + init_arg_names: ClassVar[Tuple[str, ...]] = () mapper_method = intern("map_loopy_function_identifier") @@ -801,8 +802,7 @@ class Reduction(LoopyExpressionBase): def is_tuple_typed(self): return self.operation.arg_count > 1 - @property - @memoize_method + @cached_property def inames_set(self): return set(self.inames) @@ -2680,8 +2680,7 @@ class AccessRangeOverlapChecker: def __init__(self, kernel): self.kernel = kernel - @property - @memoize_method + @cached_property def vars(self): return (self.kernel.get_written_variables() | self.kernel.get_read_variables()) @@ -2722,7 +2721,7 @@ class AccessRangeOverlapChecker: if access_dir == "any": return var_name in insn.dependency_names() else: - return var_name in insn.write_dependency_names() + return var_name in insn.assignee_var_names() return self._get_access_ranges(insn_id, access_dir)[var_name] diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a6357a12b61d2b837ab1cd016554c2c5af100024..e38624b4316b3e4b7855e4283faf6b1228a0bb69 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -1,4 +1,27 @@ -"""Base target interface.""" +""" +.. currentmodule:: loopy + +.. autoclass:: TargetBase +.. autoclass:: ASTBuilderBase +.. 
autoclass:: CFamilyTarget +.. autoclass:: CTarget +.. autoclass:: ExecutableCTarget +.. autoclass:: CudaTarget +.. autoclass:: OpenCLTarget +.. autoclass:: PyOpenCLTarget +.. autoclass:: ISPCTarget + +References to Canonical Names +----------------------------- + +.. currentmodule:: loopy.target + +.. class:: TargetBase + + See :class:`loopy.TargetBase`. +""" + +from __future__ import annotations __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -23,34 +46,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -__doc__ = """ -.. currentmodule:: loopy +from typing import (Any, Tuple, Generic, TypeVar, Sequence, ClassVar, Optional, + TYPE_CHECKING) -.. autoclass:: TargetBase -.. autoclass:: ASTBuilderBase -.. autoclass:: CFamilyTarget -.. autoclass:: CTarget -.. autoclass:: ExecutableCTarget -.. autoclass:: CudaTarget -.. autoclass:: OpenCLTarget -.. autoclass:: PyOpenCLTarget -.. autoclass:: ISPCTarget -.. autoclass:: NumbaTarget -.. autoclass:: NumbaCudaTarget - -References to Canonical Names ------------------------------ +if TYPE_CHECKING: + from loopy.typing import ExpressionT + from loopy.codegen import CodeGenerationState + from loopy.codegen.result import CodeGenerationResult -.. currentmodule:: loopy.target -.. class:: TargetBase - - See :class:`loopy.TargetBase`. -""" +ASTType = TypeVar("ASTType") -class TargetBase: +class TargetBase(): """Base class for all targets, i.e. different combinations of code that loopy can generate. @@ -59,8 +68,8 @@ class TargetBase: # {{{ persistent hashing - hash_fields = () - comparison_fields = () + hash_fields: ClassVar[Tuple[str, ...]] = () + comparison_fields: ClassVar[Tuple[str, ...]] = () def update_persistent_hash(self, key_hash, key_builder): key_hash.update(type(self).__name__.encode()) @@ -100,20 +109,20 @@ class TargetBase: device_program_name_prefix = "" device_program_name_suffix = "" - def split_kernel_at_global_barriers(self): + def split_kernel_at_global_barriers(self) -> bool: """ :returns: a :class:`bool` indicating whether the kernel should be split when a global barrier is encountered. """ raise NotImplementedError() - def get_host_ast_builder(self): + def get_host_ast_builder(self) -> ASTBuilderBase[Any]: """ :returns: a class implementing :class:`ASTBuilderBase` for the host code """ raise NotImplementedError() - def get_device_ast_builder(self): + def get_device_ast_builder(self) -> ASTBuilderBase[Any]: """ :returns: a class implementing :class:`ASTBuilderBase` for the host code """ @@ -151,7 +160,7 @@ class TargetBase: raise NotImplementedError() -class ASTBuilderBase: +class ASTBuilderBase(Generic[ASTType]): """An interface for generating (host or device) ASTs. 
""" @@ -184,28 +193,38 @@ class ASTBuilderBase: def ast_module(self): raise NotImplementedError() - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + def get_function_definition( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, + schedule_index: int, function_decl: ASTType, function_body: ASTType + ) -> ASTType: raise NotImplementedError - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int + ) -> ASTType: raise NotImplementedError - def generate_top_of_body(self, codegen_state): + def generate_top_of_body( + self, codegen_state: CodeGenerationState) -> Sequence[ASTType]: return [] - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, codegen_state: CodeGenerationState, + schedule_index: int) -> ASTType: raise NotImplementedError - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): - raise NotImplementedError + def get_kernel_call(self, codegen_state: CodeGenerationState, + subkernel_name: str, + gsize: Tuple[ExpressionT, ...], + lsize: Tuple[ExpressionT, ...]) -> Optional[ASTType]: + raise NotImplementedError() @property def ast_block_class(self): raise NotImplementedError() - def get_expression_to_code_mapper(self, codegen_state): + def get_expression_to_code_mapper(self, codegen_state: CodeGenerationState): raise NotImplementedError() def add_vector_access(self, access_expr, index): @@ -218,15 +237,6 @@ class ASTBuilderBase: """ raise NotImplementedError() - def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): - raise NotImplementedError() - - def get_global_arg_decl(self, name, shape, dtype, is_written): - raise NotImplementedError() - - def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): - raise NotImplementedError() - def emit_assignment(self, codegen_state, insn): raise NotImplementedError() @@ -294,7 +304,7 @@ class DummyHostASTBuilder(ASTBuilderBase): def get_expression_to_code_mapper(self, codegen_state): return _DummyExpressionToCodeMapper() - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, codegen_state, name, gsize, lsize): return None @property diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 022308852035e15430d7e715d2e1d5134de5fb5f..9df5da5ca56b1fe3a6d0d5079922fad589a27631 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -23,21 +23,32 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from typing import cast, Tuple, Optional +import re + import numpy as np # noqa -from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder -from loopy.diagnostic import LoopyError, LoopyTypeError -from cgen import Pointer, NestedDeclarator, Block + +from cgen import Pointer, NestedDeclarator, Block, Generable, Declarator, Const from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE -from loopy.symbolic import IdentityMapper -from loopy.types import NumpyType -from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p +from pytools import memoize_method +from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder +from loopy.diagnostic import LoopyError, LoopyTypeError +from loopy.symbolic import IdentityMapper +from loopy.types import NumpyType, LoopyType +from loopy.typing import ExpressionT +from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag +from loopy.kernel.data import (TemporaryVariable, AddressSpace, ArrayArg, + ConstantArg, ImageArg, ValueArg) +from loopy.kernel.function_interface import ScalarCallable +from loopy.schedule import CallKernel from loopy.tools import remove_common_indentation -import re +from loopy.codegen import CodeGenerationState +from loopy.codegen.result import CodeGenerationResult -from pytools import memoize_method __doc__ = """ .. currentmodule loopy.target.c @@ -250,9 +261,6 @@ def _preamble_generator(preamble_info, func_qualifier="inline"): # {{{ cgen overrides -from cgen import Declarator - - class POD(Declarator): """A simple declarator: The type is given as a :class:`numpy.dtype` and the *name* is given as a string. @@ -748,7 +756,7 @@ def get_gnu_libc_callables(): # }}} -class CFamilyASTBuilder(ASTBuilderBase): +class CFamilyASTBuilder(ASTBuilderBase[Generable]): preamble_function_qualifier = "inline" @@ -777,10 +785,13 @@ class CFamilyASTBuilder(ASTBuilderBase): # {{{ code generation - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, - function_decl, function_body): + def get_function_definition( + self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, + schedule_index: int, function_decl: Generable, function_body: Generable + ) -> Generable: kernel = codegen_state.kernel + assert kernel.linearization is not None from cgen import ( FunctionBody, @@ -811,12 +822,8 @@ class CFamilyASTBuilder(ASTBuilderBase): tv.initializer is not None): assert tv.read_only - decl_info, = tv.decl_info(self.target, - index_dtype=kernel.index_dtype) decl = self.wrap_global_constant( - self.get_temporary_decl( - codegen_state, schedule_index, tv, - decl_info)) + self.get_temporary_var_declarator(codegen_state, tv)) if tv.initializer is not None: decl = Initializer(decl, generate_array_literal( @@ -830,29 +837,16 @@ class CFamilyASTBuilder(ASTBuilderBase): else: return Collection(result+[Line(), fbody]) - def idi_to_cgen_declarator(self, kernel, idi): - from loopy.kernel.data import InameArg - if (idi.offset_for_name is not None - or idi.stride_for_name_and_axis is not None): - assert not idi.is_written - from cgen import Const - return Const(POD(self, idi.dtype, idi.name)) - elif issubclass(idi.arg_class, InameArg): - return InameArg(idi.name, idi.dtype).get_arg_decl(self) - else: - name = idi.base_name or idi.name - var_descr = kernel.get_var_descriptor(name) - from loopy.kernel.data import ArrayBase - if isinstance(var_descr, ArrayBase): - 
return var_descr.get_arg_decl( - self, - idi.name[len(name):], idi.shape, idi.dtype, - idi.is_written) - else: - return var_descr.get_arg_decl(self) + def get_function_declaration(self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int) -> Generable: + kernel = codegen_state.kernel + + assert codegen_state.kernel.linearization is not None + subkernel_name = cast( + CallKernel, + codegen_state.kernel.linearization[schedule_index] + ).kernel_name - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name @@ -861,146 +855,110 @@ class CFamilyASTBuilder(ASTBuilderBase): if codegen_state.is_entrypoint: name = Value("void", name) + + # subkernel launches occur only as part of entrypoint kernels for now + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(kernel, subkernel_name) + passed_names = skai.passed_names + written_names = skai.written_names else: name = Value("static void", name) + passed_names = [arg.name for arg in kernel.args] + written_names = kernel.get_written_variables() + return FunctionDeclarationWrapper( FunctionDeclaration( name, - [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info])) - - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + [self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in written_names) + for arg_name in passed_names])) + + def get_kernel_call(self, codegen_state: CodeGenerationState, + subkernel_name: str, + gsize: Tuple[ExpressionT, ...], + lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]: return None def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import AddressSpace kernel = codegen_state.kernel + assert kernel.linearization is not None - base_storage_decls = [] temp_decls = [] + temp_decls_using_base_storage = [] # {{{ declare temporaries - base_storage_sizes = {} - base_storage_to_scope = {} - base_storage_to_align_bytes = {} - - from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line + from cgen import Initializer, Line # Getting the temporary variables that are needed for the current # sub-kernel. 
from loopy.schedule.tools import ( temporaries_read_in_subkernel, - temporaries_written_in_subkernel) - subkernel = kernel.linearization[schedule_index].kernel_name + temporaries_written_in_subkernel, + supporting_temporary_names) + subkernel_name = kernel.linearization[schedule_index].kernel_name + sub_knl_temps = ( + temporaries_read_in_subkernel(kernel, subkernel_name) + | temporaries_written_in_subkernel(kernel, subkernel_name)) sub_knl_temps = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) + sub_knl_temps + | supporting_temporary_names(kernel, sub_knl_temps)) + + ecm = self.get_expression_to_code_mapper(codegen_state) for tv in sorted( kernel.temporary_variables.values(), key=lambda key_tv: key_tv.name): - decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) - if not tv.base_storage: - for idi in decl_info: - # global temp vars are mapped to arguments or global declarations - if tv.address_space != AddressSpace.GLOBAL and ( - tv.name in sub_knl_temps): - decl = self.wrap_temporary_decl( - self.get_temporary_decl( - codegen_state, schedule_index, tv, idi), - tv.address_space) - - if tv.initializer is not None: - assert tv.read_only - decl = Initializer(decl, generate_array_literal( - codegen_state, tv, tv.initializer)) - - temp_decls.append(decl) + # global temp vars are mapped to arguments or global + # declarations, no need to declare locally. + if tv.address_space != AddressSpace.GLOBAL and ( + tv.name in sub_knl_temps): + decl = self.get_temporary_var_declarator(codegen_state, tv) - else: - assert tv.initializer is None - if (tv.address_space == AddressSpace.GLOBAL - and codegen_state.is_generating_device_code): - # global temps trigger no codegen in the device code - continue - - offset = 0 - base_storage_sizes.setdefault(tv.base_storage, []).append( - tv.nbytes) - base_storage_to_scope.setdefault(tv.base_storage, []).append( - tv.address_space) - - align_size = tv.dtype.itemsize - - from loopy.kernel.array import VectorArrayDimTag - for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): - if isinstance(dim_tag, VectorArrayDimTag): - align_size *= axis_len - - base_storage_to_align_bytes.setdefault(tv.base_storage, []).append( - align_size) - - for idi in decl_info: - cast_decl = POD(self, idi.dtype, "") - temp_var_decl = POD(self, idi.dtype, idi.name) - - cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) - temp_var_decl = self.wrap_temporary_decl( - temp_var_decl, tv.address_space) - - if tv._base_storage_access_may_be_aliasing: - ptrtype = _ConstPointer - else: - # The 'restrict' part of this is a complete lie--of course - # all these temporaries are aliased. But we're promising to - # not use them to shovel data from one representation to the - # other. That counts, right? 
- ptrtype = _ConstRestrictPointer - - cast_decl = ptrtype(cast_decl) - temp_var_decl = ptrtype(temp_var_decl) - - cast_tp, cast_d = cast_decl.get_decl_pair() - temp_var_decl = Initializer( - temp_var_decl, - "({} {}) ({} + {})".format( - " ".join(cast_tp), cast_d, - tv.base_storage, - offset)) - - temp_decls.append(temp_var_decl) - - from pytools import product - offset += ( - idi.dtype.itemsize - * product(si for si in idi.shape)) - - ecm = self.get_expression_to_code_mapper(codegen_state) + if tv.initializer is not None: + assert tv.read_only + decl = Initializer(decl, generate_array_literal( + codegen_state, tv, tv.initializer)) - for bs_name, bs_sizes in sorted(base_storage_sizes.items()): - bs_var_decl = Value("char", bs_name) - from pytools import single_valued - bs_var_decl = self.wrap_temporary_decl( - bs_var_decl, single_valued(base_storage_to_scope[bs_name])) + temp_decls.append(decl) - # FIXME: Could try to use isl knowledge to simplify max. - if all(isinstance(bs, int) for bs in bs_sizes): - bs_size_max = max(bs_sizes) else: - bs_size_max = p.Max(tuple(bs_sizes)) - - bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max)) + assert tv.initializer is None - alignment = max(base_storage_to_align_bytes[bs_name]) - bs_var_decl = AlignedAttribute(alignment, bs_var_decl) + cast_decl = POD(self, tv.dtype, "") + temp_var_decl = POD(self, tv.dtype, tv.name) - base_storage_decls.append(bs_var_decl) + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? + ptrtype = _ConstRestrictPointer + + cast_decl = self.wrap_decl_for_address_space( + ptrtype(cast_decl), tv.address_space) + temp_var_decl = self.wrap_decl_for_address_space( + ptrtype(temp_var_decl), tv.address_space) + + cast_tp, cast_d = cast_decl.get_decl_pair() + temp_var_decl = Initializer( + temp_var_decl, + "({} {}) ({} + {})".format( + " ".join(cast_tp), cast_d, + tv.base_storage, + ecm(tv.offset) + )) + + temp_decls_using_base_storage.append(temp_var_decl) # }}} - result = base_storage_decls + temp_decls + result = temp_decls + temp_decls_using_base_storage if result: result.append(Line()) @@ -1018,8 +976,6 @@ class CFamilyASTBuilder(ASTBuilderBase): # }}} - # {{{ code generation guts - @property def ast_module(self): import cgen @@ -1037,36 +993,18 @@ class CFamilyASTBuilder(ASTBuilderBase): from loopy.target.c.codegen.expression import CExpressionToCodeMapper return CExpressionToCodeMapper() - def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): - temp_var_decl = POD(self, decl_info.dtype, decl_info.name) - - if temp_var.read_only: - from cgen import Const - temp_var_decl = Const(temp_var_decl) - - if decl_info.shape: - from cgen import ArrayOf - ecm = self.get_expression_to_code_mapper(codegen_state) - temp_var_decl = ArrayOf(temp_var_decl, - ecm(p.flattened_product(decl_info.shape), - prec=PREC_NONE, type_context="i")) + # {{{ declarators - if temp_var.alignment: - from cgen import AlignedAttribute - temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl) - - return temp_var_decl - - def wrap_temporary_decl(self, decl, scope): + def wrap_decl_for_address_space( + self, decl: Declarator, address_space: AddressSpace) -> Declarator: return decl - def wrap_global_constant(self, decl): + def wrap_global_constant(self, decl: Declarator) -> Declarator: from 
cgen import Static return Static(decl) - def get_value_arg_decl(self, name, shape, dtype, is_written): - assert shape == () - + def get_value_arg_declaraotor( + self, name: str, dtype: LoopyType, is_written: bool) -> Declarator: result = POD(self, dtype, name) if not is_written: @@ -1079,34 +1017,114 @@ class CFamilyASTBuilder(ASTBuilderBase): return result - def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): - from cgen import RestrictPointer, Const + def get_array_base_declarator(self, ary: ArrayBase) -> Declarator: + arg_decl = POD(self, ary.dtype, ary.name) + + if ary.dim_tags: + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, FixedStrideArrayDimTag): + # we're OK with that + pass + else: + raise NotImplementedError( + f"{type(self).__name__} does not understand axis tag " + f"'{type(dim_tag)}.") + + return arg_decl - arg_decl = RestrictPointer(POD(self, dtype, name)) + def get_array_arg_declarator( + self, arg: ArrayArg, is_written: bool) -> Declarator: + from cgen import RestrictPointer + arg_decl = RestrictPointer( + self.wrap_decl_for_address_space( + self.get_array_base_declarator(arg), arg.address_space)) if not is_written: arg_decl = Const(arg_decl) return arg_decl - def get_global_arg_decl(self, name, shape, dtype, is_written): - from warnings import warn - warn("get_global_arg_decl is deprecated use get_array_arg_decl " - "instead.", DeprecationWarning, stacklevel=2) - from loopy.kernel.data import AddressSpace - return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, - dtype, is_written) + def get_constant_arg_declarator( + self, arg: ConstantArg) -> Declarator: + from cgen import RestrictPointer + return Const(self.wrap_decl_for_address_space( + RestrictPointer( + self.get_array_base_declarator(arg)), arg.address_space)) + + def get_temporary_arg_decl( + self, temp_var: TemporaryVariable, is_written: bool) -> Declarator: + if temp_var.address_space == AddressSpace.GLOBAL: + from cgen import RestrictPointer + arg_decl = RestrictPointer( + self.wrap_decl_for_address_space( + self.get_array_base_declarator(temp_var), + temp_var.address_space)) + if not is_written: + arg_decl = Const(arg_decl) + + return arg_decl + else: + raise LoopyError("unexpected request for argument declaration of " + "non-global temporary") + + def get_image_arg_declarator( + self, arg: ImageArg, is_written: bool) -> Declarator: + raise NotImplementedError() + + def arg_to_cgen_declarator( + self, kernel: LoopKernel, passed_name: str, is_written: bool + ) -> Declarator: + if passed_name in kernel.all_inames(): + assert not is_written + return self.get_value_arg_declaraotor( + passed_name, kernel.index_dtype, is_written) + var_descr = kernel.get_var_descriptor(passed_name) + if isinstance(var_descr, ValueArg): + return self.get_value_arg_declaraotor( + var_descr.name, var_descr.dtype, is_written) + elif isinstance(var_descr, ArrayArg): + return self.get_array_arg_declarator(var_descr, is_written) + elif isinstance(var_descr, TemporaryVariable): + return self.get_temporary_arg_decl(var_descr, is_written) + elif isinstance(var_descr, ConstantArg): + return self.get_constant_arg_declarator(var_descr) + elif isinstance(var_descr, ImageArg): + return self.get_image_arg_declarator(var_descr, is_written) + else: + raise ValueError(f"unexpected type of argument '{passed_name}': " + f"'{type(var_descr)}'") - def get_constant_arg_decl(self, name, shape, dtype, is_written): - from loopy.target.c import POD # uses the correct complex type - from cgen import RestrictPointer, 
Const + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable) -> Declarator: + temp_var_decl = self.get_array_base_declarator(temp_var) - arg_decl = RestrictPointer(POD(self, dtype, name)) + if temp_var.storage_shape: + shape = temp_var.storage_shape + else: + shape = temp_var.shape - if not is_written: - arg_decl = Const(arg_decl) + assert isinstance(shape, tuple) + assert isinstance(temp_var.dim_tags, tuple) - return arg_decl + from loopy.kernel.array import drop_vec_dims + unvec_shape = drop_vec_dims(temp_var.dim_tags, shape) + + if unvec_shape: + from cgen import ArrayOf + ecm = self.get_expression_to_code_mapper(codegen_state) + temp_var_decl = ArrayOf(temp_var_decl, + ecm(p.flattened_product(unvec_shape), + prec=PREC_NONE, type_context="i")) + + if temp_var.alignment: + from cgen import AlignedAttribute + temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl) + + return self.wrap_decl_for_address_space(temp_var_decl, + temp_var.address_space) + + # }}} def emit_assignment(self, codegen_state, insn): kernel = codegen_state.kernel diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 22a6c8b43bf1695d38ad362b642a12cc1f0fec6b..bcaa4a395a2605fa248f3f6aa1ac35d3a49abf43 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -20,23 +20,46 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from typing import Callable, Any, Union, Tuple, Sequence, Optional import tempfile import os +import ctypes +from dataclasses import dataclass -from loopy.target.execution import (KernelExecutorBase, _KernelInfo, - ExecutionWrapperGeneratorBase, get_highlighted_code) +from immutables import Map from pytools import memoize_method -from pytools.py_codegen import (Indentation) +from pytools.codegen import Indentation, CodeGenerator from pytools.prefork import ExecError from codepy.toolchain import guess_toolchain, ToolchainGuessError, GCCToolchain from codepy.jit import compile_from_string -import ctypes import numpy as np +from loopy.typing import ExpressionT +from loopy.types import LoopyType +from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase +from loopy.kernel.data import ArrayArg +from loopy.schedule.tools import KernelArgInfo +from loopy.codegen.result import GeneratedProgram +from loopy.translation_unit import TranslationUnit +from loopy.target.execution import (KernelExecutorBase, + ExecutionWrapperGeneratorBase, get_highlighted_code) + import logging logger = logging.getLogger(__name__) +DEF_EVEN_DIV_FUNCTION = """ +def _lpy_even_div(a, b): + result, remdr = divmod(a, b) + if remdr != 0: + # FIXME: This error message is kind of crummy. 
+ raise ValueError("expected even division") + return result +""" + + +# {{{ CExecutionWrapperGenerator class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): """ @@ -65,20 +88,28 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle allocation of unspecified arguements - def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + def handle_alloc( + self, gen: CodeGenerator, arg: ArrayArg, + strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], + skip_arg_checks: bool) -> None: """ Handle allocation of non-specified arguements for C-execution """ from pymbolic import var - num_axes = len(arg.unvec_shape) + assert isinstance(arg.shape, tuple) + num_axes = len(arg.shape) for i in range(num_axes): - gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) + gen("_lpy_shape_%d = %s" % (i, strify(arg.shape[i]))) - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + from loopy.kernel.array import get_strides + strides = get_strides(arg) + num_axes = len(strides) + + itemsize = arg.dtype.numpy_dtype.itemsize for i in range(num_axes): gen("_lpy_strides_%d = %s" % (i, strify( - itemsize*arg.unvec_strides[i]))) + itemsize*strides[i]))) if not skip_arg_checks: for i in range(num_axes): @@ -95,7 +126,9 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for i in range(num_axes)) # find order of array - order = "'C'" if (arg.shape == () or arg.unvec_strides[-1] == 1) else "'F'" + from loopy.kernel.array import get_strides + strides = get_strides(arg) + order = "'C'" if (arg.shape == () or strides[-1] == 1) else "'F'" gen("%(name)s = _lpy_np.empty(%(shape)s, " "%(dtype)s, order=%(order)s)" @@ -103,7 +136,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): name=arg.name, shape=strify(sym_shape), dtype=self.python_dtype_str( - gen, kernel_arg.dtype.numpy_dtype), + gen, arg.dtype.numpy_dtype), order=order)) expected_strides = tuple( @@ -136,6 +169,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): Add default C-imports to preamble """ gen.add_to_preamble("import numpy as _lpy_np") + gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION) def initialize_system_args(self, gen): """ @@ -145,44 +179,30 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): + def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel, + kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None: gen("for knl in _lpy_c_kernels:") with Indentation(gen): - gen("knl({args})".format( - args=", ".join(args))) + gen(f"knl({', '.join(args)})") # }}} # {{{ - def generate_output_handler( - self, gen, options, kernel, implemented_data_info): - - from loopy.kernel.data import KernelArgument - - def is_output(idi): - from loopy.kernel.array import ArrayBase - if not issubclass(idi.arg_class, ArrayBase): - return False - - arg = kernel.impl_arg_to_arg[idi.name] - return arg.is_output + def generate_output_handler(self, gen: CodeGenerator, + kernel: LoopKernel, kai: KernelArgInfo) -> None: + options = kernel.options if options.return_dict: gen("return None, {%s}" - % ", ".join(f'"{idi.name}": {idi.name}' - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) - if is_output(idi))) + % ", ".join(f'"{arg_name}": {arg_name}' + for arg_name in kai.passed_arg_names + if kernel.arg_dict[arg_name].is_output)) else: - out_idis = [idi - for idi in implemented_data_info - if 
issubclass(idi.arg_class, KernelArgument) - if is_output(idi)] - if out_idis: - gen("return None, (%s,)" - % ", ".join(idi.name for idi in out_idis)) + out_names = [arg_name for arg_name in kai.passed_arg_names + if kernel.arg_dict[arg_name].is_output] + if out_names: + gen(f"return None, ({', '.join(out_names)},)") else: gen("return None, ()") @@ -197,6 +217,10 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def get_arg_pass(self, arg): return arg.name +# }}} + + +# {{{ CCompiler class CCompiler: """ @@ -304,6 +328,10 @@ class CCompiler: # and return compiled return ctypes.CDLL(ext_file) +# }}} + + +# {{{ CPlusPlusCompiler class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" @@ -319,8 +347,10 @@ class CPlusPlusCompiler(CCompiler): libraries=libraries, include_dirs=include_dirs, library_dirs=library_dirs, defines=defines, source_suffix=source_suffix) +# }}} + -# {{{ placeholder till ctypes fixes: bugs.python.org/issue16899 +# {{{ placeholder till ctypes fixes: https://github.com/python/cpython/issues/61103 class Complex64(ctypes.Structure): _fields_ = [("real", ctypes.c_float), ("imag", ctypes.c_float)] @@ -344,28 +374,10 @@ if hasattr(np, "complex256"): # }}} -class IDIToCDLL: - """ - A utility class that extracts arguement and return type info from a - :class:`ImplementedDataInfo` in order to create a :class:`ctype.CDLL` - """ - def __init__(self, target): - self.target = target - from loopy.target.c import CTarget - self.registry = CTarget().get_dtype_registry().wrapped_registry - - def __call__(self, knl, idi): - # next loop through the implemented data info to get the arg data - arg_info = [] - for arg in idi: - # check if pointer: outputs and arrays must be passed - # by reference. - pointer = arg.shape or arg.is_written - arg_info.append(self._dtype_to_ctype(arg.dtype, pointer)) - - return arg_info - - def _dtype_to_ctype(self, dtype, pointer=False): +# {{{ _args_to_ctypes + +def _args_to_ctypes(kernel: LoopKernel, passed_names: Sequence[str]): + def _dtype_to_ctype(dtype): """Map NumPy dtype to equivalent ctypes type.""" if dtype.is_complex(): # complex ctypes aren't exposed @@ -373,10 +385,23 @@ class IDIToCDLL: basetype = _NUMPY_COMPLEX_TYPE_TO_CTYPE[np_dtype] else: basetype = np.ctypeslib.as_ctypes_type(dtype) - if pointer: - return ctypes.POINTER(basetype) return basetype + arg_info = [] + for arg_name in passed_names: + arg = kernel.arg_dict[arg_name] + + ctype = _dtype_to_ctype(arg.dtype) + if isinstance(arg, ArrayBase): + ctype = ctypes.POINTER(ctype) + arg_info.append(ctype) + + return arg_info + +# }}} + + +# {{{ CompiledCKernel class CompiledCKernel: """ @@ -386,23 +411,19 @@ class CompiledCKernel: to automatically map argument types. 
""" - def __init__(self, knl, idi, dev_code, target, comp=None): - from loopy.target.c import ExecutableCTarget - assert isinstance(target, ExecutableCTarget) - self.target = target - self.name = knl.name + def __init__(self, kernel: LoopKernel, devprog: GeneratedProgram, + passed_names: Sequence[str], dev_code: str, + comp: Optional["CCompiler"] = None): # get code and build self.code = dev_code self.comp = comp if comp is not None else CCompiler() - self.dll = self.comp.build(self.name, self.code) + self.dll = self.comp.build(devprog.name, self.code) # get the function declaration for interface with ctypes - func_decl = IDIToCDLL(self.target) - arg_info = func_decl(knl, idi) - self._fn = getattr(self.dll, self.name) + self._fn = getattr(self.dll, devprog.name) # kernels are void by defn. self._fn.restype = None - self._fn.argtypes = [ctype for ctype in arg_info] + self._fn.argtypes = _args_to_ctypes(kernel, passed_names) def __call__(self, *args): """Execute kernel with given args mapped to ctypes equivalents.""" @@ -419,6 +440,17 @@ class CompiledCKernel: args_.append(arg_) self._fn(*args_) +# }}} + + +@dataclass(frozen=True) +class _KernelInfo: + t_unit: TranslationUnit + c_kernels: Sequence[CompiledCKernel] + invoker: Callable[..., Any] + + +# {{{ CKernelExecutor class CKernelExecutor(KernelExecutorBase): """An object connecting a kernel to a :class:`CompiledKernel` @@ -428,7 +460,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, program, entrypoint, compiler=None): + def __init__(self, program, entrypoint, compiler: Optional["CCompiler"] = None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -447,30 +479,34 @@ class CKernelExecutor(KernelExecutorBase): return CExecutionWrapperGenerator() @memoize_method - def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), - all_kwargs=None): - program = self.get_typed_and_scheduled_translation_unit( - entrypoint, arg_to_dtype_set) + def translation_unit_info( + self, entrypoint: str, + arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo: + # FIXME: Remove entrypoint argument + assert entrypoint == self.entrypoint + + t_unit = self.get_typed_and_scheduled_translation_unit( + entrypoint, arg_to_dtype) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(program) + codegen_result = generate_code_v2(t_unit) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = "\n".join([dev_code, "", host_code]) - if self.program[entrypoint].options.write_code: + if t_unit[entrypoint].options.write_code: output = all_code - if self.program[entrypoint].options.allow_terminal_colors: + if t_unit[entrypoint].options.allow_terminal_colors: output = get_highlighted_code(output) - if self.program[entrypoint].options.write_code is True: + if t_unit[entrypoint].options.write_code is True: print(output) else: - with open(self.program[entrypoint].options.write_code, "w") as outf: + with open(t_unit[entrypoint].options.write_code, "w") as outf: outf.write(output) - if self.program[entrypoint].options.edit_code: + if t_unit[entrypoint].options.edit_code: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -478,19 +514,17 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] + from loopy.schedule.tools import get_kernel_arg_info + kai = get_kernel_arg_info(t_unit[entrypoint]) for dp in 
codegen_result.device_programs: - c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_infos[entrypoint], all_code, - self.program.target, self.compiler)) + c_kernels.append(CompiledCKernel( + t_unit[entrypoint], dp, kai.passed_names, all_code, + self.compiler)) return _KernelInfo( - program=program, + t_unit=t_unit, c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_infos[ - entrypoint], - invoker=self.get_invoker(program, entrypoint, codegen_result)) - - # }}} + invoker=self.get_invoker(t_unit, entrypoint, codegen_result)) def __call__(self, *args, entrypoint=None, **kwargs): """ @@ -510,8 +544,12 @@ class CKernelExecutor(KernelExecutorBase): if self.packing_controller is not None: kwargs = self.packing_controller(kwargs) - program_info = self.program_info(entrypoint, - self.arg_to_dtype_set(kwargs)) + program_info = self.translation_unit_info(entrypoint, + self.arg_to_dtype(kwargs)) return program_info.invoker( program_info.c_kernels, *args, **kwargs) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 05c6b3b929b8ffac13532f9467b7bf4c0b6e488a..496c75e58bcfcfb142479ec6d9b34dd71ecefa8e 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -163,7 +163,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.array import _apply_offset from loopy.symbolic import simplify_using_aff - subscript = _apply_offset(0, expr.name, arg) + subscript = _apply_offset(0, arg) result = self.make_subscript( arg, var(expr.name), @@ -223,7 +223,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): index_tuple = tuple( simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple) - access_info = get_access_info(self.kernel.target, ary, index_tuple, + access_info = get_access_info(self.kernel, ary, index_tuple, lambda expr: evaluate(expr, self.codegen_state.var_subst_map), self.codegen_state.vectorization_info) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 5ba5d22ff9ccc286e7d7748d4702e8decb69dd5c..f95dea681904818e15d4a72a809aefad38352533 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -24,15 +24,16 @@ THE SOFTWARE. 
""" import numpy as np - +from pymbolic import var from pytools import memoize_method +from cgen import Declarator, Const from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType -from loopy.kernel.data import AddressSpace -from pymbolic import var +from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag +from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg, ArrayArg from loopy.kernel.function_interface import ScalarCallable @@ -388,55 +389,77 @@ class CUDACASTBuilder(CFamilyASTBuilder): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, scope): - if scope == AddressSpace.LOCAL: - from cgen.cuda import CudaShared + # }}} + + # {{{ declarators + + def wrap_decl_for_address_space( + self, decl: Declarator, address_space: AddressSpace) -> Declarator: + from cgen.cuda import CudaGlobal, CudaShared + if address_space == AddressSpace.GLOBAL: + return CudaGlobal(decl) + if address_space == AddressSpace.LOCAL: return CudaShared(decl) - elif scope == AddressSpace.PRIVATE: + elif address_space == AddressSpace.PRIVATE: return decl else: - raise ValueError("unexpected temporary variable scope: %s" - % scope) + raise ValueError("unexpected address_space: %s" + % address_space) - def wrap_global_constant(self, decl): - from cgen.cuda import CudaConstant + def wrap_global_constant(self, decl: Declarator) -> Declarator: + from cgen.cuda import CudaConstant, CudaGlobal + assert isinstance(decl, CudaGlobal) + decl = decl.subdecl return CudaConstant(decl) - def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): - from loopy.target.c import POD # uses the correct complex type - from cgen import Const - from cgen.cuda import CudaRestrictPointer + # duplicated in OpenCL, update there if updating here + def get_array_base_declarator(self, ary: ArrayBase) -> Declarator: + dtype = ary.dtype + + vec_size = ary.vector_size(self.target) + if vec_size > 1: + dtype = self.target.vector_dtype(dtype, vec_size) + + if ary.dim_tags: + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)): + # we're OK with those + pass + + else: + raise NotImplementedError( + f"{type(self).__name__} does not understand axis tag " + f"'{type(dim_tag)}.") - arg_decl = CudaRestrictPointer(POD(self, dtype, name)) + from loopy.target.c import POD + return POD(self, dtype, ary.name) + + def get_array_arg_declarator( + self, arg: ArrayArg, is_written: bool) -> Declarator: + from cgen.cuda import CudaRestrictPointer + arg_decl = CudaRestrictPointer( + self.wrap_decl_for_address_space( + self.get_array_base_declarator(arg), arg.address_space)) if not is_written: arg_decl = Const(arg_decl) return arg_decl - def get_global_arg_decl(self, name, shape, dtype, is_written): - from warnings import warn - warn("get_global_arg_decl is deprecated use get_array_arg_decl " - "instead.", DeprecationWarning, stacklevel=2) - return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, - dtype, is_written) - - def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): - raise NotImplementedError("not yet: texture arguments in CUDA") - - def get_constant_arg_decl(self, name, shape, dtype, is_written): - from loopy.target.c import POD # uses the correct complex type - from cgen import RestrictPointer, Const + def 
get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator: + from cgen import RestrictPointer from cgen.cuda import CudaConstant - arg_decl = RestrictPointer(POD(self, dtype, name)) + # constant *is* an address space as far as CUDA is concerned, do not re-wrap + return CudaConstant(RestrictPointer(self.get_array_base_declarator(arg))) - if not is_written: - arg_decl = Const(arg_decl) + def get_image_arg_declarator( + self, arg: ImageArg, is_written: bool) -> Declarator: + raise NotImplementedError("not yet: texture arguments in CUDA") - return CudaConstant(arg_decl) + # }}} - # {{{ code generation for atomic update + # {{{ atomics def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): @@ -527,8 +550,6 @@ class CUDACASTBuilder(CFamilyASTBuilder): # }}} - # }}} - # }}} # vim: foldmethod=marker diff --git a/loopy/target/execution.py b/loopy/target/execution.py index e911ae89de27725aade963178639026e2b6460cf..131e35396da526e13604c57f846031352928bf5b 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -21,34 +21,35 @@ THE SOFTWARE. """ -import numpy as np +from typing import (Callable, Tuple, Union, Set, FrozenSet, List, Dict, + Optional, Sequence, Any) +from dataclasses import dataclass + +from immutables import Map + from abc import ABC, abstractmethod -from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from pytools.py_codegen import ( - Indentation, PythonFunctionGenerator) +from pytools.py_codegen import PythonFunctionGenerator +from pytools.codegen import Indentation, CodeGenerator + +from pymbolic import var import logging logger = logging.getLogger(__name__) from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder +from loopy.typing import ExpressionT +from loopy.types import LoopyType, NumpyType +from loopy.kernel import KernelState, LoopKernel +from loopy.kernel.data import _ArraySeparationInfo, ArrayArg, auto +from loopy.translation_unit import TranslationUnit +from loopy.schedule.tools import KernelArgInfo from loopy.version import DATA_MODEL_VERSION # {{{ object array argument packing -class _PackingInfo(ImmutableRecord): - """ - .. attribute:: name - .. attribute:: sep_shape - - .. attribute:: subscripts_and_names - - A list of type ``[(index, unpacked_name), ...]``. - """ - - class SeparateArrayPackingController: """For argument arrays with axes tagged to be implemented as separate arrays, this class provides preprocessing of the incoming arguments so that @@ -59,59 +60,60 @@ class SeparateArrayPackingController: It also repacks outgoing arrays of this type back into an object array. """ - def __init__(self, packing_info): - self.packing_info = packing_info + def __init__(self, packing_info: Dict[str, _ArraySeparationInfo]) -> None: + # These must work to index tuples if 1D. 
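A minimal sketch, not part of the patch, of what the flattened packing_info built by this constructor does to incoming keyword arguments. The subarray names "a_s0"/"a_s1" are hypothetical; subarray_names maps index tuples to separated-array names, and length-1 indices are untupled by the helper that follows.

import numpy as np

# Hypothetical separation info for an object-array argument "a", after untupling.
packing_info = {"a": {0: "a_s0", 1: "a_s1"}}

obj_arg = np.empty(2, dtype=object)
obj_arg[0] = np.zeros(4)
obj_arg[1] = np.ones(4)
kwargs = {"a": obj_arg, "n": 4}

# Same unpacking as SeparateArrayPackingController.__call__ below.
for name, ind_to_subary_name in packing_info.items():
    if name in kwargs:
        arg = kwargs.pop(name)
        for index, unpacked_name in ind_to_subary_name.items():
            kwargs[unpacked_name] = arg[index]

assert sorted(kwargs) == ["a_s0", "a_s1", "n"]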
+ def untuple_length_1_indices( + ind: Tuple[int, ...]) -> Union[int, Tuple[int, ...]]: + if len(ind) == 1: + return ind[0] + else: + return ind - def __call__(self, kernel_kwargs): - if not self.packing_info: - return kernel_kwargs + self.packing_info = { + name: { + untuple_length_1_indices(ind): sep_name + for ind, sep_name in sep_info.subarray_names.items() + } + for name, sep_info in packing_info.items() + } + def __call__(self, kernel_kwargs: Dict[str, Any]) -> Dict[str, Any]: kernel_kwargs = kernel_kwargs.copy() - for packing_info in self.packing_info.values(): - arg_name = packing_info.name - if packing_info.name in kernel_kwargs: - arg = kernel_kwargs[arg_name] - for index, unpacked_name in packing_info.subscripts_and_names: + for name, ind_to_subary_name in self.packing_info.items(): + if name in kernel_kwargs: + arg = kernel_kwargs[name] + for index, unpacked_name in ind_to_subary_name.items(): assert unpacked_name not in kernel_kwargs kernel_kwargs[unpacked_name] = arg[index] - del kernel_kwargs[arg_name] + del kernel_kwargs[name] return kernel_kwargs +# }}} -def make_packing_controller(program, entrypoint): - packing_info = {} - from loopy.kernel.array import ArrayBase - for arg in program[entrypoint].args: - if not isinstance(arg, ArrayBase): - continue - if arg.shape is None or arg.dim_tags is None: - continue +# {{{ ExecutionWrapperGeneratorBase - subscripts_and_names = arg.subscripts_and_names() +def _str_to_expr(name_or_expr: Union[str, ExpressionT]) -> ExpressionT: + if isinstance(name_or_expr, str): + return var(name_or_expr) + else: + return name_or_expr - if subscripts_and_names is None: - continue - packing_info[arg.name] = _PackingInfo( - name=arg.name, - sep_shape=arg.sep_shape(), - subscripts_and_names=subscripts_and_names, - is_written=arg.name in - program[entrypoint].get_written_variables()) +@dataclass(frozen=True) +class _ArgFindingEquation: + lhs: ExpressionT + rhs: ExpressionT - if packing_info: - return SeparateArrayPackingController(packing_info) - else: - return None + # Arg finding code is sorted by priority, lowest order first + order: int -# }}} + based_on_names: FrozenSet[str] + require_names: bool -# {{{ ExecutionWrapperGeneratorBase - class ExecutionWrapperGeneratorBase(ABC): """ A set of common methods for generating a wrapper @@ -130,7 +132,7 @@ class ExecutionWrapperGeneratorBase(ABC): def python_dtype_str_inner(self, dtype): pass - def python_dtype_str(self, gen, numpy_dtype): + def python_dtype_str(self, gen: CodeGenerator, numpy_dtype): dtype_str = self.python_dtype_str_inner(numpy_dtype) try: return self.dtype_str_to_name[dtype_str] @@ -147,176 +149,178 @@ class ExecutionWrapperGeneratorBase(ABC): # /!\ This code runs in a namespace controlled by the user. # Prefix all auxiliary variables with "_lpy". - # {{{ integer arg finding from shapes - - def generate_integer_arg_finding_from_shapes( - self, gen, program, implemented_data_info): - # a mapping from integer argument names to a list of tuples - # (arg_name, expression), where expression is a - # unary function of kernel.arg_dict[arg_name] - # returning the desired integer argument. 
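A minimal sketch, not part of the patch, of the equation-solving idea behind _ArgFindingEquation, mirroring the solve_affine_equations_for call used later in this hunk; the argument names "a" and "n" are illustrative.

from pymbolic import var
from pymbolic.algorithm import solve_affine_equations_for
from pymbolic.primitives import Variable

# For an array argument "a" declared with shape (n,), the wrapper can evaluate
# a.shape[0] at run time, so lhs == rhs is one equation in the unknown "n".
lhs = var("a").attr("shape").index(0)
rhs = var("n")

value_expr = solve_affine_equations_for(["n"], [(lhs, rhs)])[Variable("n")]
# Stringified, value_expr yields the wrapper assignment "n = a.shape[0]".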
- iarg_to_sources = {} + # {{{ integer arg finding from array data + def generate_integer_arg_finding_from_array_data( + self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo + ) -> None: from loopy.kernel.data import ArrayArg + from loopy.kernel.array import get_strides from loopy.symbolic import DependencyMapper, StringifyMapper from loopy.diagnostic import ParameterFinderWarning dep_map = DependencyMapper() - from pymbolic import var - for arg in implemented_data_info: - if arg.arg_class is ArrayArg: - sym_shape = var(arg.name).attr("shape") - for axis_nr, shape_i in enumerate(arg.shape): - if shape_i is None: - continue - - deps = dep_map(shape_i) - - if len(deps) == 1: - integer_arg_var, = deps - - if program.arg_dict[ - integer_arg_var.name].dtype.is_integral(): - from pymbolic.algorithm import solve_affine_equations_for - try: - # friggin' overkill :) - iarg_expr = solve_affine_equations_for( - [integer_arg_var.name], - [(shape_i, sym_shape.index(axis_nr))] - )[integer_arg_var] - except Exception as e: - #from traceback import print_exc - #print_exc() - - # went wrong? oh well - from warnings import warn - warn("Unable to generate code to automatically " - "find '%s' from the shape of '%s':\n%s" - % (integer_arg_var.name, arg.name, str(e)), - ParameterFinderWarning) - else: - iarg_to_sources.setdefault(integer_arg_var.name, [])\ - .append((arg.name, iarg_expr)) - - gen("# {{{ find integer arguments from shapes") - gen("") + # {{{ find equations + + equations: List[_ArgFindingEquation] = [] + + from pymbolic.primitives import If + + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + if isinstance(arg, ArrayArg): + assert arg.shape is not auto + if isinstance(arg.shape, tuple): + for axis_nr, shape_i in enumerate(arg.shape): + if shape_i is not None: + equations.append( + _ArgFindingEquation( + lhs=var(arg.name).attr("shape").index(axis_nr), + rhs=shape_i, + order=0, + based_on_names=frozenset({arg.name}), + require_names=True)) + + for axis_nr, stride_i in enumerate(get_strides(arg)): + if stride_i is not None: + equations.append( + _ArgFindingEquation( + lhs=var("_lpy_even_div")( + var(arg.name).attr("strides").index(axis_nr), + arg.dtype.itemsize), + rhs=_str_to_expr(stride_i), + order=0, + based_on_names=frozenset({arg.name}), + require_names=True)) + + if arg.offset is not None: + if not kernel.options.no_numpy: + offset = var("getattr")(var(arg.name), var('"offset"'), 0) + else: + offset = var(arg.name).attr("offset") - for iarg_name, sources in iarg_to_sources.items(): - gen("if %s is None:" % iarg_name) - with Indentation(gen): - if_stmt = "if" - for arg_name, value_expr in sources: - gen(f"{if_stmt} {arg_name} is not None:") - with Indentation(gen): - gen("%s = %s" - % (iarg_name, StringifyMapper()(value_expr))) + offset = If(var(f"{arg.name} is None"), 0, offset) - if_stmt = "elif" + equations.append( + _ArgFindingEquation( + lhs=var("_lpy_even_div")( + offset, arg.dtype.itemsize), + rhs=_str_to_expr(arg.offset), - gen("") + # Argument finding from offsets should run last, + # as it assumes a zero offset if a variable is + # not passed. That should only be done if no + # other approach yielded a value for the variable. 
+ order=1, + based_on_names=frozenset(arg.name), + require_names=False, + )) - gen("# }}}") - gen("") - - # }}} + # }}} - # {{{ integer arg finding from offsets + # {{{ regroup equations by unknown - def generate_integer_arg_finding_from_offsets(self, gen, kernel, - implemented_data_info): - options = kernel.options + unknown_to_equations: Dict[str, List[_ArgFindingEquation]] = {} - gen("# {{{ find integer arguments from offsets") - gen("") + for eqn in equations: + deps = dep_map(eqn.rhs) - for arg in implemented_data_info: - impl_array_name = arg.offset_for_name - if impl_array_name is not None: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("if %s is None:" % impl_array_name) - with Indentation(gen): - gen("# Output variable, we'll be allocating " - "it, with zero offset.") - gen("%s = 0" % arg.name) - gen("else:") - with Indentation(gen): - if not options.no_numpy: - gen('_lpy_offset = getattr(%s, "offset", 0)' - % impl_array_name) - else: - gen("_lpy_offset = %s.offset" % impl_array_name) + if len(deps) == 1: + unknown_var, = deps + unknown_to_equations.setdefault(unknown_var.name, []).append((eqn)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + del equations - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" - % (arg.name, base_arg.dtype.itemsize)) + # }}} - gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - 'not divisible by its dtype itemsize"' - % impl_array_name) - gen("del _lpy_remdr") - else: - gen("%s = _lpy_offset // %d" - % (arg.name, base_arg.dtype.itemsize)) + # {{{ generate arg finding code - if not options.skip_arg_checks: - gen("del _lpy_offset") + from pymbolic.algorithm import solve_affine_equations_for + from pymbolic.primitives import Variable + from pytools.codegen import CodeGenerator - gen("# }}}") + gen("# {{{ find integer arguments from array data") gen("") - # }}} + for unknown_name in sorted(unknown_to_equations): + unk_equations = sorted(unknown_to_equations[unknown_name], + key=lambda eqn: eqn.order) + req_subgen = CodeGenerator() + not_req_subgen = CodeGenerator() - # {{{ integer arg finding from strides + seen_based_on_names: Set[FrozenSet[str]] = set() - def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + if_or_elif = "if" - gen("# {{{ find integer arguments from strides") - gen("") + for eqn in unk_equations: + try: + # overkill :) + value_expr = solve_affine_equations_for( + [unknown_name], + [(eqn.lhs, eqn.rhs)] + )[Variable(unknown_name)] + except Exception as e: + # went wrong? oh well + from warnings import warn + warn("Unable to generate code to automatically " + f"find '{unknown_name}' " + f"from '{', '.join(eqn.based_on_names)}':\n" + f"{e}", ParameterFinderWarning) + continue + + # Do not use more than one bit of data from each of the + # 'based_on_names' to find each value, i.e. if a value can be + # found via shape and strides, only one of them suffices. + # This also helps because strides can be unreliable in the + # face of zero-length axes. 
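A minimal sketch, not part of the patch, of how the CodeGenerator/Indentation pair used below assembles the argument-finding text for a shape-based equation; "n" and "a" are illustrative names.

from pytools.codegen import CodeGenerator, Indentation

gen = CodeGenerator()
gen("if n is None:")
with Indentation(gen):
    gen("if a is not None:")
    with Indentation(gen):
        gen("n = a.shape[0]")

print("\n".join(gen.code))  # the block that ends up in the generated invoker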
+ if eqn.based_on_names in seen_based_on_names: + continue + seen_based_on_names.add(eqn.based_on_names) + + if eqn.require_names: + condition = " and ".join( + f"{ary_name} is not None" + for ary_name in eqn.based_on_names) + req_subgen(f"{if_or_elif} {condition}:") + with Indentation(req_subgen): + req_subgen( + f"{unknown_name} = {StringifyMapper()(value_expr)}") + if_or_elif = "elif" + + req_subgen("") + else: + not_req_subgen( + f"{unknown_name} = {StringifyMapper()(value_expr)}") - for arg in implemented_data_info: - if arg.stride_for_name_and_axis is not None: - impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis + not_req_subgen("") - gen("if %s is None:" % arg.name) + if not_req_subgen.code: + gen(f"if {unknown_name} is None:") with Indentation(gen): - if not options.skip_arg_checks: - gen("if %s is None:" % impl_array_name) + gen.extend(not_req_subgen) + + if req_subgen.code: + # still? try the req_subgen + gen(f"if {unknown_name} is None:") with Indentation(gen): - gen("raise RuntimeError(\"required stride '%s' for " - "argument '%s' not given or deducible from " - 'passed array")' - % (arg.name, impl_array_name)) - - base_arg = kernel.impl_arg_to_arg[impl_array_name] - - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" - % (arg.name, impl_array_name, stride_impl_axis, - base_arg.dtype.dtype.itemsize)) - - gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' " - ' is not divisible by its dtype itemsize"' - % (stride_impl_axis, impl_array_name)) - gen("del _lpy_remdr") - else: - gen("%s = %s.strides[%d] // %d" - % (arg.name, impl_array_name, stride_impl_axis, - base_arg.dtype.itemsize)) + gen.extend(req_subgen) + elif req_subgen.code: + gen(f"if {unknown_name} is None:") + with Indentation(gen): + gen.extend(req_subgen) gen("# }}}") gen("") + # }}} + # }}} # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): + self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo + ) -> None: if kernel.options.skip_arg_checks: return @@ -325,8 +329,9 @@ class ExecutionWrapperGeneratorBase(ABC): gen("# {{{ check that value args are present") gen("") - for arg in implemented_data_info: - if not issubclass(arg.arg_class, ValueArg): + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + if not isinstance(arg, ValueArg): continue gen("if %s is None:" % arg.name) @@ -342,14 +347,17 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ handle non numpy arguements - def handle_non_numpy_arg(self, gen, arg): + def handle_non_numpy_arg(self, gen: CodeGenerator, arg): raise NotImplementedError() # }}} # {{{ handle allocation of unspecified arguements - def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + def handle_alloc( + self, gen: CodeGenerator, arg: ArrayArg, + strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], + skip_arg_checks: bool) -> None: """ Handle allocation of non-specified arguements for C-execution """ @@ -384,10 +392,12 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, implemented_data_info, options): + self, gen: CodeGenerator, kernel: LoopKernel, kai: KernelArgInfo, + ) -> Sequence[str]: + options = kernel.options import loopy as lp - from loopy.kernel.data import KernelArgument + from loopy.kernel.data import ImageArg from loopy.kernel.array import ArrayBase from loopy.symbolic import StringifyMapper from loopy.types import NumpyType @@ -404,21 +414,11 @@ class 
ExecutionWrapperGeneratorBase(ABC): strify = StringifyMapper() - expect_no_more_arguments = False + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + is_written = arg.name in kernel.get_written_variables() - for arg in implemented_data_info: - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - - if not issubclass(arg.arg_class, KernelArgument): - expect_no_more_arguments = True - continue - - if expect_no_more_arguments: - raise LoopyError("Further arguments encountered after arg info " - "describing a global temporary variable") - - if not issubclass(arg.arg_class, ArrayBase): + if not isinstance(arg, ArrayBase): args.append(arg.name) continue @@ -428,15 +428,14 @@ class ExecutionWrapperGeneratorBase(ABC): if not options.no_numpy: self.handle_non_numpy_arg(gen, arg) - if not options.skip_arg_checks and kernel_arg.is_input: + if not options.skip_arg_checks and arg.is_input: gen("if %s is None:" % arg.name) with Indentation(gen): gen("raise RuntimeError(\"input argument '%s' must " 'be supplied")' % arg.name) gen("") - if (is_written - and arg.arg_class is lp.ImageArg + if (is_written and isinstance(arg, ImageArg) and not options.skip_arg_checks): gen("if %s is None:" % arg.name) with Indentation(gen): @@ -455,8 +454,8 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ allocate written arrays, if needed - if kernel_arg.is_output \ - and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \ + if arg.is_output \ + and isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \ and arg.shape is not None \ and all(si is not None for si in arg.shape): @@ -471,7 +470,7 @@ class ExecutionWrapperGeneratorBase(ABC): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -479,7 +478,7 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ argument checking - if arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \ + if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)) \ and not options.skip_arg_checks: if possibly_made_by_loopy: gen("if not _lpy_made_by_loopy:") @@ -489,7 +488,7 @@ class ExecutionWrapperGeneratorBase(ABC): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - gen, kernel_arg.dtype.numpy_dtype))) + gen, arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " '(got: %%s, expected: %s)" %% %s.dtype)' @@ -503,7 +502,9 @@ class ExecutionWrapperGeneratorBase(ABC): else: return strify(shape_axis) - def strify_tuple(t): + def strify_tuple(t: Optional[Tuple[ExpressionT, ...]]) -> str: + if t is None: + return "None" if len(t) == 0: return "()" else: @@ -512,21 +513,22 @@ class ExecutionWrapperGeneratorBase(ABC): for sa in t) shape_mismatch_msg = ( - "raise TypeError(\"shape mismatch on argument '%s' " + "raise ValueError(\"shape mismatch on argument '%s' " '(got: %%s, expected: %%s)" ' "%% (%s.shape, %s))" - % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) + % (arg.name, arg.name, strify_tuple(arg.shape))) - if kernel_arg.shape is None: + if arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in arg.shape): + assert isinstance(arg.shape, tuple) gen("if len(%s.shape) != %s:" - % (arg.name, len(arg.unvec_shape))) + % (arg.name, len(arg.shape))) with Indentation(gen): gen(shape_mismatch_msg) - for i, 
shape_axis in enumerate(arg.unvec_shape): + for i, shape_axis in enumerate(arg.shape): if shape_axis is None: continue @@ -537,18 +539,21 @@ class ExecutionWrapperGeneratorBase(ABC): else: # not None, no Nones in tuple gen("if %s.shape != %s:" - % (arg.name, strify(arg.unvec_shape))) + % (arg.name, strify(arg.shape))) with Indentation(gen): gen(shape_mismatch_msg) # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - sym_strides = tuple( - itemsize*s_i for s_i in arg.unvec_strides) + from loopy.kernel.array import get_strides + strides = get_strides(arg) - ndim = len(arg.unvec_shape) + if strides and arg.dim_tags and arg.shape is not None: + assert isinstance(arg.shape, tuple) + itemsize = arg.dtype.numpy_dtype.itemsize + sym_strides = tuple(itemsize*s_i for s_i in strides) + + ndim = len(arg.shape) shape = ["_lpy_shape_%d" % i for i in range(ndim)] strides = ["_lpy_stride_%d" % i for i in range(ndim)] @@ -561,14 +566,8 @@ class ExecutionWrapperGeneratorBase(ABC): shape, strides, [strify(s) for s in sym_strides])) with Indentation(gen): - gen("_lpy_got = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s.strides) " - ")" - % (arg.name, arg.name)) - gen("_lpy_expected = tuple(stride " - "for (dim, stride) in zip(%s.shape, %s) " - ")" - % (arg.name, strify_tuple(sym_strides))) + gen(f"_lpy_got = {arg.name}.strides") + gen(f"_lpy_expected = {strify_tuple(sym_strides)}") gen('raise ValueError("strides mismatch on ' "argument '%s' " @@ -576,7 +575,7 @@ class ExecutionWrapperGeneratorBase(ABC): "%% (_lpy_got, _lpy_expected))" % arg.name) - if not arg.allows_offset: + if not arg.offset: gen("if hasattr({}, 'offset') and {}.offset:".format( arg.name, arg.name)) with Indentation(gen): @@ -592,7 +591,7 @@ class ExecutionWrapperGeneratorBase(ABC): gen("del _lpy_made_by_loopy") gen("") - if arg.arg_class in [lp.ArrayArg, lp.ConstantArg]: + if isinstance(arg, (lp.ArrayArg, lp.ConstantArg)): args.append(self.get_arg_pass(arg)) else: args.append("%s" % arg.name) @@ -623,17 +622,16 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): + def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel, + kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None: raise NotImplementedError() # }}} # {{{ output - def generate_output_handler( - self, gen, options, kernel, implemented_data_info): - + def generate_output_handler(self, gen: CodeGenerator, + kernel: LoopKernel, kai: KernelArgInfo) -> None: raise NotImplementedError() # }}} @@ -653,16 +651,17 @@ class ExecutionWrapperGeneratorBase(ABC): kernel """ - options = program[entrypoint].options - implemented_data_info = codegen_result.implemented_data_infos[entrypoint] + kernel = program[entrypoint] + options = kernel.options + + from loopy.schedule.tools import get_kernel_arg_info + kai = get_kernel_arg_info(kernel) - from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( "invoke_%s_loopy_kernel" % entrypoint, self.system_args + [ - "%s=None" % idi.name - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) + "%s=None" % arg_name + for arg_name in kai.passed_arg_names ]) self.target_specific_preamble(gen) @@ -672,25 +671,18 @@ class ExecutionWrapperGeneratorBase(ABC): self.initialize_system_args(gen) - self.generate_integer_arg_finding_from_shapes( - gen, program[entrypoint], implemented_data_info) - 
self.generate_integer_arg_finding_from_offsets( - gen, program[entrypoint], implemented_data_info) - self.generate_integer_arg_finding_from_strides( - gen, program[entrypoint], implemented_data_info) - self.generate_value_arg_check( - gen, program[entrypoint], implemented_data_info) - args = self.generate_arg_setup( - gen, program[entrypoint], implemented_data_info, options) + self.generate_integer_arg_finding_from_array_data( + gen, program[entrypoint], kai) + self.generate_value_arg_check(gen, program[entrypoint], kai) + args = self.generate_arg_setup(gen, program[entrypoint], kai) #FIXME: should we make this as a dict as well. host_program_name = codegen_result.host_programs[entrypoint].name - self.generate_invocation(gen, host_program_name, args, - program[entrypoint], implemented_data_info) + self.generate_invocation(gen, program[entrypoint], kai, + host_program_name, args) - self.generate_output_handler(gen, options, program[entrypoint], - implemented_data_info) + self.generate_output_handler(gen, program[entrypoint], kai) if options.write_wrapper: output = gen.get() @@ -710,14 +702,6 @@ class ExecutionWrapperGeneratorBase(ABC): # }}} -class _KernelInfo(ImmutableRecord): - pass - - -class _Kernels: - pass - - typed_and_scheduled_cache = WriteOncePersistentDict( "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -737,18 +721,13 @@ class KernelExecutorBase: .. automethod:: __init__ .. automethod:: __call__ """ + packing_controller: Optional[SeparateArrayPackingController] - def __init__(self, program, entrypoint): - """ - :arg kernel: a loopy.LoopKernel - """ - - self.program = program + def __init__(self, t_unit: TranslationUnit, entrypoint: str): + self.t_unit = t_unit self.entrypoint = entrypoint - self.packing_controller = make_packing_controller(program, entrypoint) - - kernel = self.program[entrypoint] + kernel = self.t_unit[entrypoint] self.output_names = set(arg.name for arg in kernel.args if arg.is_output) from loopy import ArrayArg @@ -759,6 +738,23 @@ class KernelExecutorBase: self.has_runtime_typed_args = any( arg.dtype is None for arg in kernel.args) + # We're doing this ahead of time to learn about array separation. + # This will be done again as part of preprocessing below, and we're + # betting that it happens consistently both times. (No reason it wouldn't, + # but it is done redundantly.) We can't *use* the result of this + # because we need to do the 'official' array separation after type + # inference has completed. + from loopy.preprocess import make_arrays_for_sep_arrays + self.separated_entry_knl = make_arrays_for_sep_arrays( + self.t_unit[self.entrypoint]) + + self.sep_info = self.separated_entry_knl._separation_info() + if self.sep_info: + self.packing_controller = SeparateArrayPackingController(self.sep_info) + else: + self.packing_controller = None + return None + def check_for_required_array_arguments(self, input_args): # Formerly, the first exception raised when a required argument is not # passed was often at type inference. This exists to raise a more meaningful @@ -768,7 +764,7 @@ class KernelExecutorBase: # and links therin for context. if not self.input_array_names <= set(input_args): missing_args = self.input_array_names - set(input_args) - kernel = self.program[self.entrypoint] + kernel = self.t_unit[self.entrypoint] raise LoopyError( f"Kernel {kernel.name}() missing required array input arguments: " f"{', '.join(missing_args)}. 
" @@ -776,44 +772,48 @@ class KernelExecutorBase: "your argument.") def get_typed_and_scheduled_translation_unit_uncached( - self, entrypoint, arg_to_dtype_set): - from loopy.kernel.tools import add_dtypes - from loopy.kernel import KernelState - from loopy.translation_unit import resolve_callables - - program = resolve_callables(self.program) - - if arg_to_dtype_set: - var_to_dtype = {} - entry_knl = program[entrypoint] - for var, dtype in arg_to_dtype_set: - if var in entry_knl.impl_arg_to_arg: - dest_name = entry_knl.impl_arg_to_arg[var].name - else: - dest_name = var + self, entrypoint, arg_to_dtype: Optional[Map[str, LoopyType]] + ) -> TranslationUnit: + t_unit = self.t_unit + + if arg_to_dtype: + entry_knl = t_unit[entrypoint] - var_to_dtype[dest_name] = dtype + # FIXME: This is not so nice. This transfers types from the + # subarrays of sep-tagged arrays to the 'main' array, because + # type inference fails otherwise. + with arg_to_dtype.mutate() as mm: + for name, sep_info in self.sep_info.items(): + if entry_knl.arg_dict[name].dtype is None: + for sep_name in sep_info.subarray_names.values(): + if sep_name in arg_to_dtype: + mm.set(name, arg_to_dtype[sep_name]) + del mm[sep_name] - program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) + arg_to_dtype = mm.finish() + + from loopy.kernel.tools import add_dtypes + t_unit = t_unit.with_kernel(add_dtypes(entry_knl, arg_to_dtype)) from loopy.type_inference import infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) + t_unit = infer_unknown_types(t_unit, expect_completion=True) - if program.state < KernelState.LINEARIZED: + if t_unit.state < KernelState.PREPROCESSED: from loopy.preprocess import preprocess_program - program = preprocess_program(program) + t_unit = preprocess_program(t_unit) - from loopy.schedule import get_one_linearized_kernel - for e in program.entrypoints: - program = program.with_kernel( - get_one_linearized_kernel(program[e], program.callables_table)) + if t_unit.state < KernelState.LINEARIZED: + from loopy.schedule import linearize + t_unit = linearize(t_unit) - return program + return t_unit - def get_typed_and_scheduled_translation_unit(self, entrypoint, arg_to_dtype_set): + def get_typed_and_scheduled_translation_unit( + self, entrypoint: str, arg_to_dtype: Optional[Map[str, LoopyType]] + ) -> TranslationUnit: from loopy import CACHING_ENABLED - cache_key = (type(self).__name__, self.program, arg_to_dtype_set) + cache_key = (type(self).__name__, self.t_unit, arg_to_dtype) if CACHING_ENABLED: try: @@ -822,29 +822,24 @@ class KernelExecutorBase: pass logger.debug("%s: typed-and-scheduled cache miss" % - self.program.entrypoints) + self.t_unit.entrypoints) kernel = self.get_typed_and_scheduled_translation_unit_uncached(entrypoint, - arg_to_dtype_set) + arg_to_dtype) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) return kernel - def arg_to_dtype_set(self, kwargs): - kwargs = kwargs.copy() + def arg_to_dtype(self, kwargs) -> Optional[Map[str, LoopyType]]: if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.program[self.entrypoint].impl_arg_to_arg + arg_dict = self.separated_entry_knl.arg_dict arg_to_dtype = {} for arg_name, val in kwargs.items(): - arg = impl_arg_to_arg.get(arg_name, None) - - if arg is None: - # offsets, strides and such - continue + arg = arg_dict[arg_name] if arg.dtype is None and val is not None: try: @@ -852,9 +847,9 @@ class KernelExecutorBase: except AttributeError: pass else: - 
arg_to_dtype[arg_name] = dtype + arg_to_dtype[arg_name] = NumpyType(dtype) - return frozenset(arg_to_dtype.items()) + return Map(arg_to_dtype) # {{{ debugging aids @@ -863,20 +858,9 @@ class KernelExecutorBase: code = self.get_code(entrypoint, arg_to_dtype) return get_highlighted_code(code) - def get_code(self, entrypoint, arg_to_dtype=None): - def process_dtype(dtype): - if isinstance(dtype, type) and issubclass(dtype, np.generic): - dtype = np.dtype(dtype) - if isinstance(dtype, np.dtype): - from loopy.types import NumpyType - dtype = NumpyType(dtype, self.program.target) - - return dtype - - if arg_to_dtype is not None: - arg_to_dtype = frozenset( - (k, process_dtype(v)) for k, v in arg_to_dtype.items()) - + def get_code( + self, entrypoint: str, + arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> str: kernel = self.get_typed_and_scheduled_translation_unit( entrypoint, arg_to_dtype) @@ -887,10 +871,10 @@ class KernelExecutorBase: def get_invoker_uncached(self, program, entrypoint, *args): raise NotImplementedError() - def get_invoker(self, program, entrypoint, *args): + def get_invoker(self, t_unit, entrypoint, *args): from loopy import CACHING_ENABLED - cache_key = (self.__class__.__name__, (program, entrypoint)) + cache_key = (self.__class__.__name__, (t_unit, entrypoint)) if CACHING_ENABLED: try: @@ -900,7 +884,7 @@ class KernelExecutorBase: logger.debug("%s: invoker cache miss" % entrypoint) - invoker = self.get_invoker_uncached(program, entrypoint, *args) + invoker = self.get_invoker_uncached(t_unit, entrypoint, *args) if CACHING_ENABLED: invoker_cache.store_if_not_present(cache_key, invoker) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 3c6ff52b052035a0680146df3b7272751b699373..9974803c5c08ea4696f91b3e7ba31213f23c35c7 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -24,17 +24,25 @@ THE SOFTWARE. """ +from typing import cast, Tuple + import numpy as np # noqa +import pymbolic.primitives as p +from pymbolic import var +from pymbolic.mapper.stringifier import PREC_NONE +from pytools import memoize_method +from cgen import Generable, Declarator, Const + from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.diagnostic import LoopyError from loopy.symbolic import Literal -from pymbolic import var -import pymbolic.primitives as p -from loopy.kernel.data import AddressSpace -from pymbolic.mapper.stringifier import PREC_NONE - -from pytools import memoize_method +from loopy.schedule import CallKernel +from loopy.typing import ExpressionT +from loopy.types import LoopyType +from loopy.kernel.data import AddressSpace, TemporaryVariable, ArrayArg +from loopy.codegen import CodeGenerationState +from loopy.codegen.result import CodeGenerationResult # {{{ expression mapper @@ -110,7 +118,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): from loopy.kernel.array import get_access_info from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, expr.index, + access_info = get_access_info(self.kernel, ary, expr.index, lambda expr: evaluate(expr, self.codegen_state.var_subst_map), self.codegen_state.vectorization_info) @@ -159,15 +167,6 @@ class ISPCTarget(CFamilyTarget): Intel CPUs with wide vector units. 
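A minimal sketch, not part of the patch, of what the reworked arg_to_dtype in loopy/target/execution.py above now returns for runtime-typed arguments: an immutables.Map of LoopyType values, which is hashable and can therefore serve directly in the typed-and-scheduled cache key. The names are illustrative.

import numpy as np
from immutables import Map
from loopy.types import NumpyType

a = np.zeros(16, dtype=np.float64)   # a hypothetical untyped array argument
arg_to_dtype = Map({"a": NumpyType(a.dtype)})
assert arg_to_dtype["a"].numpy_dtype == np.float64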
""" - def __init__(self, occa_mode=False): - """ - :arg occa_mode: Whether to modify the generated call signature to - be compatible with OCCA - """ - self.occa_mode = occa_mode - - super().__init__() - host_program_name_suffix = "" device_program_name_suffix = "_inner" @@ -201,43 +200,36 @@ class ISPCTarget(CFamilyTarget): class ISPCASTBuilder(CFamilyASTBuilder): - def _arg_names_and_decls(self, codegen_state): - implemented_data_info = codegen_state.implemented_data_info - arg_names = [iai.name for iai in implemented_data_info] - - arg_decls = [ - self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in implemented_data_info] - - # {{{ occa compatibility hackery - - from cgen import Value - if self.target.occa_mode: - from cgen import ArrayOf, Const - from cgen.ispc import ISPCUniform - - arg_decls = [ - Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))), - Const(ISPCUniform(Value("int", "o1"))), - Const(ISPCUniform(Value("int", "o2"))), - Const(ISPCUniform(Value("int", "o3"))), - ] + arg_decls - arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names - - # }}} - - return arg_names, arg_decls - # {{{ top-level codegen - def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + def get_function_declaration(self, codegen_state: CodeGenerationState, + codegen_result: CodeGenerationResult, schedule_index: int) -> Generable: name = codegen_result.current_program(codegen_state).name + kernel = codegen_state.kernel + + assert codegen_state.kernel.linearization is not None + subkernel_name = cast( + CallKernel, + codegen_state.kernel.linearization[schedule_index] + ).kernel_name from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + if codegen_state.is_entrypoint: + # subkernel launches occur only as part of entrypoint kernels for now + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name) + passed_names = skai.passed_names + written_names = skai.written_names + else: + passed_names = [arg.name for arg in kernel.args] + written_names = kernel.get_written_variables() + + arg_decls = [self.arg_to_cgen_declarator( + kernel, arg_name, + is_written=arg_name in written_names) + for arg_name in passed_names] if codegen_state.is_generating_device_code: result = ISPCTask( @@ -253,9 +245,11 @@ class ISPCASTBuilder(CFamilyASTBuilder): from loopy.target.c import FunctionDeclarationWrapper return FunctionDeclarationWrapper(result) - # }}} - - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, codegen_state: CodeGenerationState, + subkernel_name: str, + gsize: Tuple[ExpressionT, ...], + lsize: Tuple[ExpressionT, ...]) -> Generable: + kernel = codegen_state.kernel ecm = self.get_expression_to_code_mapper(codegen_state) from pymbolic.mapper.stringifier import PREC_NONE @@ -267,19 +261,27 @@ class ISPCASTBuilder(CFamilyASTBuilder): "assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE))) - arg_names, arg_decls = self._arg_names_and_decls(codegen_state) + if codegen_state.is_entrypoint: + # subkernel launches occur only as part of entrypoint kernels for now + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(codegen_state.kernel, subkernel_name) + passed_names = skai.passed_names + else: + passed_names = [arg.name for arg in kernel.args] from cgen.ispc import ISPCLaunch result.append( ISPCLaunch( 
tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize), "{}({})".format( - name, - ", ".join(arg_names) + subkernel_name, + ", ".join(passed_names) ))) return Block(result) + # }}} + # {{{ code generation guts def get_expression_to_c_expression_mapper(self, codegen_state): @@ -302,17 +304,43 @@ class ISPCASTBuilder(CFamilyASTBuilder): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): - from loopy.target.c import POD # uses the correct complex type - temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + # }}} + + # {{{ declarators + + def get_value_arg_declaraotor( + self, name: str, dtype: LoopyType, is_written: bool) -> Declarator: + from cgen.ispc import ISPCUniform + return ISPCUniform(super().get_value_arg_declaraotor( + name, dtype, is_written)) + + def get_array_arg_declarator( + self, arg: ArrayArg, is_written: bool) -> Declarator: + # FIXME restrict? + from cgen.ispc import ISPCUniformPointer, ISPCUniform + decl = ISPCUniform( + ISPCUniformPointer(self.get_array_base_declarator(arg))) - shape = decl_info.shape + if not is_written: + decl = Const(decl) + + return decl + + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable) -> Declarator: + temp_var_decl = self.get_array_base_declarator(temp_var) + + shape = temp_var.shape + + assert isinstance(shape, tuple) if temp_var.address_space == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) - _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs() + _, lsize = codegen_state.kernel.get_grid_size_upper_bounds_as_exprs( + codegen_state.callables_table) shape = lsize + shape if shape: @@ -325,50 +353,9 @@ class ISPCASTBuilder(CFamilyASTBuilder): return temp_var_decl - def wrap_temporary_decl(self, decl, scope): - from cgen.ispc import ISPCUniform - return ISPCUniform(decl) - - def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): - from loopy.target.c import POD # uses the correct complex type - from cgen import Const - from cgen.ispc import ISPCUniformPointer, ISPCUniform - - arg_decl = ISPCUniformPointer(POD(self, dtype, name)) - - if not is_written: - arg_decl = Const(arg_decl) - - arg_decl = ISPCUniform(arg_decl) - - return arg_decl - - def get_global_arg_decl(self, name, shape, dtype, is_written): - from warnings import warn - warn("get_global_arg_decl is deprecated use get_array_arg_decl " - "instead.", DeprecationWarning, stacklevel=2) - return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, - dtype, is_written) - - def get_value_arg_decl(self, name, shape, dtype, is_written): - result = super().get_value_arg_decl( - name, shape, dtype, is_written) - - from cgen import Reference, Const - was_const = isinstance(result, Const) - - if was_const: - result = result.subdecl - - if self.target.occa_mode: - result = Reference(result) - - if was_const: - result = Const(result) - - from cgen.ispc import ISPCUniform - return ISPCUniform(result) + # }}} + # {{{ emit_... 
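A minimal sketch, not part of the patch, of the private-temporary duplication done in get_temporary_var_declarator above: the gang size is prepended to the declared shape so each program instance gets its own copy. The sizes are made up.

lsize = (8,)            # per-group size from get_grid_size_upper_bounds_as_exprs
shape = (4,)            # declared shape of a private temporary
shape = lsize + shape
assert shape == (8, 4)  # one copy of the temporary per program instance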
def emit_assignment(self, codegen_state, insn): kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper @@ -404,7 +391,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): index_tuple = tuple( simplify_using_aff(kernel, idx) for idx in lhs.index_tuple) - access_info = get_access_info(kernel.target, ary, index_tuple, + access_info = get_access_info(kernel, ary, index_tuple, lambda expr: evaluate(expr, codegen_state.var_subst_map), codegen_state.vectorization_info) @@ -505,6 +492,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): PREC_NONE, "i"), "++%s" % iname, inner) + # }}} diff --git a/loopy/target/numba.py b/loopy/target/numba.py deleted file mode 100644 index 2df81ec1f332be87d8ca361480a37b68b369b56f..0000000000000000000000000000000000000000 --- a/loopy/target/numba.py +++ /dev/null @@ -1,219 +0,0 @@ -"""Python host AST builder for integration with PyOpenCL.""" - - -__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-""" - - -from pytools import memoize_method - -from loopy.target.python import ExpressionToPythonMapper, PythonASTBuilderBase -from loopy.target import TargetBase, DummyHostASTBuilder - -from loopy.diagnostic import LoopyWarning - - -# {{{ base numba - -def _base_numba_preamble_generator(preamble_info): - yield ("06_numba_imports", """ - import numba as _lpy_numba - """) - - -class NumbaBaseASTBuilder(PythonASTBuilderBase): - def preamble_generators(self): - return ( - super().preamble_generators() + [ - _base_numba_preamble_generator - ]) - - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, - function_decl, function_body): - - assert function_decl is None - - from genpy import Function - return Function( - codegen_result.current_program(codegen_state).name, - [idi.name for idi in codegen_state.implemented_data_info], - function_body, - decorators=self.get_python_function_decorators()) - - def get_python_function_decorators(self): - return () - - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): - from pymbolic.mapper.stringifier import PREC_NONE - from genpy import Statement - - ecm = self.get_expression_to_code_mapper(codegen_state) - implemented_data_info = codegen_state.implemented_data_info - - return Statement( - "{}[{}, {}]({})".format( - name, - ecm(gsize, PREC_NONE), - ecm(lsize, PREC_NONE), - ", ".join(idi.name for idi in implemented_data_info) - )) - - -class NumbaJITASTBuilder(NumbaBaseASTBuilder): - def get_python_function_decorators(self): - return ("@_lpy_numba.jit",) - - -class NumbaTarget(TargetBase): - """A target for plain Python as understood by Numba, without any parallel extensions. - """ - - def __init__(self): - from warnings import warn - warn("The Numba targets are not yet feature-complete", - LoopyWarning, stacklevel=2) - - def split_kernel_at_global_barriers(self): - return False - - def get_host_ast_builder(self): - return DummyHostASTBuilder(self) - - def get_device_ast_builder(self): - return NumbaJITASTBuilder(self) - - # {{{ types - - @memoize_method - def get_dtype_registry(self): - from loopy.target.c import DTypeRegistryWrapper - from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_registry_with_c_types) - result = DTypeRegistry() - fill_registry_with_c_types(result, respect_windows=False, - include_bool=True) - return DTypeRegistryWrapper(result) - - def is_vector_dtype(self, dtype): - return False - - def get_vector_dtype(self, base, count): - raise KeyError() - - def get_or_register_dtype(self, names, dtype=None): - # These kind of shouldn't be here. - return self.get_dtype_registry().get_or_register_dtype(names, dtype) - - def dtype_to_typename(self, dtype): - # These kind of shouldn't be here. 
- return self.get_dtype_registry().dtype_to_ctype(dtype) - - # }}} - -# }}} - - -# {{{ numba.cuda - -class NumbaCudaExpressionToPythonMapper(ExpressionToPythonMapper): - _GRID_AXES = "xyz" - - def map_group_hw_index(self, expr, enclosing_prec): - return "_lpy_ncu.blockIdx.%s" % self._GRID_AXES[expr.axis] - - def map_local_hw_index(self, expr, enclosing_prec): - return "_lpy_ncu.threadIdx.%s" % self._GRID_AXES[expr.axis] - - -def _cuda_numba_preamble_generator(preamble_info): - yield ("06_import_numba_cuda", """ - import numba.cuda as _lpy_ncu - """) - - -class NumbaCudaASTBuilder(NumbaBaseASTBuilder): - def preamble_generators(self): - return ( - super().preamble_generators() + [ - _cuda_numba_preamble_generator - ]) - - def get_python_function_decorators(self): - return ("@_lpy_ncu.jit",) - - def get_expression_to_code_mapper(self, codegen_state): - return NumbaCudaExpressionToPythonMapper(codegen_state) - - -class NumbaCudaTarget(TargetBase): - """A target for Numba with CUDA extensions. - """ - - host_program_name_suffix = "" - device_program_name_suffix = "_inner" - - def __init__(self): - from warnings import warn - warn("The Numba target is not yet feature-complete", - LoopyWarning, stacklevel=2) - - def split_kernel_at_global_barriers(self): - return True - - def get_host_ast_builder(self): - return NumbaBaseASTBuilder(self) - - def get_device_ast_builder(self): - return NumbaCudaASTBuilder(self) - - # {{{ types - - @memoize_method - def get_dtype_registry(self): - from loopy.target.c import DTypeRegistryWrapper - from loopy.target.c.compyte.dtypes import ( - DTypeRegistry, fill_registry_with_c_types) - result = DTypeRegistry() - fill_registry_with_c_types(result, respect_windows=False, - include_bool=True) - return DTypeRegistryWrapper(result) - - def is_vector_dtype(self, dtype): - return False - - def get_vector_dtype(self, base, count): - raise KeyError() - - def get_or_register_dtype(self, names, dtype=None): - # These kind of shouldn't be here. - return self.get_dtype_registry().get_or_register_dtype(names, dtype) - - def dtype_to_typename(self, dtype): - # These kind of shouldn't be here. - return self.get_dtype_registry().dtype_to_ctype(dtype) - - # }}} - -# }}} - -# vim: foldmethod=marker diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 437dd66818e4b11dab784851cdaba631007eafb0..d9b23670ea92d7f1576a7a0f313286e3eade4196 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -24,16 +24,18 @@ THE SOFTWARE. 
""" import numpy as np +from pymbolic import var +from pytools import memoize_method +from cgen import Declarator from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from pytools import memoize_method from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper -from loopy.kernel.data import AddressSpace +from loopy.kernel.array import VectorArrayDimTag, FixedStrideArrayDimTag, ArrayBase +from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg from loopy.kernel.function_interface import ScalarCallable -from pymbolic import var # {{{ dtype registry wrappers @@ -668,8 +670,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): # }}} - # {{{ code generation guts - def get_expression_to_c_expression_mapper(self, codegen_state): return ExpressionToOpenCLCExpressionMapper(codegen_state) @@ -695,68 +695,71 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, scope): - if scope == AddressSpace.LOCAL: - from cgen.opencl import CLLocal + # {{{ declarators + + def wrap_decl_for_address_space( + self, decl: Declarator, address_space: AddressSpace) -> Declarator: + from cgen.opencl import CLGlobal, CLLocal + if address_space == AddressSpace.GLOBAL: + return CLGlobal(decl) + elif address_space == AddressSpace.LOCAL: return CLLocal(decl) - elif scope == AddressSpace.PRIVATE: + elif address_space == AddressSpace.PRIVATE: return decl else: - raise ValueError("unexpected temporary variable scope: %s" - % scope) + raise ValueError("unexpected temporary variable address space: %s" + % address_space) + + def wrap_global_constant(self, decl: Declarator) -> Declarator: + from cgen.opencl import CLGlobal, CLConstant + assert isinstance(decl, CLGlobal) + decl = decl.subdecl - def wrap_global_constant(self, decl): - from cgen.opencl import CLConstant return CLConstant(decl) - def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): - from cgen.opencl import CLGlobal, CLLocal - from loopy.kernel.data import AddressSpace + # duplicated in CUDA, update there if updating here + def get_array_base_declarator(self, ary: ArrayBase) -> Declarator: + dtype = ary.dtype - if mem_address_space == AddressSpace.LOCAL: - return CLLocal(super().get_array_arg_decl( - name, mem_address_space, shape, dtype, is_written)) - elif mem_address_space == AddressSpace.PRIVATE: - return super().get_array_arg_decl( - name, mem_address_space, shape, dtype, is_written) - elif mem_address_space == AddressSpace.GLOBAL: - return CLGlobal(super().get_array_arg_decl( - name, mem_address_space, shape, dtype, is_written)) - else: - raise ValueError("unexpected array argument scope: %s" - % mem_address_space) + vec_size = ary.vector_size(self.target) + if vec_size > 1: + dtype = self.target.vector_dtype(dtype, vec_size) - def get_global_arg_decl(self, name, shape, dtype, is_written): - from loopy.kernel.data import AddressSpace - from warnings import warn - warn("get_global_arg_decl is deprecated use get_array_arg_decl " - "instead.", DeprecationWarning, stacklevel=2) + if ary.dim_tags: + for dim_tag in ary.dim_tags: + if isinstance(dim_tag, (FixedStrideArrayDimTag, VectorArrayDimTag)): + # we're OK with those + pass + + else: + raise NotImplementedError( + f"{type(self).__name__} does not understand axis tag " + f"'{type(dim_tag)}.") + + from loopy.target.c import POD + return POD(self, 
dtype, ary.name) + + def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator: + from cgen import RestrictPointer + from cgen.opencl import CLConstant - return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, - dtype, is_written) + # constant *is* an address space as far as CL is concerned, do not re-wrap + return CLConstant(RestrictPointer(self.get_array_base_declarator( + arg))) - def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): + def get_image_arg_declarator( + self, arg: ImageArg, is_written: bool) -> Declarator: if is_written: mode = "w" else: mode = "r" from cgen.opencl import CLImage - return CLImage(num_target_axes, mode, name) - - def get_constant_arg_decl(self, name, shape, dtype, is_written): - from loopy.target.c import POD # uses the correct complex type - from cgen import RestrictPointer, Const - from cgen.opencl import CLConstant - - arg_decl = RestrictPointer(POD(self, dtype, name)) + return CLImage(arg.num_target_axes(), mode, arg.name) - if not is_written: - arg_decl = Const(arg_decl) - - return CLConstant(arg_decl) + # }}} - # {{{ + # {{{ atomics def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): @@ -766,10 +769,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) - # }}} - - # {{{ code generation for atomic update - def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): from pymbolic.mapper.stringifier import PREC_NONE @@ -881,8 +880,6 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): # }}} - # }}} - # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 01dc024c694244c3461944bf79ffdb962216a146..66dd9ae3f255981df67f59ec202881f4e8f77928 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -22,12 +22,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from typing import Sequence, Mapping, Tuple, Dict, List, Union + import numpy as np import pymbolic.primitives as p +import genpy from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase +from loopy.kernel import LoopKernel from loopy.types import NumpyType from loopy.diagnostic import LoopyError, LoopyTypeError from warnings import warn @@ -527,20 +531,22 @@ class PyOpenCLTarget(OpenCLTarget): # {{{ host code: value arg setup -def generate_value_arg_setup(kernel, implemented_data_info): +def generate_value_arg_setup( + kernel: LoopKernel, passed_names: Sequence[str] + ) -> Tuple[genpy.Generable, Mapping[int, int], int]: options = kernel.options import loopy as lp from loopy.kernel.array import ArrayBase cl_arg_idx = 0 - arg_idx_to_cl_arg_idx = {} + arg_idx_to_cl_arg_idx: Dict[int, int] = {} fp_arg_count = 0 from genpy import If, Raise, Statement as S, Suite - result = [] + result: List[str] = [] gen = result.append buf_indices_and_args = [] @@ -557,11 +563,18 @@ def generate_value_arg_setup(kernel, implemented_data_info): buf_indices_and_args.append(arg_idx) buf_indices_and_args.append(f"pack('{typechar}', {expr_str})") - for arg_idx, idi in enumerate(implemented_data_info): + for arg_idx, passed_name in enumerate(passed_names): arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx - if not issubclass(idi.arg_class, lp.ValueArg): - assert issubclass(idi.arg_class, ArrayBase) + if passed_name in kernel.all_inames(): + add_buf_arg(cl_arg_idx, kernel.index_dtype.numpy_dtype.char, passed_name) + cl_arg_idx += 1 + continue + + var_descr = kernel.get_var_descriptor(passed_name) + + if not isinstance(var_descr, lp.ValueArg): + assert isinstance(var_descr, ArrayBase) # assume each of those generates exactly one... 
cl_arg_idx += 1 @@ -569,20 +582,20 @@ def generate_value_arg_setup(kernel, implemented_data_info): continue if not options.skip_arg_checks: - gen(If("%s is None" % idi.name, - Raise('RuntimeError("input argument \'{name}\' ' - 'must be supplied")'.format(name=idi.name)))) + gen(If(f"{passed_name} is None", + Raise('RuntimeError("input argument \'{name}\' ' + 'must be supplied")'.format(name=passed_name)))) - if idi.dtype.is_composite(): + if var_descr.dtype.is_composite(): buf_indices_and_args.append(cl_arg_idx) - buf_indices_and_args.append(f"{idi.name}") + buf_indices_and_args.append(f"{passed_name}") cl_arg_idx += 1 - elif idi.dtype.is_complex(): - assert isinstance(idi.dtype, NumpyType) + elif var_descr.dtype.is_complex(): + assert isinstance(var_descr.dtype, NumpyType) - dtype = idi.dtype + dtype = var_descr.dtype if dtype.numpy_dtype == np.complex64: arg_char = "f" @@ -594,21 +607,21 @@ def generate_value_arg_setup(kernel, implemented_data_info): buf_indices_and_args.append(cl_arg_idx) buf_indices_and_args.append( f"_lpy_pack('{arg_char}{arg_char}', " - f"{idi.name}.real, {idi.name}.imag)") + f"{passed_name}.real, {passed_name}.imag)") cl_arg_idx += 1 fp_arg_count += 2 - elif isinstance(idi.dtype, NumpyType): - if idi.dtype.dtype.kind == "f": + elif isinstance(var_descr.dtype, NumpyType): + if var_descr.dtype.dtype.kind == "f": fp_arg_count += 1 - add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name) + add_buf_arg(cl_arg_idx, var_descr.dtype.dtype.char, passed_name) cl_arg_idx += 1 else: raise LoopyError("do not know how to pass argument of type '%s'" - % idi.dtype) + % var_descr.dtype) for arg_kind, args_and_indices, entry_length in [ ("_buf", buf_indices_and_args, 2), @@ -625,18 +638,23 @@ def generate_value_arg_setup(kernel, implemented_data_info): # }}} -def generate_array_arg_setup(kernel, implemented_data_info, arg_idx_to_cl_arg_idx): +def generate_array_arg_setup(kernel: LoopKernel, passed_names: Sequence[str], + arg_idx_to_cl_arg_idx: Mapping[int, int]) -> genpy.Generable: from loopy.kernel.array import ArrayBase from genpy import Statement as S, Suite - result = [] + result: List[str] = [] gen = result.append - cl_indices_and_args = [] - for arg_idx, arg in enumerate(implemented_data_info): - if issubclass(arg.arg_class, ArrayBase): + cl_indices_and_args: List[Union[int, str]] = [] + for arg_idx, passed_name in enumerate(passed_names): + if passed_name in kernel.all_inames(): + continue + + var_descr = kernel.get_var_descriptor(passed_name) + if isinstance(var_descr, ArrayBase): cl_indices_and_args.append(arg_idx_to_cl_arg_idx[arg_idx]) - cl_indices_and_args.append(arg.name) + cl_indices_and_args.append(passed_name) if cl_indices_and_args: assert len(cl_indices_and_args) % 2 == 0 @@ -656,13 +674,18 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # {{{ code generation guts - def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): - from loopy.kernel.data import TemporaryVariable + def get_function_definition( + self, codegen_state, codegen_result, + schedule_index: int, function_decl, function_body: genpy.Generable + ) -> genpy.Function: + assert schedule_index == 0 + + from loopy.schedule.tools import get_kernel_arg_info + kai = get_kernel_arg_info(codegen_state.kernel) + + args = ( ["_lpy_cl_kernels", "queue"] - + [idi.name for idi in codegen_state.implemented_data_info - if not issubclass(idi.arg_class, TemporaryVariable)] + + [arg_name for arg_name in kai.passed_arg_names] + ["wait_for=None", "allocator=None"]) from
genpy import (For, Function, Suite, Return, Line, Statement as S) @@ -702,14 +725,6 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): def get_temporary_decls(self, codegen_state, schedule_index): from genpy import Assign, Comment, Line - from collections import defaultdict - from numbers import Number - import pymbolic.primitives as prim - - def alloc_nbytes(tv): - from functools import reduce - from operator import mul - return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) from pymbolic.mapper.stringifier import PREC_NONE ecm = self.get_expression_to_code_mapper(codegen_state) @@ -718,37 +733,14 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): if not global_temporaries: return [] - # {{{ allocate space for the base_storage - - base_storage_sizes = defaultdict(set) - - for tv in global_temporaries: - if tv.base_storage: - base_storage_sizes[tv.base_storage].add(tv.nbytes) - - # }}} - allocated_var_names = [] code_lines = [] code_lines.append(Line()) code_lines.append(Comment("{{{ allocate global temporaries")) code_lines.append(Line()) - for name, sizes in base_storage_sizes.items(): - if all(isinstance(s, Number) for s in sizes): - size = max(sizes) - else: - size = prim.Max(tuple(sizes)) - - allocated_var_names.append(name) - code_lines.append(Assign(name, - f"allocator({ecm(size, PREC_NONE, 'i')})")) - for tv in global_temporaries: - if tv.base_storage: - assert tv.base_storage in base_storage_sizes - code_lines.append(Assign(tv.name, tv.base_storage)) - else: + if not tv.base_storage: nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") allocated_var_names.append(tv.name) code_lines.append(Assign(tv.name, @@ -763,7 +755,12 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): return code_lines - def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + def get_kernel_call(self, + codegen_state, subkernel_name, gsize, lsize): + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info( + codegen_state.kernel, subkernel_name) + ecm = self.get_expression_to_code_mapper(codegen_state) if not gsize: @@ -771,16 +768,10 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): if not lsize: lsize = (1,) - all_args = codegen_state.implemented_data_info + extra_args - value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ - generate_value_arg_setup( - codegen_state.kernel, - all_args) + generate_value_arg_setup(codegen_state.kernel, skai.passed_names) arry_arg_code = generate_array_arg_setup( - codegen_state.kernel, - all_args, - arg_idx_to_cl_arg_idx) + codegen_state.kernel, skai.passed_names, arg_idx_to_cl_arg_idx) from genpy import Suite, Assign, Assert, Line, Comment from pymbolic.mapper.stringifier import PREC_NONE @@ -794,10 +785,11 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # TODO: Generate finer-grained dependency structure return Suite([ - Comment("{{{ enqueue %s" % name), + Comment("{{{ enqueue %s" % subkernel_name), Line(), - Assign("_lpy_knl", "_lpy_cl_kernels."+name), - Assert(f"_lpy_knl.num_args == {cl_arg_count}, f'Kernel \"{name}\" " + Assign("_lpy_knl", "_lpy_cl_kernels."+subkernel_name), + Assert(f"_lpy_knl.num_args == {cl_arg_count}, " + f"f'Kernel \"{subkernel_name}\" " f"invoker argument count ({cl_arg_count}) does not match the " # No f"" here since {_lpy_knl.num_args} needs to be evaluated # at runtime, not here. 
@@ -858,7 +850,6 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): def get_expression_to_c_expression_mapper(self, codegen_state): return ExpressionToPyOpenCLCExpressionMapper(codegen_state) - # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index d20ddf1d2a632b75fbe4cd892a0ba40ea8b8a87f..d870a11b60177f4385d189ac066833d859cbb6f7 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -21,15 +21,31 @@ THE SOFTWARE. """ +from typing import Sequence, Tuple, Union, Callable, Any, Optional, TYPE_CHECKING +from dataclasses import dataclass + import numpy as np +from immutables import Map + from pytools import memoize_method -from pytools.py_codegen import Indentation +from pytools.codegen import Indentation, CodeGenerator + +from loopy.types import LoopyType +from loopy.typing import ExpressionT +from loopy.kernel import LoopKernel +from loopy.kernel.data import ArrayArg +from loopy.translation_unit import TranslationUnit +from loopy.schedule.tools import KernelArgInfo from loopy.target.execution import ( - KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels) + KernelExecutorBase, ExecutionWrapperGeneratorBase) import logging logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import pyopencl as cl + + # {{{ invoker generation # /!\ This code runs in a namespace controlled by the user. @@ -90,18 +106,22 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ handle allocation of unspecified arguments - def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + def handle_alloc( + self, gen: CodeGenerator, arg: ArrayArg, + strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str], + skip_arg_checks: bool) -> None: """ Handle allocation of non-specified arguments for pyopencl execution """ from pymbolic import var - num_axes = len(arg.strides) + from loopy.kernel.array import get_strides + strides = get_strides(arg) + num_axes = len(strides) - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + itemsize = arg.dtype.numpy_dtype.itemsize for i in range(num_axes): - gen("_lpy_ustrides_%d = %s" % (i, strify( - arg.unvec_strides[i]))) + gen("_lpy_ustrides_%d = %s" % (i, strify(strides[i]))) if not skip_arg_checks: for i in range(num_axes): @@ -109,12 +129,11 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): "\"'%s' has negative stride in axis %d\"" % (i, arg.name, i)) + assert isinstance(arg.shape, tuple) sym_ustrides = tuple( var("_lpy_ustrides_%d" % i) for i in range(num_axes)) - sym_shape = tuple( - arg.unvec_shape[i] - for i in range(num_axes)) + sym_shape = tuple(arg.shape[i] for i in range(num_axes)) size_expr = (sum(astrd*(alen-1) for alen, astrd in zip(sym_shape, sym_ustrides)) @@ -123,7 +142,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("_lpy_size = %s" % strify(size_expr)) sym_strides = tuple(itemsize*s_i for s_i in sym_ustrides) - dtype_name = self.python_dtype_str(gen, kernel_arg.dtype.numpy_dtype) + dtype_name = self.python_dtype_str(gen, arg.dtype.numpy_dtype) gen(f"{arg.name} = _lpy_cl_array.Array(None, {strify(sym_shape)}, " f"{dtype_name}, strides={strify(sym_strides)}, " f"data=allocator({strify(itemsize * var('_lpy_size'))}), " @@ -147,6 +166,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") gen.add_to_preamble("from struct import 
pack as _lpy_pack") + from loopy.target.c.c_execution import DEF_EVEN_DIV_FUNCTION + gen.add_to_preamble(DEF_EVEN_DIV_FUNCTION) def initialize_system_args(self, gen): """ @@ -159,8 +180,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): + def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel, + kai: KernelArgInfo, host_program_name: str, args: Sequence[str]) -> None: if kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: @@ -168,47 +189,35 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): """) gen("") - from loopy.kernel.data import ArrayArg - for arg in implemented_data_info: - if issubclass(arg.arg_class, ArrayArg): + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + if isinstance(arg, ArrayArg): gen( "wait_for.extend({arg_name}.events)" .format(arg_name=arg.name)) gen("") - gen("_lpy_evt = {kernel_name}({args})" - .format( - kernel_name=kernel_name, - args=", ".join( - ["_lpy_cl_kernels", "queue"] - + args - + ["wait_for=wait_for", "allocator=allocator"]))) + arg_list = (["_lpy_cl_kernels", "queue"] + + list(args) + + ["wait_for=wait_for", "allocator=allocator"]) + gen(f"_lpy_evt = {host_program_name}({', '.join(arg_list)})") if kernel.options.cl_exec_manage_array_events: gen("") - from loopy.kernel.data import ArrayArg - for arg in implemented_data_info: - if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + if (isinstance(arg, ArrayArg) + and arg.name in kernel.get_written_variables()): gen(f"{arg.name}.add_event(_lpy_evt)") # }}} - # {{{ + # {{{ generate_output_handler - def generate_output_handler( - self, gen, options, kernel, implemented_data_info): - - from loopy.kernel.data import KernelArgument - - def is_output(idi): - from loopy.kernel.array import ArrayBase - if not issubclass(idi.arg_class, ArrayBase): - return False - - arg = kernel.impl_arg_to_arg[idi.name] - return arg.is_output + def generate_output_handler(self, gen: CodeGenerator, + kernel: LoopKernel, kai: KernelArgInfo) -> None: + options = kernel.options if not options.no_numpy: gen("if out_host is None and (_lpy_encountered_numpy " @@ -216,33 +225,28 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): with Indentation(gen): gen("out_host = True") - for idi in implemented_data_info: - if not issubclass(idi.arg_class, KernelArgument): - continue - - if is_output(idi): - np_name = "_lpy_%s_np_input" % idi.name + for arg_name in kai.passed_arg_names: + arg = kernel.arg_dict[arg_name] + if arg.is_output: + np_name = "_lpy_%s_np_input" % arg.name gen("if out_host or %s is not None:" % np_name) with Indentation(gen): gen("%s = %s.get(queue=queue, ary=%s)" - % (idi.name, idi.name, np_name)) + % (arg.name, arg.name, np_name)) gen("") if options.return_dict: gen("return _lpy_evt, {%s}" - % ", ".join(f'"{idi.name}": {idi.name}' - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) - if is_output(idi))) + % ", ".join(f'"{arg_name}": {arg_name}' + for arg_name in kai.passed_arg_names + if kernel.arg_dict[arg_name].is_output)) else: - out_idis = [idi - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) - if is_output(idi)] - if out_idis: + out_names = [arg_name for arg_name in kai.passed_arg_names + if 
kernel.arg_dict[arg_name].is_output] + if out_names: gen("return _lpy_evt, (%s,)" - % ", ".join(idi.name for idi in out_idis)) + % ", ".join(out_names)) else: gen("return _lpy_evt, ()") @@ -257,6 +261,17 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} +@dataclass(frozen=True) +class _KernelInfo: + t_unit: TranslationUnit + cl_kernels: "_Kernels" + invoker: Callable[..., Any] + + +class _Kernels: + pass + + # {{{ kernel executor class PyOpenCLKernelExecutor(KernelExecutorBase): @@ -267,52 +282,45 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, program, entrypoint): - """ - :arg context: a :class:`pyopencl.Context` - :arg kernel: may be a loopy.LoopKernel, a generator returning kernels - (a warning will be issued if more than one is returned). If the - kernel has not yet been loop-scheduled, that is done, too, with no - specific arguments. - """ - - super().__init__(program, entrypoint) + def __init__(self, context: "cl.Context", t_unit, entrypoint): + super().__init__(t_unit, entrypoint) self.context = context - def get_invoker_uncached(self, program, entrypoint, codegen_result): + def get_invoker_uncached(self, t_unit, entrypoint, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() - return generator(program, entrypoint, codegen_result) + return generator(t_unit, entrypoint, codegen_result) def get_wrapper_generator(self): return PyOpenCLExecutionWrapperGenerator() @memoize_method - def translation_unit_info(self, entrypoint, arg_to_dtype_set=frozenset(), - all_kwargs=None): - program = self.get_typed_and_scheduled_translation_unit( - entrypoint, arg_to_dtype_set) + def translation_unit_info( + self, entrypoint: str, + arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo: + t_unit = self.get_typed_and_scheduled_translation_unit( + entrypoint, arg_to_dtype) # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(program) + codegen_result = generate_code_v2(t_unit) dev_code = codegen_result.device_code() - if program[entrypoint].options.write_code: + if t_unit[entrypoint].options.write_code: #FIXME: redirect to "translation unit" level option as well. output = dev_code - if self.program[entrypoint].options.allow_terminal_colors: + if self.t_unit[entrypoint].options.allow_terminal_colors: output = get_highlighted_code(output) - if self.program[entrypoint].options.write_code is True: + if self.t_unit[entrypoint].options.write_code is True: print(output) else: - with open(self.program[entrypoint].options.write_code, "w") as outf: + with open(self.t_unit[entrypoint].options.write_code, "w") as outf: outf.write(output) - if program[entrypoint].options.edit_code: + if t_unit[entrypoint].options.edit_code: #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -322,18 +330,16 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): #FIXME: redirect to "translation unit" level option as well. 
cl_program = ( cl.Program(self.context, dev_code) - .build(options=program[entrypoint].options.build_options)) + .build(options=t_unit[entrypoint].options.build_options)) cl_kernels = _Kernels() for dp in cl_program.kernel_names.split(";"): setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( - program=program, + t_unit=t_unit, cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_infos[ - entrypoint], - invoker=self.get_invoker(program, entrypoint, codegen_result)) + invoker=self.get_invoker(t_unit, entrypoint, codegen_result)) def __call__(self, queue, *, allocator=None, wait_for=None, out_host=None, entrypoint=None, @@ -366,6 +372,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): assert entrypoint is not None + # FIXME: Remove entrypoint argument + assert entrypoint == self.entrypoint + if __debug__: self.check_for_required_array_arguments(kwargs.keys()) @@ -373,7 +382,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller(kwargs) translation_unit_info = self.translation_unit_info(entrypoint, - self.arg_to_dtype_set(kwargs)) + self.arg_to_dtype(kwargs)) return translation_unit_info.invoker( translation_unit_info.cl_kernels, queue, allocator, wait_for, diff --git a/loopy/tools.py b/loopy/tools.py index fe42918d8e5e3452f5ff11425dff17e499cd387b..5ae620bbab1582e1d90b1d29f4d0322b900a0600 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -21,7 +21,10 @@ THE SOFTWARE. """ import collections.abc as abc +from functools import cached_property +from immutables import Map +import islpy as isl import numpy as np from pytools import memoize_method, ProcessLogger from pytools.persistent_dict import KeyBuilder as KeyBuilderBase @@ -109,7 +112,13 @@ class LoopyKeyBuilder(KeyBuilderBase): getattr(prn, "print_"+key._base_name)(key) key_hash.update(prn.get_str().encode("utf8")) - update_for_Map = update_for_BasicSet # noqa + def update_for_Map(self, key_hash, key): # noqa + if isinstance(key, Map): + self.update_for_dict(key_hash, key) + elif isinstance(key, isl.Map): + self.update_for_BasicSet(key_hash, key) + else: + raise AssertionError() def update_for_pymbolic_expression(self, key_hash, key): if key is None: @@ -673,8 +682,7 @@ class _CallablesUnresolver(RuleAwareIdentityMapper): self.callables_table = callables_table self.target = target - @property - @memoize_method + @cached_property def known_callables(self): from loopy.kernel.function_interface import CallableKernel return (frozenset(self.target.get_device_ast_builder().known_callables) @@ -800,9 +808,9 @@ def _kernel_to_python(kernel, is_entrypoint=False, var_name="kernel"): % endif ) - % for iname, tags in kernel.iname_to_tags.items(): - % for tag in tags: - ${var_name} = lp.tag_inames(${var_name}, "${"%s:%s" %(iname, tag)}") + % for iname in kernel.inames.values(): + % for tag in iname.tags: + ${var_name} = lp.tag_inames(${var_name}, "${"%s:%s" %(iname.name, tag)}") % endfor % endfor @@ -856,8 +864,12 @@ def t_unit_to_python(t_unit, var_name="t_unit", knl_args = ", ".join(f"{name}_knl" for name in t_unit.callables_table) merge_stmt = f"{var_name} = lp.merge([{knl_args}])" - preamble_str = "\n".join(["import loopy as lp", "import numpy as np", - "from pymbolic.primitives import *"]) + preamble_str = "\n".join([ + "import loopy as lp", + "import numpy as np", + "from pymbolic.primitives import *", + "import immutables", + ]) body_str = "\n".join(knl_python_code_srcs + ["\n", merge_stmt]) python_code = "\n".join([preamble_str, "\n", body_str]) diff --git 
a/loopy/transform/data.py b/loopy/transform/data.py
index a137e82c9dc34c5007768b0f6f3d0772c39c15b8..39b706e8313c5d1362c32245c725aedd77999b8b 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -20,14 +20,24 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-from loopy.diagnostic import LoopyError
-from islpy import dim_type
+from warnings import warn
+
+from dataclasses import dataclass, replace
+
+from typing import Optional, Tuple, Dict
 
-from loopy.kernel.data import ImageArg
+import numpy as np
+
+from islpy import dim_type
 from pytools import MovedFunctionDeprecationWrapper
-from loopy.translation_unit import (TranslationUnit,
-                                    for_each_kernel)
+
+from loopy.diagnostic import LoopyError
+from loopy.kernel.data import ImageArg, auto, TemporaryVariable
+
+from loopy.types import LoopyType
+from loopy.typing import ExpressionT
+from loopy.translation_unit import TranslationUnit, for_each_kernel
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
@@ -578,7 +588,12 @@ def alias_temporaries(kernel, names, base_name_prefix=None,
     if base_name_prefix is None:
         base_name_prefix = "temp_storage"
 
-    vng = kernel.get_var_name_generator()
+    from pytools import UniqueNameGenerator
+    vng = UniqueNameGenerator(
+            kernel.all_variable_names()
+            | {tv.base_storage
+                for tv in kernel.temporary_variables.values()
+                if tv.base_storage is not None})
 
     base_name = vng(base_name_prefix)
     names_set = set(names)
@@ -623,7 +638,9 @@ def alias_temporaries(kernel, names, base_name_prefix=None,
                         .format(tv=tv.name))
 
             new_temporary_variables[tv.name] = \
-                    tv.copy(base_storage=base_name)
+                    tv.copy(
+                        base_storage=base_name,
+                        _base_storage_access_may_be_aliasing=False)
         else:
             new_temporary_variables[tv.name] = tv
 
@@ -952,4 +969,126 @@ def add_padding_to_avoid_bank_conflicts(kernel, device):
 # }}}
 
+# {{{ allocate_temporaries_for_base_storage
+
+@dataclass(frozen=True)
+class _BaseStorageInfo:
+    name: str
+    next_offset: ExpressionT
+    approx_nbytes: Optional[int] = None
+
+
+def _sym_max(a: ExpressionT, b: ExpressionT) -> ExpressionT:
+    from numbers import Number
+    if isinstance(a, Number) and isinstance(b, Number):
+        # https://github.com/python/mypy/issues/3186
+        return max(a, b)  # type: ignore[call-overload]
+    else:
+        from pymbolic.primitives import Max
+        return Max((a, b))
+
+
+@for_each_kernel
+def allocate_temporaries_for_base_storage(kernel: LoopKernel,
+        only_address_space: Optional[int] = None,
+        aliased=True,
+        max_nbytes: Optional[int] = None,
+        _implicitly_run=False,
+        ) -> LoopKernel:
+    from pytools import product
+
+    new_tvs = dict(kernel.temporary_variables)
+    made_changes = False
+
+    vng = kernel.get_var_name_generator()
+
+    name_aspace_dtype_to_bsi: Dict[Tuple[str, int, LoopyType], _BaseStorageInfo] = {}
+
+    for tv in sorted(
+            kernel.temporary_variables.values(),
+            key=lambda key_tv: key_tv.name):
+        if tv.base_storage and tv.initializer:
+            raise LoopyError(
+                    f"Temporary '{tv.name}' has both base_storage "
+                    "and an initializer. That's not allowed.")
+        if tv.offset and not tv.base_storage:
+            raise LoopyError(
+                    f"Temporary '{tv.name}' has an offset and no base_storage. "
+                    "That's not allowed.")
+
+        if (tv.base_storage
+                and tv.base_storage not in kernel.temporary_variables
+                and (
+                    only_address_space is None
+                    or tv.address_space == only_address_space)):
+            made_changes = True
+
+            if tv.address_space is auto:
+                raise LoopyError("When allocating base storage for temporary "
+                        f"'{tv.name}', the address space of the temporary "
+                        "was not yet determined (set to 'auto').")
+
+            assert isinstance(tv.shape, tuple)
+            ary_size = product(si for si in tv.shape)
+            if isinstance(ary_size, (int, np.integer)):
+                approx_array_nbytes = tv.dtype.numpy_dtype.itemsize * ary_size
+            else:
+                # FIXME: Could use approximate values of ValueArgs
+                approx_array_nbytes = 0
+
+            bs_key = (tv.base_storage, tv.address_space, tv.dtype)
+            bsi = name_aspace_dtype_to_bsi.get(bs_key)
+
+            if bsi is None or (
+                    # are we out of space?
+                    not aliased
+                    and max_nbytes is not None
+                    and bsi.approx_nbytes is not None
+                    and bsi.approx_nbytes + approx_array_nbytes > max_nbytes):
+                bsi = name_aspace_dtype_to_bsi[bs_key] = _BaseStorageInfo(
+                        name=vng(tv.base_storage),
+                        next_offset=0,
+                        approx_nbytes=None if aliased else 0)
+
+                new_tvs[bsi.name] = TemporaryVariable(
+                        name=bsi.name,
+                        dtype=tv.dtype,
+                        shape=(0,),
+                        address_space=tv.address_space)
+
+            new_tvs[tv.name] = tv.copy(
+                    base_storage=bsi.name,
+                    offset=bsi.next_offset,
+                    _base_storage_access_may_be_aliasing=(
+                        aliased if tv._base_storage_access_may_be_aliasing is None
+                        else tv._base_storage_access_may_be_aliasing))
+
+            bs_tv = new_tvs[bsi.name]
+            assert isinstance(bs_tv.shape, tuple)
+            bs_size, = bs_tv.shape
+            if aliased:
+                new_bs_size = _sym_max(bs_size, ary_size)
+            else:
+                new_bs_size = bs_size + ary_size
+
+                assert bsi.approx_nbytes is not None
+                name_aspace_dtype_to_bsi[bs_key] = replace(bsi,
+                        next_offset=bsi.next_offset + ary_size,
+                        approx_nbytes=bsi.approx_nbytes + approx_array_nbytes)
+
+            new_tvs[bsi.name] = new_tvs[bsi.name].copy(shape=(new_bs_size,))
+
+    if made_changes:
+        if _implicitly_run:
+            warn("Base storage allocation was performed implicitly during "
+                    "preprocessing. This is deprecated and will stop working "
+                    "in 2023. Call loopy.allocate_temporaries_for_base_storage "
+                    "explicitly to avoid this warning.", DeprecationWarning)
+
+        return kernel.copy(temporary_variables=new_tvs)
+    else:
+        return kernel
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 6e28d9e7b969372a714af78a3b772f0052347e39..2fd39cfc257f2d0a52cf9b70ec89af6200c96c81 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -23,6 +23,7 @@ THE SOFTWARE.
import islpy as isl from islpy import dim_type +from immutables import Map from loopy.diagnostic import LoopyError from pymbolic import var @@ -103,7 +104,7 @@ def _ordered_merge_lists(list_a, list_b): def _merge_dicts(item_name, dict_a, dict_b): - result = dict_a.copy() + result = dict(dict_a) for k, v in dict_b.items(): if k in result: @@ -114,7 +115,10 @@ def _merge_dicts(item_name, dict_a, dict_b): else: result[k] = v - return result + if isinstance(dict_a, Map): + return Map(result) + else: + return result def _merge_values(item_name, val_a, val_b): @@ -242,8 +246,6 @@ def _fuse_two_kernels(kernela, kernelb): preamble_generators=_ordered_merge_lists( kernela.preamble_generators, kernelb.preamble_generators), assumptions=new_assumptions, - local_sizes=_merge_dicts( - "local size", kernela.local_sizes, kernelb.local_sizes), temporary_variables=new_temporaries, inames=_merge_dicts( "inames", @@ -276,7 +278,8 @@ def _fuse_two_kernels(kernela, kernelb): "target", kernela.target, kernelb.target), - options=kernela.options), old_b_id_to_new_b_id + options=kernela.options, + tags=kernela.tags | kernelb.tags), old_b_id_to_new_b_id # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 9c40b6faf7a5efeecd197848793d7451140eb070..b3ccc647e8ec1dd380174d2a36a02ef2f2498163 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -245,7 +245,7 @@ def _split_iname_backend(kernel, iname_to_split, raise ValueError( f"cannot split loop for unknown variable '{iname_to_split}'") - applied_iname_rewrites = kernel.applied_iname_rewrites[:] + applied_iname_rewrites = list(kernel.applied_iname_rewrites) vng = kernel.get_var_name_generator() @@ -284,8 +284,7 @@ def _split_iname_backend(kernel, iname_to_split, # }}} - iname_slab_increments = kernel.iname_slab_increments.copy() - iname_slab_increments[outer_iname] = slabs + iname_slab_increments = kernel.iname_slab_increments.set(outer_iname, slabs) new_priorities = [] for prio in kernel.loop_priority: @@ -626,7 +625,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): .copy( instructions=new_insns, domains=domch.get_domains_with(new_domain), - applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict] + applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,) )) from loopy.match import parse_stack_match @@ -1396,7 +1395,7 @@ def affine_map_inames(kernel, old_inames, new_inames, equations): rule_mapping_context.finish_kernel( old_to_new.map_kernel(kernel)) .copy( - applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict] + applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,) )) # }}} @@ -1987,7 +1986,7 @@ def map_domain(kernel, transform_map): substitutions = {} var_substitutions = {} - applied_iname_rewrites = kernel.applied_iname_rewrites[:] + applied_iname_rewrites = kernel.applied_iname_rewrites from loopy.symbolic import aff_to_expr from pymbolic import var @@ -1997,7 +1996,7 @@ def map_domain(kernel, transform_map): substitutions[iname] = subst_from_map var_substitutions[var(iname)] = subst_from_map - applied_iname_rewrites.append(var_substitutions) + applied_iname_rewrites = applied_iname_rewrites + (var_substitutions,) del var_substitutions # }}} diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 2f8e3abe80dfdb6b098835ef0d95b5034e52450f..58f596dc6f0af85bce8406074e7837618613cc71 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -725,6 +725,7 @@ def 
_hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): TemporaryVariable( name=new_assignee_name, dtype=None, + shape=(), address_space=AddressSpace.PRIVATE)) from pymbolic import var diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 3ced06bfc9fb0afb660b9a37a571a10fc203387a..6bf1c1543d47c2db4805c1403a617ebe9c255a85 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -20,12 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from functools import cached_property + +from immutables import Map from loopy.diagnostic import LoopyError import loopy as lp from loopy.kernel.data import auto, AddressSpace from pytools import memoize_method, Record +from loopy.kernel.data import Iname from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) @@ -242,8 +246,7 @@ class TemporarySaver: from collections import defaultdict self.insns_to_insert = [] self.insns_to_update = {} - self.extra_args_to_add = {} - self.updated_iname_to_tags = {} + self.updated_iname_objs = Map() self.updated_temporary_variables = {} # temporary name -> save or reload insn ids @@ -295,8 +298,7 @@ class TemporarySaver: return frozenset(accessing_insns_in_subkernel) - @property - @memoize_method + @cached_property def base_storage_to_temporary_map(self): from collections import defaultdict @@ -309,8 +311,7 @@ class TemporarySaver: return result - @property - @memoize_method + @cached_property def subkernel_to_slice_indices(self): result = {} @@ -322,8 +323,7 @@ class TemporarySaver: return result - @property - @memoize_method + @cached_property def subkernel_to_surrounding_inames(self): current_outer_inames = set() within_subkernel = False @@ -502,7 +502,7 @@ class TemporarySaver: if promoted_temporary is None: return - new_subdomain, hw_inames, dim_inames, iname_to_tags = ( + new_subdomain, hw_inames, dim_inames, iname_objs = ( self.augment_domain_for_save_or_reload( self.new_subdomain, promoted_temporary, mode, subkernel)) @@ -580,7 +580,7 @@ class TemporarySaver: self.updated_temporary_variables[promoted_temporary.name] = ( promoted_temporary.as_kernel_temporary(self.kernel)) - self.updated_iname_to_tags.update(iname_to_tags) + self.updated_iname_objs = self.updated_iname_objs.update(iname_objs) @memoize_method def finish(self): @@ -596,7 +596,7 @@ class TemporarySaver: new_instructions.extend( sorted(insns_to_insert.values(), key=lambda insn: insn.id)) - self.updated_iname_to_tags.update(self.kernel.iname_to_tags) + self.updated_iname_objs = self.updated_iname_objs.update(self.kernel.inames) self.updated_temporary_variables.update(self.kernel.temporary_variables) new_domains = list(self.kernel.domains) @@ -607,7 +607,7 @@ class TemporarySaver: kernel = self.kernel.copy( domains=new_domains, instructions=new_instructions, - iname_to_tags=self.updated_iname_to_tags, + inames=self.updated_iname_objs, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) @@ -649,7 +649,7 @@ class TemporarySaver: orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames - iname_to_tags = {} + iname_objs = {} from loopy.symbolic import aff_from_expr @@ -674,7 +674,8 @@ class TemporarySaver: # If the temporary has local scope, then loads / stores can # be done in parallel. 
from loopy.kernel.data import AutoFitLocalInameTag - iname_to_tags[new_iname] = frozenset([AutoFitLocalInameTag()]) + iname_objs[new_iname] = Iname( + new_iname, tags=frozenset([AutoFitLocalInameTag()])) dim_inames.append(new_iname) @@ -704,7 +705,8 @@ class TemporarySaver: & aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) - self.updated_iname_to_tags[new_iname] = frozenset([hw_tag]) + self.updated_iname_objs = self.updated_iname_objs.set(new_iname, + Iname(name=new_iname, tags=frozenset([hw_tag]))) hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the @@ -712,7 +714,7 @@ class TemporarySaver: domain_list = domain.get_basic_set_list() assert domain_list.n_basic_set() == 1 domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames, iname_to_tags + return domain, hw_inames, dim_inames, iname_objs # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 8568f350875d428bf486f693cc8ddfba97bb9f14..13c99de1d88281499a19acf953d31f7a61e494f7 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -337,7 +337,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, def_id = set() for dep_id in insn.depends_on: dep_insn = id_to_insn[dep_id] - if lhs_name in dep_insn.write_dependency_names(): + if lhs_name in dep_insn.assignee_var_names(): if lhs_name in dep_insn.read_dependency_names(): raise LoopyError("instruction '%s' both reads *and* " "writes '%s'--cannot transcribe to substitution " @@ -378,7 +378,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, definition_insn_ids = set() for insn in kernel.instructions: - if lhs_name in insn.write_dependency_names(): + if lhs_name in insn.assignee_var_names(): definition_insn_ids.add(insn.id) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 96e436966385eb64c501f007cf74fbc1f60abc8d..8e9ad5bd5af0b9cf08b2e05c53f97c380d532f7a 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -855,6 +855,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): from loopy.kernel.data import TemporaryVariable, KernelArgument old_calls_to_new_calls = {} + touched_variable_names = set() for var_chain in sccs: changed_during_last_queue_run = False @@ -896,6 +897,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): if new_dtype != item.dtype: debug(" changed from: %s", item.dtype) changed_during_last_queue_run = True + touched_variable_names.add(name) if isinstance(item, TemporaryVariable): new_temp_vars[name] = item.copy(dtype=new_dtype) @@ -1001,6 +1003,18 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) + if kernel._separation_info(): + sep_names = set(kernel._separation_info()) | { + subarray_name + for sep_info in kernel._separation_info().values() + for subarray_name in sep_info.subarray_names.values()} + + touched_sep_names = sep_names & touched_variable_names + if touched_sep_names: + raise LoopyError("Type inference must not touch variables subject to " + "separation after separation has been performed. 
" + "Untyped separation-related variables: " + f"{', '.join(touched_sep_names)}") + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], diff --git a/loopy/types.py b/loopy/types.py index 14bf89fa197bf1aacbf1488ca53b8cfeebe9a174..57b9548bbaba3e36ec0af9ec5739ce93633a2992 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -1,3 +1,5 @@ +from __future__ import annotations + __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" __license__ = """ @@ -20,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from typing import Any +from typing import Any, Mapping from warnings import warn import numpy as np @@ -247,11 +249,11 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, "found '%s' instead" % type(dtype)) -_TO_UNSIGNED_MAPPING = { - np.int8: np.uint8, - np.int16: np.uint16, - np.int32: np.uint32, - np.int64: np.uint64, +_TO_UNSIGNED_MAPPING: Mapping[np.dtype[Any], np.dtype[Any]] = { + np.dtype(np.int8): np.dtype(np.uint8), + np.dtype(np.int16): np.dtype(np.uint16), + np.dtype(np.int32): np.dtype(np.uint32), + np.dtype(np.int64): np.dtype(np.uint64), } @@ -261,7 +263,7 @@ def to_unsigned_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]": if dtype.kind != "i": raise ValueError("can only convert integer types to unsigned") - return _TO_UNSIGNED_MAPPING[dtype.type] + return _TO_UNSIGNED_MAPPING[dtype] # vim: foldmethod=marker diff --git a/loopy/typing.py b/loopy/typing.py new file mode 100644 index 0000000000000000000000000000000000000000..d6714d87048fb0419f14641993e4929da709b6fc --- /dev/null +++ b/loopy/typing.py @@ -0,0 +1,38 @@ +__copyright__ = "Copyright (C) 2022 University of Illinois Board of Trustees" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from typing import Union, Tuple + +import numpy as np + +from pymbolic.primitives import Expression + +IntegralT = Union[int, np.int8, np.int16, np.int32, np.int64, np.uint8, + np.uint16, np.uint32, np.uint64] +FloatT = Union[float, complex, np.float32, np.float64, np.complex64, + np.complex128] + + +ExpressionT = Union[IntegralT, FloatT, Expression] +ShapeType = Tuple[ExpressionT, ...] 
+StridesType = ShapeType diff --git a/run-mypy.sh b/run-mypy.sh new file mode 100755 index 0000000000000000000000000000000000000000..46fe61defd71d26e046b4b61d2f0a8a2c91bc532 --- /dev/null +++ b/run-mypy.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python -m mypy --show-error-codes loopy # examples test diff --git a/setup.cfg b/setup.cfg index 0a2df03be2613b4945fdfbb42b63f09522c94fef..3b0411bfc0704c78515a67eb2af7f5433a8a11db 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,3 +13,48 @@ multiline-quotes = """ [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS + +[mypy] +python_version = 3.8 +warn_unused_ignores = True + +exclude = (?x)( + loopy/target/c/compyte/ndarray/.* + | loopy/target/c/compyte/array\.py + ) + +[mypy-islpy.*] +ignore_missing_imports = True + +[mypy-pymbolic.*] +ignore_missing_imports = True + +[mypy-cgen.*] +ignore_missing_imports = True + +[mypy-genpy.*] +ignore_missing_imports = True + +[mypy-pyopencl.*] +ignore_missing_imports = True + +[mypy-pygments.*] +ignore_missing_imports = True + +[mypy-colorama.*] +ignore_missing_imports = True + +[mypy-codepy.*] +ignore_missing_imports = True + +[mypy-mako.*] +ignore_missing_imports = True + +[mypy-fparser.*] +ignore_missing_imports = True + +[mypy-ply.*] +ignore_missing_imports = True + +[mypy-IPython.*] +ignore_missing_imports = True diff --git a/setup.py b/setup.py index 011db206a4e0918030a1f75c2c094c5cc5ea4857..f265326e7534860e80ef1e91f63ea181a75de423 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ setup(name="loopy", "Topic :: Utilities", ], - python_requires="~=3.6", + python_requires="~=3.8", install_requires=[ "pytools>=2022.1.2", "pymbolic>=2022.1", diff --git a/test/test_callables.py b/test/test_callables.py index 8ac29f39ad5f99a42094c9ff7cf44ba373e3d66e..1a1b37f2c70f635cc355d41f6d366a1f554e5b6a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -1342,6 +1342,38 @@ def test_c_instruction_in_callee(ctx_factory, inline): assert out.get() == (n-1) +def test_global_temp_var_with_base_storage(ctx_factory): + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i, i2] : 0<=i,i2<3}", + """ + a[i] = 5 + b[i] = a[i] + 1 + ... gbarrier + c[i2] = b[i2] + 2 + d[i2] = c[i2] + 3 + """, [ + lp.TemporaryVariable("a", dtype=np.int32, shape=(3,), + address_space=lp.AddressSpace.GLOBAL, base_storage="bs"), + lp.TemporaryVariable("b", dtype=np.int32, shape=(3,), + address_space=lp.AddressSpace.GLOBAL, base_storage="bs"), + lp.TemporaryVariable("c", dtype=np.int32, shape=(3,), + address_space=lp.AddressSpace.GLOBAL, base_storage="bs"), + ... 
+ ], + seq_dependencies=True) + + knl = lp.allocate_temporaries_for_base_storage(knl, aliased=False) + + cl_prg = cl.Program(ctx, lp.generate_code_v2(knl).device_code()).build() + assert [knl.num_args for knl in cl_prg.all_kernels()] == [1, 2] + + _evt, (d,) = knl(cq) + assert (d.get() == 5 + 1 + 2 + 3).all() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_expression.py b/test/test_expression.py index 9b4e1fbc5deda2f1313b816564061a78e43de8f3..451594dc0192832ee7d66d30d9fce0e914026a29 100644 --- a/test/test_expression.py +++ b/test/test_expression.py @@ -482,7 +482,8 @@ def test_divide_precedence(ctx_factory): x[0] = c*(a/b) y[0] = c*(a%b) """, - [lp.ValueArg("a, b, c", np.int32), lp.GlobalArg("x, y", np.int32)]) + [lp.ValueArg("a, b, c", np.int32), + lp.GlobalArg("x, y", np.int32, shape=lp.auto)]) print(lp.generate_code_v2(knl).device_code()) evt, (x_out, y_out) = knl(queue, c=2, b=2, a=5) diff --git a/test/test_loopy.py b/test/test_loopy.py index 4b1146667c0d889c95ec956d59e8609f69223644..80a59981dda41c1005038923b578a74dac901fb9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -24,6 +24,7 @@ import sys import numpy as np import loopy as lp import pyopencl as cl +import pyopencl.array # noqa import pyopencl.clmath # noqa import pyopencl.clrandom # noqa import pytest @@ -203,23 +204,6 @@ def test_owed_barriers(): print(lp.generate_code_v2(knl)) -def test_wg_too_small(): - knl = lp.make_kernel( - "{[i]: 0<=i<100}", - [ - "<float32> z[i] = a[i] {id=copy}" - ], - [lp.GlobalArg("a", np.float32, shape=(100,))], - target=lp.PyOpenCLTarget(), - local_sizes={0: 16}) - - knl = lp.tag_inames(knl, dict(i="l.0")) - - print(knl) - with pytest.raises(RuntimeError): - print(lp.generate_code_v2(knl)) - - def test_multi_cse(): knl = lp.make_kernel( "{[i]: 0<=i<100}", @@ -227,8 +211,7 @@ def test_multi_cse(): "<float32> z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], - target=lp.PyOpenCLTarget(), - local_sizes={0: 16}) + target=lp.PyOpenCLTarget()) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) @@ -766,6 +749,23 @@ def test_make_copy_kernel(ctx_factory): assert (a1 == a3).all() +def test_make_copy_kernel_with_offsets(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + a1 = np.random.randn(3, 1024, 4) + a1_dev = cl.array.to_device(queue, a1) + + cknl1 = lp.make_copy_kernel("c,c,c", "sep,c,c") + + cknl1 = lp.fix_parameters(cknl1, n0=3) + + cknl1 = lp.set_options(cknl1, write_code=True) + evt, (a2_dev,) = cknl1(queue, input=a1_dev) + + assert (a1 == a2_dev.get()).all() + + def test_auto_test_can_detect_problems(ctx_factory): ctx = ctx_factory() @@ -1400,7 +1400,7 @@ def test_global_temporary(ctx_factory): assert len(cgr.device_programs) == 2 - #print(cgr.device_code()) + print(cgr.device_code()) #print(cgr.host_code()) lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) @@ -1841,7 +1841,7 @@ def test_header_extract(): cuknl = knl.copy(target=lp.CudaTarget()) assert str(lp.generate_header(cuknl)[0]) == ( 'extern "C" __global__ void __launch_bounds__(1) ' - "loopy_kernel(float *__restrict__ T);") + "loopy_kernel(__global__ float *__restrict__ T);") #test OpenCL oclknl = knl.copy(target=lp.PyOpenCLTarget()) @@ -1859,8 +1859,12 @@ def test_scalars_with_base_storage(ctx_factory): knl = lp.make_kernel( [isl.BasicSet("[] -> {[]: }")], # empty (domain w/unused inames errors) "a = 1", - [lp.TemporaryVariable("a", dtype=np.float64, - shape=(), base_storage="base")]) + [ + 
lp.TemporaryVariable("a", dtype=np.float64, + shape=(), base_storage="base"), + lp.TemporaryVariable("b", dtype=np.float64, + shape=(), base_storage="base"), + ]) knl(queue, out_host=True) @@ -2392,20 +2396,6 @@ def test_inames_conditional_generation(ctx_factory): knl(queue) -def test_kernel_var_name_generator(): - prog = lp.make_kernel( - "{[i]: 0 <= i <= 10}", - """ - <>a = 0 - <>b_s0 = 0 - """) - - vng = prog["loopy_kernel"].get_var_name_generator() - - assert vng("a_s0") != "a_s0" - assert vng("b") != "b" - - def test_fixed_parameters(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/test_misc.py b/test/test_misc.py index 0e8a528ecc0784e4280ca8e4e0d3f417b3db145c..7feb5ae73b5618b0c3d7219db1252fe9df1f7daa 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from pickle import loads, dumps + import pytest import loopy as lp @@ -32,6 +34,16 @@ logger = logging.getLogger(__name__) from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa +def test_kernel_pickling_and_hashing(): + knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + y[i] = i + """) + from loopy.tools import LoopyKeyBuilder + reconst_knl = loads(dumps(knl)) + assert LoopyKeyBuilder()(knl) == LoopyKeyBuilder()(reconst_knl) + + def test_SetTrie(): from loopy.kernel.tools import SetTrie @@ -79,8 +91,6 @@ def test_LazilyUnpicklingDict(): assert not cls.instance_unpickled - from pickle import loads, dumps - pickled_mapping = dumps(mapping) # {{{ test lazy loading @@ -133,7 +143,6 @@ def test_LazilyUnpicklingList(): lst = LazilyUnpicklingList([cls()]) assert not cls.instance_unpickled - from pickle import loads, dumps pickled_lst = dumps(lst) # {{{ test lazy loading @@ -188,7 +197,6 @@ def test_LazilyUnpicklingListWithEqAndPersistentHashing(): from loopy.tools import LazilyUnpicklingListWithEqAndPersistentHashing cls = PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing - from pickle import loads, dumps # {{{ test comparison of a pair of lazy lists diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 3d54b231145db0fed43113a2b22d265d77148f36..6d97104a9542340db8c2c227105ef7175e55547c 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -51,6 +51,11 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa pytest.importorskip("fparser") ctx = ctx_factory() + if (ctx.devices[0].platform.name == "Portable Computing Language" + and ilp_multiple > 1): + # about 400s, cf. https://gitlab.tiker.net/inducer/loopy/-/jobs/421250#L937 + pytest.skip("takes a very long time to compile on pocl") + filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90") with open(filename) as sourcef: source = sourcef.read() diff --git a/test/test_target.py b/test/test_target.py index 6a0c26fd9b9ce769366e112441b7549fce150b8c..b6acd0902e75b7480a6891b9320df5248cce882b 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -54,7 +54,7 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -def test_ispc_target(occa_mode=False): +def test_ispc_target(): from loopy.target.ispc import ISPCTarget knl = lp.make_kernel( @@ -64,7 +64,7 @@ def test_ispc_target(occa_mode=False): lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." 
], - target=ISPCTarget(occa_mode=occa_mode)) + target=ISPCTarget()) knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") @@ -254,39 +254,6 @@ def test_clamp(ctx_factory): evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) -def test_numba_target(): - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j<M and 0<=k<N}", - "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))", - target=lp.NumbaTarget()) - - knl = lp.add_and_infer_dtypes(knl, {"X": np.float32}) - - print(lp.generate_code_v2(knl).device_code()) - - -def test_numba_cuda_target(): - knl = lp.make_kernel( - "{[i,j,k]: 0<=i,j<M and 0<=k<N}", - "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))", - target=lp.NumbaCudaTarget()) - - knl = lp.assume(knl, "M>0") - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") - knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1)) - knl = lp.add_prefetch(knl, "X[i,:]", - fetch_outer_inames="i_inner, i_outer, j_inner", - default_tag="l.auto") - knl = lp.fix_parameters(knl, N=3) - knl = lp.prioritize_loops(knl, "i_inner,j_outer") - knl = lp.tag_inames(knl, "k:unr") - knl = lp.tag_array_axes(knl, "X", "N0,N1") - - knl = lp.add_and_infer_dtypes(knl, {"X": np.float32}) - - print(lp.generate_code_v2(knl).all_code()) - - def test_sized_integer_c_codegen(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -557,7 +524,7 @@ def test_input_args_are_required(ctx_factory): """ g[i] = f[i] + 1.5 """, - [lp.GlobalArg("f, g", dtype="float64"), ...] + [lp.GlobalArg("f, g", shape=lp.auto, dtype="float64"), ...] ) knl2 = lp.make_kernel( @@ -582,7 +549,7 @@ def test_input_args_are_required(ctx_factory): f[i] = 3. g[i] = f[i] + 1.5 """, - [lp.GlobalArg("f, g", dtype="float64"), ...] + [lp.GlobalArg("f, g", shape=lp.auto, dtype="float64"), ...] ) # FIXME: this should not raise!