diff --git a/.github/workflows/autopush.yml b/.github/workflows/autopush.yml index f89b08ac535f4b5c2cd6e7a535b7c7da6fdad816..f7e2297314fc619358c5766d3fc4a47836d5c55d 100644 --- a/.github/workflows/autopush.yml +++ b/.github/workflows/autopush.yml @@ -3,6 +3,7 @@ on: push: branches: - main + - kernel_callables_v3-edit2 jobs: autopush: @@ -14,7 +15,9 @@ jobs: mkdir ~/.ssh && echo -e "Host gitlab.tiker.net\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config eval $(ssh-agent) && echo "$GITLAB_AUTOPUSH_KEY" | ssh-add - git fetch --unshallow - git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" main + TGT_BRANCH="${GITHUB_REF#refs/heads/}" + echo "pushing to $TGT_BRANCH..." + git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" "$TGT_BRANCH" env: GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e932ef2f55adcecd1eee76b51151fae037c9988..68f9a62c77c6b85b86b9104625bd151a4901a575 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,21 +146,31 @@ jobs: env: DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }} run: | - git clone "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" + curl -L -O https://tiker.net/ci-support-v0 + . ./ci-support-v0 + + # Use "special grudge" for kcv3 and branches targeting it. 
+ if [[ "$DOWNSTREAM_PROJECT" = "grudge" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then + with_echo git clone "https://github.com/kaushikcfd/$DOWNSTREAM_PROJECT.git" -b "kcv3-e2-compat" + else + with_echo git clone "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" + fi cd "$DOWNSTREAM_PROJECT" echo "*** $DOWNSTREAM_PROJECT version: $(git rev-parse --short HEAD)" + if [[ "$DOWNSTREAM_PROJECT" = "pytential" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then + sed -i "/egg=sumpy/ c git+https://github.com/inducer/sumpy.git@loopy-callables#egg=sumpy" requirements.txt + fi + + transfer_requirements_git_urls ../requirements.txt ./requirements.txt sed -i "/egg=loopy/ c git+file://$(readlink -f ..)#egg=loopy" requirements.txt + sed -i "/mpi4py/ d" requirements.txt export CONDA_ENVIRONMENT=.test-conda-env-py3.yml # Avoid slow or complicated tests in downstream projects export PYTEST_ADDOPTS="-k 'not (slowtest or octave or mpi)'" - sed -i "/mpi4py/ d" requirements.txt - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh - . ./ci-support.sh build_py_project_in_conda_env test_py_project diff --git a/asv.conf.json b/asv.conf.json index 99c2ea2b5941721a045d8aa7a0586d7d5f9e1eb6..3988c0fafe34ebaed5fd0fdad45fa9ec37ee8fdd 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -40,7 +40,7 @@ //"install_timeout": 600, // the base URL to show a commit for the project. - "show_commit_url": "http://gitlab.tiker.net/inducer/loopy/commits/", + "show_commit_url": "http://github.com/inducer/loopy/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. 
diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py index dadff4b2c152ec3687ce5acbbeb921ee24fbf573..72c61a539056634d1626e3bcd08cb1c047f8d4f6 100644 --- a/benchmarks/run_sumpy_kernels.py +++ b/benchmarks/run_sumpy_kernels.py @@ -32,7 +32,7 @@ def _sumpy_kernel_init(param): m_expn = mpole_expn_class(knl, order=order) l_expn = local_expn_class(knl, order=order) - m2l = E2EFromCSR(ctx, m_expn, l_expn) + m2l = E2EFromCSR(ctx, m_expn, l_expn, name="loopy_kernel") m2l.get_translation_loopy_insns() m2l.ctx = None m2l.device = None @@ -78,7 +78,8 @@ def cached_data(params): knl = _sumpy_kernel_make(expn, param) knl = lp.preprocess_kernel(knl) data[param]["instantiated"] = knl - scheduled = lp.get_one_scheduled_kernel(knl) + scheduled = knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) data[param]["scheduled"] = scheduled return data @@ -102,7 +103,9 @@ class SumpyBenchmarkSuite: lp.preprocess_kernel(knl) def time_schedule(self, data, param): - lp.get_one_scheduled_kernel(data[param]["instantiated"]) + knl = data[param]["instantiated"] + knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) def time_generate_code(self, data, param): lp.generate_code_v2(data[param]["scheduled"]) diff --git a/doc/conf.py b/doc/conf.py index ad02864cb11c69b44c20c126078e59602e4407e1..f0211ff62fa70d6a1d00838ec41e93a2f75ee239 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -104,11 +104,13 @@ man_pages = [ intersphinx_mapping = { "https://docs.python.org/3": None, "https://numpy.org/doc/stable/": None, + "https://documen.tician.de/pytools": None, "https://documen.tician.de/islpy": None, "https://documen.tician.de/pyopencl": None, "https://documen.tician.de/cgen": None, "https://documen.tician.de/pymbolic": None, "https://documen.tician.de/pytools": None, + "https://pyrsistent.readthedocs.io/en/latest/": None, } autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 
7baff3249a25e69019c06802901538500c1af971..3bc0361c53796c8839e61c905bc2113a56dfbc55 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -42,7 +42,9 @@ Please check :ref:`installation` to get started. tutorial ref_creation ref_kernel + ref_translation_unit ref_transform + ref_call ref_other misc ref_internals diff --git a/doc/misc.rst b/doc/misc.rst index da61e3051968ab6e064e12c484d5e2d8a48d2ba8..cd3d205018b455c23b8a6bf56caeff05230c6339 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -332,7 +332,7 @@ This list is always growing, but here are a few pointers: * Interface with your own library functions - Use :func:`loopy.register_function_manglers`. + See :ref:`func-interface` for details. * Loop collapse diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 0000000000000000000000000000000000000000..2a48ecca0688710198def0c455dafd08d8a4ab8e --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,66 @@ +.. currentmodule:: loopy + + +.. _func-interface: + +Function Interface +================== + + +Resolving and specialization +---------------------------- + +In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables +and entrypoints. Callables are of type +:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life +as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function +identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects. +Each resolved function has an entry in :attr:`TranslationUnit.callables_table`. +The process of realizing a function as a +:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as +resolving. + +During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable +is *specialized* depending on the types and shapes of the arguments passed at a +call site.
For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to +begin with, but it is later specialized to either ``sinf``, ``sin`` or ``sinl`` +depending on the type of its argument ``x``. A callable's behavior during type +or shape specialization is encoded via +:meth:`~loopy.kernel.function_interface.InKernelCallable.with_types` and +:meth:`~loopy.kernel.function_interface.InKernelCallable.with_descrs`. + + +Registering callables +--------------------- + +A user can *register* callables within a :class:`~loopy.TranslationUnit` to +allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`, +we typically aim to expose all the standard math functions defined for +a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by +*registering* them. + +An example demonstrating registering a ``CBlasGemv`` as a loopy callable: + +.. literalinclude:: ../examples/python/call-external.py + +Call Instruction for a kernel call +---------------------------------- + +At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to +the call must be ordered by the order of input arguments of the callee kernel. +Similarly, the assignees must be ordered by the order of callee kernel's output +arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an +input and an output, such arguments would be a part of the call instruction's +assignees as well as the call expression node's parameters. + +Entry points +------------ + +Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from +the outside. All other callables are only visible from within the translation +unit, similar to C's ``static`` functions. + +Reference +--------- + +..
automodule:: loopy.kernel.function_interface diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 57d33b53999e06cbb07cc8363bbc46c091033cb3..1ba295777c88a79190171867354b04dec5d9405b 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -87,8 +87,6 @@ Registering Library Routines .. autofunction:: register_symbol_manglers -.. autofunction:: register_function_manglers - Modifying Arguments ------------------- @@ -144,4 +142,3 @@ TODO: Matching instruction tags .. automodule:: loopy.match .. vim: tw=75:spell - diff --git a/doc/ref_translation_unit.rst b/doc/ref_translation_unit.rst new file mode 100644 index 0000000000000000000000000000000000000000..631c5756130026cf10f4708f28ff76e1a5539722 --- /dev/null +++ b/doc/ref_translation_unit.rst @@ -0,0 +1,11 @@ +.. currentmodule:: loopy + +TranslationUnit +=============== + +.. autoclass:: TranslationUnit + +Reference +--------- + +.. automodule:: loopy.translation_unit diff --git a/doc/tutorial.rst b/doc/tutorial.rst index e48fcb31c3c5632459078db499a1068e114f9021..7732aacbe6b25f51c35e3fcd72e558b2bea503b7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -333,7 +333,7 @@ an explicit dependency: ... """ ... out[j,i] = a[i,j] {id=transpose} ... out[i,j] = 2*out[i,j] {dep=transpose} - ... """) + ... """, name="transpose_and_dbl") ``{id=transpose}`` assigns the identifier *transpose* to the first instruction, and ``{dep=transpose}`` declares a dependency of the second @@ -342,9 +342,9 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True)) --------------------------------------------------------------------------- - KERNEL: loopy_kernel + KERNEL: transpose_and_dbl --------------------------------------------------------------------------- ... 
--------------------------------------------------------------------------- @@ -394,7 +394,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -743,7 +743,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_size_upper_bounds() + >>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table) >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) @@ -1165,7 +1165,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1186,7 +1186,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... 
"[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1214,8 +1214,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_linearized_kernel`: - >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1244,10 +1246,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to put those instructions into the schedule. 
- >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1286,7 +1288,7 @@ does in more detail: The kernel translates into two OpenCL kernels. - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -1312,7 +1314,7 @@ Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - >>> evt, (out,) = knl(queue, arr=arr) + >>> evt, (out,) = prog(queue, arr=arr) >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] @@ -1549,7 +1551,7 @@ containing different types of data: ... """ ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] ... e[i, k] = g[i,k]*(2+h[i,k+1]) - ... """) + ... """, name="stats_knl") >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) @@ -1560,7 +1562,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, stats_knl) : ... 
Each line of output will look roughly like:: @@ -1586,12 +1588,12 @@ One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_wi >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1605,7 +1607,7 @@ One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_wi grouping, and evaluating subsets of the counts. Suppose we want to know the total number of 32-bit operations of any kind. We can easily count these using functions :func:`loopy.ToCountMap.filter_by` and -:func:`loopy.ToCountMap.eval_and_sum`: +:func:`loopy.ToCountPolynomialMap.eval_and_sum`: .. 
doctest:: @@ -1648,15 +1650,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1691,13 +1693,13 @@ We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dic .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, "stats_knl") ... 
].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1715,13 +1717,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1758,12 +1760,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... 
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1773,13 +1775,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... 
].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1799,12 +1801,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1813,13 +1815,13 @@ changed: .. 
doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1853,14 +1855,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, stats_knl) : [l, m, n] -> { 1 } We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. 
doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1913,8 +1915,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257bf80fdfcc3d3b978a7dca2d401c48271..d9ac1f1b22a92b138e4f6432315f281b2a894aed 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(fill)" + "print(prog)" ] }, { @@ -91,10 +91,10 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", - "! RESULT = [tr_fill]\n", + "! 
RESULT = tr_fill\n", "!\n", "!$loopy end" ] @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(tr_fill)" + "print(prog)" ] }, { diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py index 111ac241198581a75ad42d91f9db8e4e89a3cbf2..499bc9b719258833bf5dac80878fa7b0311f0cb9 100644 --- a/examples/fortran/matmul-driver.py +++ b/examples/fortran/matmul-driver.py @@ -11,7 +11,7 @@ def main(): with open(fn) as inf: source = inf.read() - dgemm, = lp.parse_transformed_fortran(source, filename=fn) + dgemm = lp.parse_transformed_fortran(source, filename=fn) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 23840f09a46ab97902a8d1ed7e078a7c70d36dec..733cdaac4d9153803dcb54d5c114a33871403bbf 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -28,5 +28,5 @@ end subroutine ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", ! precompute_outer_inames="i_outer, j_outer, k_outer", ! default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0403a7ab475b3e357f18489847367c3d..2b156bdd709e8f4258492d258adb888ad16fbccd 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! 
sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68ae2fc6f3b7052325fcd2378e9880e47..c7ebb75667142a8bb470b32f1d92177e135db9b2 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b63492063bfd2a9604c42dbf65b2ecb86bf..211c38049076cbe065ce847f948d724c293a032c 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end diff --git a/examples/python/call-external.py b/examples/python/call-external.py new file mode 100644 index 0000000000000000000000000000000000000000..49b25d6e015780789c5e56af46d47a14e4611cf8 --- /dev/null +++ b/examples/python/call-external.py @@ -0,0 +1,99 @@ +import loopy as lp +import numpy as np +from loopy.diagnostic import LoopyError +from loopy.target.c import CTarget +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +# {{{ blas callable + +class CBLASGEMV(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + mat_dtype = arg_id_to_dtype.get(0) + vec_dtype = arg_id_to_dtype.get(1) + + if mat_dtype is None or vec_dtype is None: + # types aren't specialized enough to be resolved + return self, callables_table + + if mat_dtype != vec_dtype: + raise LoopyError("GEMV requires same dtypes for matrix and " + "vector") + + if vec_dtype.numpy_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype. numpy_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV is only supported for float32 and float64 " + "types") + + return (self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: vec_dtype, + 1: vec_dtype, + -1: vec_dtype}), + callables_table) + + def with_descrs(self, arg_id_to_descr, callables_table): + mat_descr = arg_id_to_descr.get(0) + vec_descr = arg_id_to_descr.get(1) + res_descr = arg_id_to_descr.get(-1) + + if mat_descr is None or vec_descr is None or res_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert mat_descr.shape[1] == vec_descr.shape[0] + assert mat_descr.shape[0] == res_descr.shape[0] + assert len(vec_descr.shape) == len(res_descr.shape) == 1 + # handling only the easy case when stride == 1 + assert vec_descr.dim_tags[0].stride == 1 + assert mat_descr.dim_tags[1].stride == 1 + assert res_descr.dim_tags[0].stride == 1 + + return self.copy(arg_id_to_descr=arg_id_to_descr), 
callables_table

+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from pymbolic import var
+        mat_descr = self.arg_id_to_descr[0]
+        m, n = mat_descr.shape
+        ecm = expression_to_code_mapper
+        mat, vec = insn.expression.parameters
+        result, = insn.assignees
+
+        c_parameters = [var("CblasRowMajor"),
+                        var("CblasNoTrans"),
+                        m, n,
+                        1,
+                        ecm(mat).expr,
+                        1,
+                        ecm(vec).expr,
+                        1,
+                        ecm(result).expr,
+                        1]
+        return (var(self.name_in_target)(*c_parameters),
+                False  # cblas_gemv does not return anything
+                )
+
+    def generate_preambles(self, target):
+        assert isinstance(target, CTarget)
+        yield("99_cblas", "#include <cblas.h>")
+        return
+
+# }}}
+
+
+n = 10
+
+knl = lp.make_kernel(
+        "{:}",
+        """
+        y[:] = gemv(A[:, :], x[:])
+        """, [
+            lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
+            lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
+            lp.GlobalArg("y", shape=(n, )), ...],
+        target=CTarget())
+
+knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
+print(lp.generate_code_v2(knl).device_code())
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index 7f80175ebe82b8412a38708a5b1d32042d8061fe..d97fc3fa67adb22c17d4f60c2e4283aed727af8a 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -22,7 +22,9 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl)
+knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"],
+                                               knl.callables_table))
+
 # map schedule onto host or device
 print(knl)
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index 41fddfdee2ddf3b670bf9770ad8c4b3ec9ea7da1..ce40487b1f41a6a591134a21eeb14113fd8be4fa 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -29,8 +29,6 @@ def transform(knl, vars, stream_dtype):
 
 
 def gen_code(knl):
-
knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) codegen_result = lp.generate_code_v2(knl) return codegen_result.device_code() + "\n" + codegen_result.host_code() diff --git a/examples/python/sparse.py b/examples/python/sparse.py index 0e56df1bc3085976bfadd783f976fa912af45da8..b4dd07df40007db16ab588c26dfefb4aadb4b7eb 100644 --- a/examples/python/sparse.py +++ b/examples/python/sparse.py @@ -11,9 +11,9 @@ k = lp.make_kernel([ <> length = rowend - rowstart y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]]) end - """) + """, name="spmv") k = lp.add_and_infer_dtypes(k, { - "values,x": np.float64, "rowstarts,colindices": k.index_dtype + "values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype }) -print(lp.generate_code(k)[0]) +print(lp.generate_code_v2(k).device_code()) diff --git a/loopy/__init__.py b/loopy/__init__.py index b6008df3c65cbcac1566853f18408b496674de35..a50c622f73cd5cb17056240cbe3aabce53a012d1 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -24,13 +24,10 @@ THE SOFTWARE. 
from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning - +from loopy.translation_unit import for_each_kernel # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( LegacyStringInstructionTag, UseStreamingStoreTag, MemoryOrdering, memory_ordering, @@ -47,6 +44,10 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.translation_unit import ( + TranslationUnit, Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -59,7 +60,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -115,16 +116,21 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import (register_callable, + merge, inline_callable_kernel, rename_callable) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call + # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import preprocess_kernel, realize_reduction +from loopy.preprocess import (preprocess_kernel, realize_reduction, + preprocess_program, infer_arg_descr) from loopy.schedule import ( generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) -from loopy.statistics import (ToCountMap, CountGranularity, +from 
loopy.statistics import (ToCountMap, ToCountPolynomialMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map, get_synchronization_map, gather_access_footprints, - gather_access_footprint_bytes) + gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -167,6 +173,10 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", "CallableKernel", + + "TranslationUnit", "make_program", "Program", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -174,9 +184,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "default_function_mangler", "single_arg_function_mangler", - - "make_kernel", "UniqueName", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", @@ -229,6 +237,13 @@ __all__ = [ "add_barrier", + "register_callable", + "merge", + + "inline_callable_kernel", "rename_callable", + + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", @@ -244,17 +259,20 @@ __all__ = [ "infer_unknown_types", - "preprocess_kernel", "realize_reduction", + "preprocess_kernel", "realize_reduction", "preprocess_program", + "infer_arg_descr", + "generate_loop_schedules", "get_one_scheduled_kernel", "get_one_linearized_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", - "get_synchronization_map", "gather_access_footprints", - "gather_access_footprint_bytes", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", + "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", 
"CompiledKernel", @@ -280,7 +298,6 @@ __all__ = [ "register_preamble_generators", "register_symbol_manglers", - "register_function_manglers", "set_caching_enabled", "CacheMode", @@ -295,6 +312,7 @@ __all__ = [ # {{{ set_options +@for_each_kernel def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional @@ -302,6 +320,7 @@ def set_options(kernel, *args, **kwargs): See also :class:`Options`. """ + assert isinstance(kernel, LoopKernel) if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -333,6 +352,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration +@for_each_kernel def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` @@ -357,6 +377,7 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) +@for_each_kernel def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -373,28 +394,6 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) - -def register_function_manglers(kernel, manglers): - """ - :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` - returning a :class:`loopy.CallMangleInfo`. 
- :returns: *kernel* with *manglers* registered - """ - from loopy.tools import unpickles_equally - - new_manglers = kernel.function_manglers[:] - for m in manglers: - if m not in new_manglers: - if not unpickles_equally(m): - raise LoopyError("mangler '%s' does not " - "compare equally after being upickled " - "and would disrupt loopy's caches" - % m) - - new_manglers.insert(0, m) - - return kernel.copy(function_manglers=new_manglers) - # }}} @@ -439,7 +438,7 @@ class CacheMode: # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`LoopKernel` that changes the data layout + """Returns a :class:`loopy.TranslationUnit` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. *old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 988e83f88c7a1f7a065813f3c1f9319695b0d97c..e3e41beef89c6796a4bef226b5f5f933f286478e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -27,6 +27,7 @@ from pytools import Record import numpy as np import loopy as lp + from loopy.diagnostic import LoopyError, AutomaticTestFailure @@ -115,7 +116,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel.get_written_variables() + is_output = kernel_arg.is_output if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( @@ -302,12 +303,10 @@ def _default_check_result(result, ref_result): if not np.allclose(ref_result, result, rtol=1e-3, atol=1e-3): l2_err = ( np.sum(np.abs(ref_result-result)**2) - / - np.sum(np.abs(ref_result)**2)) + / np.sum(np.abs(ref_result)**2)) linf_err = ( np.max(np.abs(ref_result-result)) - / - np.max(np.abs(ref_result-result))) + / np.max(np.abs(ref_result-result))) return (False, "results do not match -- (rel) l_2 err: %g, l_inf err: 
%g" % (l2_err, linf_err)) @@ -366,12 +365,13 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors, need_image_support # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, + ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, - quiet=False, blacklist_ref_vendors=[]): + quiet=False, blacklist_ref_vendors=[], ref_entrypoint=None, + test_entrypoint=None): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. @@ -383,24 +383,37 @@ def auto_test_vs_ref( import pyopencl as cl - if test_knl is None: - test_knl = ref_knl + if test_prog is None: + test_prog = ref_prog do_check = False - if len(ref_knl.args) != len(test_knl.args): - raise LoopyError("ref_knl and test_knl do not have the same number " + if ref_entrypoint is None: + if len(ref_prog.entrypoints) != 1: + raise LoopyError("Unable to guess entrypoint for ref_prog.") + ref_entrypoint = list(ref_prog.entrypoints)[0] + + if test_entrypoint is None: + if len(test_prog.entrypoints) != 1: + raise LoopyError("Unable to guess entrypoint for ref_prog.") + test_entrypoint = list(test_prog.entrypoints)[0] + + ref_prog = lp.preprocess_kernel(ref_prog) + test_prog = lp.preprocess_kernel(test_prog) + + if len(ref_prog[ref_entrypoint].args) != len(test_prog[test_entrypoint].args): + raise LoopyError("ref_prog and test_prog do not have the same number " "of arguments") - for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): + for i, (ref_arg, test_arg) in enumerate(zip(ref_prog[ref_entrypoint].args, + test_prog[test_entrypoint].args)): if ref_arg.name != test_arg.name: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog 
argument lists disagree at " + "index %d (1-based)" % (i+1)) if ref_arg.dtype != test_arg.dtype: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog argument lists disagree at " + "index %d (1-based)" % (i+1)) - from loopy.compiled import CompiledKernel from loopy.target.execution import get_highlighted_code if isinstance(op_count, (int, float)): @@ -421,14 +434,15 @@ def auto_test_vs_ref( # {{{ compile and run reference code from loopy.type_inference import infer_unknown_types - ref_knl = infer_unknown_types(ref_knl, expect_completion=True) + ref_prog = infer_unknown_types(ref_prog, expect_completion=True) found_ref_device = False ref_errors = [] from loopy.kernel.data import ImageArg - need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_knl.args) + need_ref_image_support = any(isinstance(arg, ImageArg) + for arg in ref_prog[ref_entrypoint].args) for dev in _enumerate_cl_devices_for_ref_test( blacklist_ref_vendors, need_ref_image_support): @@ -436,30 +450,26 @@ def auto_test_vs_ref( ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + ref_codegen_result = lp.generate_code_v2(ref_prog) - pp_ref_knl = lp.preprocess_kernel(ref_knl) - - for knl in lp.generate_loop_schedules(pp_ref_knl): - ref_sched_kernel = knl - break + ref_implemented_data_info = ref_codegen_result.implemented_data_infos[ + ref_entrypoint] logger.info("{} (ref): trying {} for the reference calculation".format( - ref_knl.name, dev)) + ref_entrypoint, dev)) - ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75*"-") print("Reference Code:") print(75*"-") - print(get_highlighted_code(ref_compiled.get_code())) + print(get_highlighted_code( + ref_codegen_result.device_code())) print(75*"-") - ref_kernel_info = ref_compiled.kernel_info(frozenset()) - try: ref_args, ref_arg_data = \ 
- make_ref_args(ref_sched_kernel, - ref_kernel_info.implemented_data_info, + make_ref_args(ref_prog[ref_entrypoint], + ref_implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: @@ -484,13 +494,13 @@ def auto_test_vs_ref( ref_queue.finish() logger.info("{} (ref): using {} for the reference calculation".format( - ref_knl.name, dev)) - logger.info("%s (ref): run" % ref_knl.name) + ref_entrypoint, dev)) + logger.info("%s (ref): run" % ref_entrypoint) ref_start = time() if not AUTO_TEST_SKIP_RUN: - ref_evt, _ = ref_compiled(ref_queue, **ref_args) + ref_evt, _ = ref_prog(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) @@ -498,7 +508,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_knl.name) + logger.info("%s (ref): run done" % ref_entrypoint) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -521,160 +531,144 @@ def auto_test_vs_ref( from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget - if test_knl.state not in [ + if test_prog[test_entrypoint].state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED]: - if isinstance(test_knl.target, PyOpenCLTarget): - test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) - - test_knl = lp.preprocess_kernel(test_knl) + if isinstance(test_prog.target, PyOpenCLTarget): + test_prog = test_prog.copy(target=PyOpenCLTarget(ctx.devices[0])) - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] - - test_kernel_count = 0 + test_prog = lp.preprocess_kernel(test_prog) from loopy.type_inference import infer_unknown_types - for i, kernel in enumerate(test_kernels): - test_kernel_count += 1 - if test_kernel_count > max_test_kernel_count: - break - - kernel = infer_unknown_types(kernel, expect_completion=True) - compiled = CompiledKernel(ctx, kernel) + 
test_prog = infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog[test_entrypoint], + test_prog_codegen_result.implemented_data_infos[ + test_entrypoint], + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel:") + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - kernel_info = compiled.kernel_info(frozenset()) + logger.info("%s: run warmup" % (test_entrypoint)) - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - args["out_host"] = False + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - # {{{ find cl program + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - for name in dir(kernel_info.cl_kernels): - if name.startswith("__"): - continue - cl_kernel = getattr(kernel_info.cl_kernels, name) - cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) - break - else: - assert False, "could not find cl_program" + error_is_small, 
error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - # }}} + need_check = False - print(type(cl_program)) - if hasattr(cl_program, "binaries"): - print(cl_program.binaries[0]) + events = [] + queue.finish() - print(75*"-") + logger.info("%s: warmup done" % (test_entrypoint)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_entrypoint)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = max(warmup_rounds, 1) - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = max(warmup_rounds, 1) + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - 
events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_entrypoint)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % ( format_float_or_none(elapsed_event), diff --git 
a/loopy/check.py b/loopy/check.py index 133097073f625e211828ebed8c2da36f79163d04..3e37b3e22651187ce838855c15caec3b10619230 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -23,16 +23,19 @@ THE SOFTWARE. from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning, - warn_with_kernel, LoopyIndexError) -from loopy.type_inference import TypeInferenceMapper + warn_with_kernel, LoopyIndexError) +from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, - CInstruction, _DataObliviousInstruction) + CInstruction, _DataObliviousInstruction, + NoOpInstruction) from pytools import memoize_method from collections import defaultdict +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -88,6 +91,58 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnresolvedCallCollector(CombineMapper): + """ + Collects all the unresolved calls within a kernel. + + :returns: + A :class:`frozenset` of function names that are not resolved. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if not isinstance(expr.function, ResolvedFunction): + return frozenset([expr.function.name]) | self.rec(expr.parameters) + else: + return self.rec(expr.parameters) + + def map_call_with_kwargs(self, expr): + # See: https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_resolved(kernel): + """ Checks if all call nodes in the *kernel* expression have been + resolved. 
+ """ + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unresolved_calls = UnresolvedCallCollector()(subst_expander(insn + .expression)) + if unresolved_calls: + raise LoopyError("Unknown function '%s' -- register a " + "callable corresponding to it." % + set(unresolved_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn)) + # }}} @@ -98,7 +153,7 @@ def check_identifiers_in_subst_rules(knl): VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) -class SubscriptIndicesIsIntChecker(TypeInferenceMapper): +class SubscriptIndicesIsIntChecker(TypeReader): def map_subscript(self, expr): for idx in expr.index_tuple: type_inf_result = self.rec(idx) @@ -114,12 +169,12 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): """ Checks is every array access is of type :class:`int`. 
""" from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -133,6 +188,27 @@ def check_for_integer_subscript_indices(kernel): type(insn).__name__)) +def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): + all_within_inames = frozenset().union(*(insn.within_inames + for insn in kernel.instructions)) + all_redn_inames = frozenset().union(*(insn.reduction_inames() + for insn in kernel.instructions)) + all_sar_inames = frozenset().union(*(insn.sub_array_ref_inames() + for insn in kernel.instructions)) + + if all_sar_inames & all_within_inames: + sample = next(iter(all_sar_inames & all_within_inames)) + raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" + " iname and an instruction's within inames. Such usage" + " is illegal.") + + if all_sar_inames & all_redn_inames: + sample = next(iter(all_sar_inames & all_within_inames)) + raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" + " iname and a reduction iname. Such usage is" + " illegal.") + + def check_insn_attributes(kernel): """ Check for legality of attributes of every instruction in *kernel*. @@ -209,15 +285,24 @@ def check_multiple_tags_allowed(kernel): "tags: {}".format(iname.name, iname.tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, callables_table): """ Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. 
""" - from loopy.kernel.data import UniqueTag + from loopy.kernel.data import UniqueTag, GroupIndexTag, LocalIndexTag + from loopy.kernel.instruction import CallInstruction + from loopy.symbolic import ResolvedFunction for insn in kernel.instructions: insn_tag_keys = set() + if isinstance(insn, CallInstruction): + assert isinstance(insn.expression.function, ResolvedFunction) + clbl = callables_table[insn.expression.function.name] + gsize, lsize = clbl.get_used_hw_axes(callables_table) + insn_tag_keys |= {GroupIndexTag(i).key for i in gsize} + insn_tag_keys |= {LocalIndexTag(i).key for i in lsize} + for iname in insn.within_inames: for tag in kernel.iname_tags_of_type(iname, UniqueTag): key = tag.key @@ -238,9 +323,11 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= insn.within_inames: raise LoopyError( "instruction '%s' references " - "inames '%s' that the instruction does not depend on" + "inames '%s' that the instruction does not depend on in " + "the kernel '%s'" % (insn.id, - ", ".join(expression_inames - insn.within_inames))) + ", ".join(expression_inames + - insn.within_inames), kernel.name)) def check_for_unused_inames(kernel): @@ -555,7 +642,7 @@ def check_write_destinations(kernel): def check_has_schedulable_iname_nesting(kernel): from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) + get_iname_duplication_options) if not has_schedulable_iname_nesting(kernel): import itertools as it opt = get_iname_duplication_options(kernel) @@ -860,14 +947,25 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + from loopy.kernel.data import auto + if all(arg.dtype not in [None, auto] for arg in kernel.args) and ( + all(tv.dtype not in [None, auto] for tv in + kernel.temporary_variables.values())): + # only 
check if all types are known + check_for_integer_subscript_indices(kernel, callables_table) + + check_functions_are_resolved(kernel) + # Ordering restriction: + # check_sub_array_ref_inames_not_within_or_redn_inames should be done + # before check_bounds. See: BatchedAccessMapMapper.map_sub_array_ref. + check_sub_array_ref_inames_not_within_or_redn_inames(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -896,7 +994,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -911,10 +1010,11 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + callables_table, return_dict=True) - group_axes = {ax for ax, length in enumerate(group_size)} - local_axes = {ax for ax, length in enumerate(local_size)} + group_axes = set(group_size.keys()) + local_axes = set(local_size.keys()) i = sched_index + 1 assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel) @@ -928,12 +1028,16 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = 
_check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] i += 1 + if isinstance(insn, NoOpInstruction): + continue + group_axes_used = set() local_axes_used = set() @@ -952,6 +1056,19 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): elif altags: raise LoopyError("auto local tag encountered") + # {{{ account for any hw axes due to a callable + + if isinstance(insn, CallInstruction): + assert isinstance(insn.expression.function, ResolvedFunction) + clbl = callables_table[insn.expression.function.name] + clbl_g_axes, clbl_l_axes = clbl.get_used_hw_axes(callables_table) + assert len(group_axes_used & clbl_g_axes) == 0 + assert len(local_axes_used & clbl_l_axes) == 0 + group_axes_used |= clbl_g_axes + local_axes_used |= clbl_l_axes + + # }}} + if group_axes != group_axes_used: raise LoopyError( f"instruction '{insn.id}' does not use all group hw axes " @@ -983,9 +1100,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table) # }}} @@ -1135,23 +1253,174 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): - try: - logger.debug("pre-codegen check %s: start" % kernel.name) +# {{{ validate_kernel_call_sites + +def _get_sub_array_ref_swept_range(kernel, sar): + from loopy.symbolic import get_access_map + domain = kernel.get_inames_domain(frozenset({iname_var.name + for iname_var in sar.swept_inames})) + return get_access_map(domain, sar.swept_inames, kernel.assumptions).range() - check_for_unused_hw_axes_in_insns(kernel) - 
check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) - check_that_temporaries_are_defined_in_subkernels_where_used(kernel) - check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) - check_that_shapes_and_strides_are_arguments(kernel) - logger.debug("pre-codegen check %s: done" % kernel.name) +def _are_sub_array_refs_equivalent(sar1, sar2, caller): + """ + Returns *True* iff *sar1* and *sar2* are equivalent + :class:`loopy.SubArrayRef`s. + + Two sub-array-refs are said to be equivalent iff they point to the same + array sub-regions. This equivalence check is less strict than + :meth:`~loopy.SubArrayRef.is_equal`. + + :arg caller: An instance of :class:`loopy.LoopKernel` in which they are + referenced. + """ + if len(sar1.swept_inames) != len(sar2.swept_inames): + return False + + if sar1.subscript.aggregate.name != sar2.subscript.aggregate.name: + return False + + if len(sar1.subscript.index_tuple) != len(sar2.subscript.index_tuple): + return False + + if (_get_sub_array_ref_swept_range(caller, sar1) + != _get_sub_array_ref_swept_range(caller, sar2)): + return False + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + from loopy.isl_helpers import simplify_via_aff + subst_func = make_subst_func({iname1.name: iname2 + for iname1, iname2 in zip(sar1.swept_inames, + sar2.swept_inames) + }) + + # subst_mapper: maps swept inames from sar1 to sar2 + subst_mapper = SubstitutionMapper(subst_func) + + for idx1, idx2 in zip(sar1.subscript.index_tuple, + sar2.subscript.index_tuple): + if simplify_via_aff(subst_mapper(idx1) - idx2) != 0: + return False + return True + + +def _validate_kernel_call_insn(caller, call_insn, callee): + assert call_insn.expression.function.name == callee.name + from loopy.symbolic import SubArrayRef + from loopy.kernel.array import ArrayBase + + arg_id_to_arg = call_insn.arg_id_to_arg() + + next_iarg_input = 0 + next_iarg_output = -1 + + for arg in 
callee.args: + if arg.is_input: + if next_iarg_input not in arg_id_to_arg: + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" a {next_iarg_input+1}-th positional " + "argument corresponding" + f" to '{arg.name}' in the callee.") + in_val = arg_id_to_arg[next_iarg_input] + next_iarg_input += 1 + if isinstance(arg, ArrayBase): + if not isinstance(in_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a sub-array-ref for '{arg.name}'" + f" (got {in_val}).") + else: + if isinstance(in_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a value argument for '{arg.name}'" + f" (got {in_val}).") + if arg.is_output: + if next_iarg_output not in arg_id_to_arg: + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" a {-next_iarg_output}-th positional assignee" + f" corresponding to '{arg.name}' in the callee.") + + out_val = arg_id_to_arg[next_iarg_output] + next_iarg_output -= 1 + assert isinstance(arg, ArrayBase) + if not isinstance(out_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a sub-array-ref for '{arg.name}'" + f" (got {out_val}).") + + if arg.is_input and arg.is_output: + if not _are_sub_array_refs_equivalent(in_val, out_val, caller): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" equivalent sub-array-refs for '{arg.name}'" + f" (got {in_val}, {out_val}).") + + +def _validate_kernel_call_sites_inner(kernel, callables): + from pymbolic.primitives import Call + from loopy.kernel.function_interface import CallableKernel + + for insn in kernel.instructions: + if (isinstance(insn, CallInstruction) + and isinstance(insn.expression, Call) + and isinstance(insn.expression.function, ResolvedFunction)): + clbl = callables[insn.expression.function.name] + if isinstance(clbl, CallableKernel): + _validate_kernel_call_insn(kernel, insn, clbl.subkernel) + elif isinstance(insn,
(MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn)) + + +def validate_kernel_call_sites(translation_unit): + from loopy import LoopKernel + + for name in translation_unit.callables_table: + clbl = translation_unit[name] + if isinstance(clbl, LoopKernel): + _validate_kernel_call_sites_inner(clbl, translation_unit.callables_table) + + +# }}} + + +def pre_codegen_entrypoint_checks(kernel, callables_table): + logger.debug("pre-codegen entrypoint check %s: start" % kernel.name) + + kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) + + logger.debug("pre-codegen entrypoint check %s: done" % kernel.name) + + +def pre_codegen_callable_checks(kernel, callables_table): + logger.debug("pre-codegen callable check %s: start" % kernel.name) + + check_for_unused_hw_axes_in_insns(kernel, callables_table) + check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) + check_that_temporaries_are_defined_in_subkernels_where_used(kernel) + check_that_all_insns_are_scheduled(kernel) + kernel.target.pre_codegen_callable_check(kernel, callables_table) + check_that_shapes_and_strides_are_arguments(kernel) + + logger.debug("pre-codegen callable check %s: done" % kernel.name) + + +def pre_codegen_checks(t_unit): + from loopy.kernel.function_interface import CallableKernel + + try: + for e in t_unit.entrypoints: + pre_codegen_entrypoint_checks(t_unit[e], t_unit.callables_table) + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + pre_codegen_callable_checks(clbl.subkernel, t_unit.callables_table) except Exception: print(75*"=") - print("failing kernel during pre-schedule check:") + print("failing kernel during pre-codegen check:") print(75*"=") - print(kernel) + print(t_unit) print(75*"=") raise diff --git a/loopy/cli.py b/loopy/cli.py index 4230b74967fc0fa7dcb0064bb712ee9ab140b299..4544df1662ee9dd8d34116a2f2962d8719563916 100644 --- a/loopy/cli.py +++ 
b/loopy/cli.py @@ -63,11 +63,9 @@ def main(): parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), default="opencl") - parser.add_argument("--name") parser.add_argument("--transform") parser.add_argument("--edit-code", action="store_true") parser.add_argument("--occa-defines") - parser.add_argument("--occa-add-dummy-arg", action="store_true") parser.add_argument("--print-ir", action="store_true") args = parser.parse_args() @@ -106,9 +104,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", ".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile) as infile_fd: infile_content = infile_fd.read() @@ -159,10 +159,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - if args.name is not None: - kernel = kernel.copy(name=args.name) - - kernels = [kernel] + t_unit = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -179,69 +176,31 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - kernels = lp.parse_transformed_fortran( + t_unit = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) - if args.name is not None: - kernels = [kernel for kernel in kernels - if kernel.name == args.name] - - if not kernels: - raise RuntimeError("no kernels found (name specified: %s)" - % args.name) - else: raise RuntimeError("unknown language: '%s'" % args.lang) + if not isinstance(t_unit, lp.TranslationUnit): + # FIXME + assert isinstance(t_unit, list) # of kernels + raise NotImplementedError("convert list of kernels to TranslationUnit") + if args.print_ir: - for kernel in kernels: - print(kernel, file=sys.stderr) - - if args.occa_add_dummy_arg: - new_kernels = [] - for kernel in kernels: - new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) - ] + kernel.args - 
new_kernels.append(kernel.copy(args=new_args)) - - kernels = new_kernels - del new_kernels - - codes = [] - from loopy.codegen import generate_code - for kernel in kernels: - kernel = lp.preprocess_kernel(kernel) - code, impl_arg_info = generate_code(kernel) - codes.append(code) + print(t_unit, file=sys.stderr) + + t_unit = lp.preprocess_kernel(t_unit) + cgr = lp.generate_code_v2(t_unit) if args.outfile is not None: outfile = args.outfile else: outfile = "-" - code = "\n\n".join(codes) - - # {{{ edit code if requested - - import os - edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") - need_edit = args.edit_code - if not need_edit and edit_kernel_env is not None: - # Do not replace with "any()"--Py2.6/2.7 bug doesn't like - # comprehensions in functions with exec(). - - for k in kernels: - if edit_kernel_env.lower() in k.name.lower(): - need_edit = True - - if need_edit: - from pytools import invoke_editor - code = invoke_editor(code, filename="edit.cl") - - # }}} + code = cgr.device_code() if outfile == "-": sys.stdout.write(code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 694ebb07af076e7e8635414443794be279891fd0..59cc894148432d14dc22b1efdc3c615da6a320a9 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -20,16 +20,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from loopy.diagnostic import LoopyError, warn -from pytools import ImmutableRecord, ProcessLogger +import logging +logger = logging.getLogger(__name__) + import islpy as isl +from loopy.diagnostic import LoopyError, warn +from pytools import ImmutableRecord + from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -import logging -logger = logging.getLogger(__name__) + +from loopy.symbolic import CombineMapper +from functools import reduce + +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger __doc__ = """ .. currentmodule:: loopy.codegen @@ -44,6 +53,8 @@ __doc__ = """ .. autoclass:: CodeGenerationState +.. autoclass:: TranslationUnitCodeGenerationResult + .. automodule:: loopy.codegen.result .. automodule:: loopy.codegen.tools @@ -142,7 +153,10 @@ class VectorizationInfo: class SeenFunction(ImmutableRecord): - """ + """This is used to track functions that emerge late during code generation, + e.g. C functions to realize arithmetic. No connection with + :class:`~loopy.kernel.function_interface.InKernelCallable`. + .. attribute:: name .. attribute:: c_name .. attribute:: arg_dtypes @@ -165,6 +179,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState: """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -207,21 +222,34 @@ class CodeGenerationState: .. attribute:: schedule_index_end + .. attribute:: callables_table + + A mapping from callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: is_entrypoint + + A :class:`bool` to indicate if the code is being generated for an + entrypoint kernel + .. attribute:: codegen_cache_manager An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`. 
""" - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + callables_table, + is_entrypoint, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None, codegen_cachemanager=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -230,6 +258,8 @@ class CodeGenerationState: self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.callables_table = callables_table + self.is_entrypoint = is_entrypoint self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -239,19 +269,24 @@ class CodeGenerationState: # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, vectorization_info=None, - is_generating_device_code=None, - gen_program_name=None, + var_subst_map=None, is_entrypoint=None, vectorization_info=None, + is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info + if is_entrypoint is None: + is_entrypoint = self.is_entrypoint + if vectorization_info is False: vectorization_info = None @@ -269,6 +304,7 @@ class CodeGenerationState: return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain 
or self.implemented_domain, implemented_predicates=( @@ -278,6 +314,8 @@ class CodeGenerationState: seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + callables_table=self.callables_table, + is_entrypoint=is_entrypoint, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -389,6 +427,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. attribute:: kernel @@ -401,45 +465,19 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, callables_table, target, + is_entrypoint): """ :returns: a :class:`CodeGenerationResult` + + :param kernel: An instance of :class:`loopy.LoopKernel`. 
""" from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - if kernel.schedule is None: - from loopy.schedule import get_one_linearized_kernel - kernel = get_one_linearized_kernel(kernel) - if kernel.state != KernelState.LINEARIZED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - - if CACHING_ENABLED: - input_kernel = kernel - try: - result = code_gen_cache[input_kernel] - logger.debug("%s: code generation cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) - codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") # {{{ examine arg list @@ -454,13 +492,13 @@ def generate_code_v2(kernel): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -486,6 +524,7 @@ def generate_code_v2(kernel): codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -497,14 +536,17 @@ def generate_code_v2(kernel): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), + callables_table=callables_table, + 
is_entrypoint=is_entrypoint, codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel), ) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -539,7 +581,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -555,10 +597,221 @@ def generate_code_v2(kernel): codegen_plog.done() + return codegen_result + + +def diverge_callee_entrypoints(program): + """ + If a :class:`loopy.kernel.function_interface.CallableKernel` is both an + entrypoint and a callee, then rename the callee. + """ + from loopy.translation_unit import (get_reachable_resolved_callable_ids, + rename_resolved_functions_in_a_single_kernel, + make_callable_name_generator) + callable_ids = get_reachable_resolved_callable_ids(program.callables_table, + program.entrypoints) + + new_callables = {} + todo_renames = {} + + vng = make_callable_name_generator(program.callables_table) + + for clbl_id in callable_ids & program.entrypoints: + todo_renames[clbl_id] = vng(based_on=clbl_id) + + for name, clbl in program.callables_table.items(): + if name in todo_renames: + name = todo_renames[name] + + if isinstance(clbl, CallableKernel): + knl = rename_resolved_functions_in_a_single_kernel(clbl.subkernel, + todo_renames) + knl = knl.copy(name=name) + clbl = clbl.copy(subkernel=knl) + + new_callables[name] = clbl + + return program.copy(callables_table=new_callables) + + +class TranslationUnitCodeGenerationResult(ImmutableRecord): + """ + .. attribute:: host_program + + A mapping from names of entrypoints to their host + :class:`~loopy.codegen.result.GeneratedProgram`. + + .. 
attribute:: device_programs + + A list of :class:`~loopy.codegen.result.GeneratedProgram` instances + intended to run on the compute device. + + .. attribute:: host_preambles + .. attribute:: device_preambles + + .. attribute:: implemented_data_infos + + A mapping from names of entrypoints to their + list of :class:`ImplementedDataInfo` objects. + + .. automethod:: host_code + .. automethod:: device_code + .. automethod:: all_code + + """ + def host_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles(getattr(self, "host_preambles", [])) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(hp.ast) + for hp in self.host_programs.values())) + + def device_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles(getattr(self, "device_preambles", [])) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs)) + + def all_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles( + getattr(self, "host_preambles", []) + + + getattr(self, "device_preambles", []) + ) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs) + + "\n\n" + + "\n\n".join(str(hp.ast) for hp in + self.host_programs.values())) + + +def generate_code_v2(program): + """ + Returns an instance of :class:`CodeGenerationResult`. + + :param program: An instance of :class:`loopy.TranslationUnit`. 
+ """ + + from loopy.kernel import LoopKernel + from loopy.translation_unit import make_program + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + from loopy.preprocess import prepare_for_caching + if CACHING_ENABLED: - code_gen_cache.store_if_not_present(input_kernel, codegen_result) + input_program = prepare_for_caching(program) + try: + result = code_gen_cache[input_program] + logger.debug(f"TranslationUnit with entrypoints {program.entrypoints}:" + " code generation cache hit") + return result + except KeyError: + pass - return codegen_result + # }}} + + if isinstance(program, LoopKernel): + program = make_program(program) + + from loopy.kernel import KernelState + if program.state < KernelState.PREPROCESSED: + # Note that we cannot have preprocessing separately for everyone. + # Since, now the preprocessing of each one depends on the other. + # So we check if any one of the callable kernels are not preprocesses + # then, we have to do the preprocessing of every other kernel. + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + new_callables = {} + + for name, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + from loopy.schedule import get_one_linearized_kernel + knl = clbl.subkernel + if knl.schedule is None: + knl = get_one_linearized_kernel( + knl, program.callables_table) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + program = program.copy(callables_table=new_callables) + + # Why diverge? Generated code for a non-entrypoint kernel and an entrypoint + # kernel isn't same for a general loopy target. 
For example in OpenCL, a + # kernel callable from host and the one supposed to be callable from device + # have different function signatures. To generate correct code, each + # callable should be exclusively an entrypoint or a non-entrypoint kernel. + program = diverge_callee_entrypoints(program) + + from loopy.check import pre_codegen_checks + pre_codegen_checks(program) + + host_programs = {} + device_programs = [] + device_preambles = [] + callee_fdecls = [] + implemented_data_infos = {} + + # {{{ collect host/device programs + + for func_id in sorted(key for key, val in program.callables_table.items() + if isinstance(val, CallableKernel)): + cgr = generate_code_for_a_single_kernel(program[func_id], + program.callables_table, + program.target, + func_id in program.entrypoints) + if func_id in program.entrypoints: + host_programs[func_id] = cgr.host_program + implemented_data_infos[func_id] = cgr.implemented_data_info + else: + assert len(cgr.device_programs) == 1 + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) + + device_programs.extend(cgr.device_programs) + device_preambles.extend(cgr.device_preambles) + + # }}} + + # {{{ collect preambles + + for func_id, clbl in program.callables_table.items(): + device_preambles.extend(list(clbl.generate_preambles(program.target))) + + # }}} + + # adding the callee fdecls to the device_programs + device_programs = ([device_programs[0].copy( + ast=program.target.get_device_ast_builder().ast_module.Collection( + callee_fdecls+[device_programs[0].ast]))] + + device_programs[1:]) + cgr = TranslationUnitCodeGenerationResult( + host_programs=host_programs, + device_programs=device_programs, + device_preambles=device_preambles, + implemented_data_infos=implemented_data_infos) + + if CACHING_ENABLED: + code_gen_cache.store_if_not_present(input_program, cgr) + + return cgr def generate_code(kernel, device=None): @@ -572,8 +825,14 @@ def generate_code(kernel, device=None): if len(codegen_result.device_programs) > 1: raise 
LoopyError("kernel passed to generate_code yielded multiple " "device programs. Use generate_code_v2.") + if len(codegen_result.host_programs) > 1: + raise LoopyError("kernel passed to generate_code yielded multiple " + "host programs. Use generate_code_v2.") + + assert len(codegen_result.implemented_data_infos) == 1 + implemented_data_info, = codegen_result.implemented_data_infos.values() - return codegen_result.device_code(), codegen_result.implemented_data_info + return codegen_result.device_code(), implemented_data_info # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index f7e953d9481aee705b785406462725ea25d860fe..ec2a2e2832d7b68b61d8bcf48df94014e077a2de 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -88,18 +88,22 @@ def generate_code_for_sched_index(codegen_state, sched_index): codegen_result = generate_host_or_device_program( new_codegen_state, sched_index) - glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + if codegen_state.is_entrypoint: + glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for non-entrypoint kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index d149eb95ab7c115b38cc1b819b1c24f7b4597170..724989d287526917194bfe041896e9f548874852 100644 --- 
a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -251,7 +251,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.callables_table, return_dict=True) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 685df8fdec9ef0ea9e45223ceae563d943a69d79..620430f93696195e78495047a2375f986098f357 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -302,27 +302,29 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code + or codegen_state.is_entrypoint): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + 
codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) return codegen_result diff --git a/loopy/compiled.py b/loopy/compiled.py index f9313c6c95612ddba6566d7c8175d998e8312147..0fa18eacbc3a16059e06c33202c91f89cc39ef64 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -31,11 +31,11 @@ class CompiledKernel(PyOpenCLKernelExecutor): """ .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, kernel, entrypoint): from warnings import warn warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super().__init__(context, kernel) + super().__init__(context, kernel, entrypoint) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 67b8efda9a6375f7e6b58dbdf4a52631aab793f6..fc8fbe71f8fbec10f31684107f012dc94201618d 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -48,7 +48,7 @@ class WriteRaceConditionWarning(LoopyWarning): # }}} -def warn_with_kernel(kernel, id, text, type=LoopyWarning): +def warn_with_kernel(kernel, id, text, type=LoopyWarning, stacklevel=None): from fnmatch import fnmatchcase for sw in kernel.silenced_warnings: if fnmatchcase(id, sw): @@ -57,8 +57,12 @@ def warn_with_kernel(kernel, id, text, type=LoopyWarning): text += (" (add '%s' to silenced_warnings kernel attribute to disable)" % id) + if stacklevel is None: + stacklevel = 2 + else: + stacklevel = stacklevel + 1 from warnings import warn - warn(f"in kernel {kernel.name}: {text}", type, stacklevel=2) + warn(f"in kernel {kernel.name}: {text}", type, stacklevel=stacklevel) warn = MovedFunctionDeprecationWrapper(warn_with_kernel) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 10c390ff1e937d5a708e8667bc8598e1c91ef861..fa5c5050f47f220d895fc7c0229558a4f67f6f25 100644 --- a/loopy/frontend/fortran/__init__.py +++ 
b/loopy/frontend/fortran/__init__.py @@ -20,7 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + from loopy.diagnostic import LoopyError +from pytools import ProcessLogger def c_preprocess(source, defines=None, filename=None, include_paths=None): @@ -152,8 +156,9 @@ def parse_transformed_fortran(source, free_form=True, strict=True, :func:`parse_fortran`. * ``FILENAME``: the file name of the code being processed - The transform code must define ``RESULT``, conventionally a list of - kernels, which is returned from this function unmodified. + The transform code must define ``RESULT``, conventionally a list of kernels + or a :class:`loopy.TranslationUnit`, which is returned from this function + unmodified. An example of *source* may look as follows:: @@ -234,17 +239,68 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] -def parse_fortran(source, filename="", free_form=True, strict=True, - seq_dependencies=None, auto_dependencies=None, target=None, - all_names_known=True): +def _add_assignees_to_calls(knl, all_kernels): + """ + Returns a copy of *knl* coming from the fortran parser adjusted to the + loopy specification that written variables of a call must appear in the + assignee. + + :param knl: An instance of :class:`loopy.LoopKernel`, which have incorrect + calls to the kernels in *all_kernels* by stuffing both the input and + output arguments into parameters. + + :param all_kernels: An instance of :class:`list` of loopy kernels which + may be called by *kernel*. 
+ """ + new_insns = [] + subroutine_dict = {kernel.name: kernel for kernel in all_kernels} + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + +def parse_fortran(source, filename="", free_form=None, strict=None, + seq_dependencies=None, auto_dependencies=None, target=None): """ - :arg all_names_known: if set to *False*, enter an undocumented mode - in which Fortran parsing will try to tolerate unknown names. - If used, ``loopy.frontend.fortran.translator.specialize_fortran_division`` - must be called as soon as all names are known. 
- :returns: a list of :class:`loopy.LoopKernel` objects + :returns: a :class:`loopy.TranslationUnit` """ + parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) + if seq_dependencies is not None and auto_dependencies is not None: raise TypeError( "may not specify both seq_dependencies and auto_dependencies") @@ -256,6 +312,10 @@ def parse_fortran(source, filename="", free_form=True, strict=True, if seq_dependencies is None: seq_dependencies = True + if free_form is None: + free_form = True + if strict is None: + strict = True import logging console = logging.StreamHandler() @@ -273,11 +333,29 @@ def parse_fortran(source, filename="", free_form=True, strict=True, "and returned invalid data (Sorry!)") from loopy.frontend.fortran.translator import F2LoopyTranslator - f2loopy = F2LoopyTranslator( - filename, target=target, all_names_known=all_names_known) + f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) - return f2loopy.make_kernels(seq_dependencies=seq_dependencies) + kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + + from loopy.transform.callable import merge + prog = merge(kernels) + all_kernels = [clbl.subkernel + for clbl in prog.callables_table.values()] + + for knl in all_kernels: + prog.with_kernel(_add_assignees_to_calls(knl, all_kernels)) + + if len(all_kernels) == 1: + # guesssing in the case of only one function + prog = prog.with_entrypoints(all_kernels[0].name) + + from loopy.frontend.fortran.translator import specialize_fortran_division + prog = specialize_fortran_division(prog) + + parse_plog.done() + + return prog # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index 354a769a0f4b4762cc3e39befa8fb27723be5e72..cc93e914d0470c423812b69913a7185dca9c7b67 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -42,6 +42,25 @@ _and = intern("and") _or = intern("or") +def 
tuple_to_complex_literal(expr): + if len(expr) != 2: + raise TranslationError("complex literals must have " + "two entries") + + r, i = expr + + r = np.array(r)[()] + i = np.array(i)[()] + + dtype = (r.dtype.type(0) + i.dtype.type(0)) + if dtype == np.float32: + dtype = np.complex64 + else: + dtype = np.complex128 + + return dtype(float(r) + float(i)*1j) + + # {{{ expression parser class FortranExpressionParser(ExpressionParserBase): @@ -176,24 +195,31 @@ class FortranExpressionParser(ExpressionParserBase): left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, left_exp) - if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: - # this must be a complex literal - if len(left_exp) != 2: - raise TranslationError("complex literals must have " - "two entries") + return left_exp, did_something - r, i = left_exp + def parse_expression(self, pstate, min_precedence=0): + left_exp = self.parse_prefix(pstate) - dtype = (r.dtype.type(0) + i.dtype.type(0)) - if dtype == np.float32: - dtype = np.complex64 - else: - dtype = np.complex128 + did_something = True + while did_something: + did_something = False + if pstate.is_at_end(): + return left_exp - left_exp = dtype(float(r) + float(i)*1j) + result = self.parse_postfix( + pstate, min_precedence, left_exp) + left_exp, did_something = result - return left_exp, did_something + from pymbolic.parser import FinalizedTuple + if isinstance(left_exp, FinalizedTuple): + # View all tuples that survive parsing as complex literals + # "FinalizedTuple" indicates that this tuple was enclosed + # in parens. 
+ return tuple_to_complex_literal(left_exp) + + return left_exp # }}} + # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index abd1b888f1781595118cfc1b797b3a9cc0254391..6b6c75622c67e7cb8656359f3ea17ca6fdc9d7e4 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -37,12 +37,14 @@ from loopy.symbolic import (IdentityMapper, RuleAwareIdentityMapper, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from loopy.kernel.instruction import LegacyStringInstructionTag -from pymbolic.primitives import Wildcard +from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter -class SubscriptIndexBaseShifter(IdentityMapper): +class SubscriptIndexAdjuster(IdentityMapper): + """Adjust base indices of subscripts and lengths of slices.""" + def __init__(self, scope): self.scope = scope @@ -60,21 +62,63 @@ class SubscriptIndexBaseShifter(IdentityMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - subscript = list(subscript) - if len(dims) != len(subscript): raise TranslationError("inconsistent number of indices " "to '%s'" % name) + new_subscript = [] for i in range(len(dims)): if len(dims[i]) == 2: - # has a base index - subscript[i] -= dims[i][0] + # has an explicit base index + base_index, end_index = dims[i] elif len(dims[i]) == 1: - # base index is 1 implicitly - subscript[i] -= 1 + base_index = 1 + end_index, = dims[i] + + sub_i = subscript[i] + if isinstance(sub_i, Slice): + start = sub_i.start + if start is None: + start = base_index + + step = sub_i.step + if step is None: + step = 1 + + stop = sub_i.stop + if stop is None: + stop = end_index + + if step == 1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + elif step == -1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - 
base_index - 1, - return expr.aggregate[self.rec(tuple(subscript))] + step + )) + + else: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") + + else: + sub_i = sub_i - base_index + + new_subscript.append(sub_i) + + return expr.aggregate[self.rec(tuple(new_subscript))] # }}} @@ -87,9 +131,6 @@ class Scope: arg_names = set() self.subprogram_name = subprogram_name - # map name to data - self.data_statements = {} - # map first letter to type self.implicit_types = {} @@ -100,7 +141,7 @@ class Scope: self.type_map = {} # map name to data - self.data = {} + self.data_map = {} self.arg_names = arg_names @@ -189,11 +230,21 @@ class Scope: expr = submap(expr) - subshift = SubscriptIndexBaseShifter(self) + subshift = SubscriptIndexAdjuster(self) expr = subshift(expr) return expr + def written_vars(self): + return frozenset().union(*(insn.write_dependency_names() + for insn in self.instructions)) + + def read_vars(self): + return (frozenset().union(*(insn.read_dependency_names() + for insn in self.instructions)) + | frozenset().union(*(frozenset(bset.get_var_names(dim_type.param)) + for bset in self.index_sets))) + # }}} @@ -214,16 +265,21 @@ class FortranDivisionToFloorDiv(IdentityMapper): class FortranDivisionSpecializer(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, kernel): + def __init__(self, rule_mapping_context, kernel, callables): super().__init__(rule_mapping_context) - from loopy.type_inference import TypeInferenceMapper - self.infer_type = TypeInferenceMapper(kernel) + from loopy.type_inference import TypeReader + self.infer_type = TypeReader(kernel, callables) self.kernel = kernel def map_fortran_division(self, expr, *args): # We remove all these before type inference ever sees them. 
- num_dtype = self.infer_type(expr.numerator).numpy_dtype - den_dtype = self.infer_type(expr.denominator).numpy_dtype + from loopy.type_inference import TypeInferenceFailure + + try: + num_dtype = self.infer_type(expr.numerator).numpy_dtype + den_dtype = self.infer_type(expr.denominator).numpy_dtype + except TypeInferenceFailure: + return super().map_fortran_division(expr, *args) from pymbolic.primitives import Quotient, FloorDiv if num_dtype.kind in "iub" and den_dtype.kind in "iub": @@ -241,10 +297,31 @@ class FortranDivisionSpecializer(RuleAwareIdentityMapper): self.rec(expr.denominator, *args)) -def specialize_fortran_division(knl): +def _specialize_fortran_division_for_kernel(knl, callables): rmc = SubstitutionRuleMappingContext( knl.substitutions, knl.get_var_name_generator()) - return FortranDivisionSpecializer(rmc, knl).map_kernel(knl) + return FortranDivisionSpecializer(rmc, knl, callables).map_kernel(knl) + + +def specialize_fortran_division(t_unit): + from loopy.translation_unit import TranslationUnit, resolve_callables + from loopy.kernel.function_interface import CallableKernel + from loopy.type_inference import infer_unknown_types + assert isinstance(t_unit, TranslationUnit) + + t_unit = resolve_callables(t_unit) + t_unit = infer_unknown_types(t_unit) + new_callables = {} + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + clbl = clbl.copy(subkernel=_specialize_fortran_division_for_kernel( + knl, t_unit.callables_table)) + + new_callables[name] = clbl + + return t_unit.copy(callables_table=new_callables) # }}} @@ -252,11 +329,10 @@ def specialize_fortran_division(knl): # {{{ translator class F2LoopyTranslator(FTreeWalkerBase): - def __init__(self, filename, target=None, all_names_known=True): + def __init__(self, filename, target=None): FTreeWalkerBase.__init__(self, filename) self.target = target - self.all_names_known = all_names_known self.scope_stack = [] @@ -273,11 +349,16 @@ class 
F2LoopyTranslator(FTreeWalkerBase): self.block_nest = [] + def add_instruction(self, insn): + scope = self.scope_stack[-1] + + scope.previous_instruction_id = insn.id + scope.instructions.append(insn) + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = intern("insn%d" % self.insn_id_counter) - self.insn_id_counter += 1 + new_id = self.get_insn_id() from loopy.kernel.data import Assignment insn = Assignment( @@ -288,8 +369,13 @@ class F2LoopyTranslator(FTreeWalkerBase): predicates=frozenset(self.conditions), tags=tuple(self.instruction_tags)) - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_instruction(insn) + + def get_insn_id(self): + new_id = intern("insn%d" % self.insn_id_counter) + self.insn_id_counter += 1 + + return new_id # {{{ map_XXX functions @@ -383,7 +469,8 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node, node.entity_decls): + for name, shape, initializer in self.parse_dimension_specs( + node, node.entity_decls): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -392,6 +479,9 @@ class F2LoopyTranslator(FTreeWalkerBase): assert name not in scope.type_map scope.type_map[name] = tp + assert name not in scope.data_map + scope.data_map[name] = initializer + return [] map_Logical = map_type_decl # noqa: N815 @@ -403,7 +493,10 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node, node.items): + for name, shape, initializer in self.parse_dimension_specs(node, node.items): + if initializer is not None: + raise LoopyError("initializer in dimension statement") + if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -494,7 +587,47 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): - raise 
NotImplementedError("call") + from loopy.kernel.instruction import _get_assignee_var_name + scope = self.scope_stack[-1] + + new_id = self.get_insn_id() + + # {{{ comply with loopy's kernel call requirements + + callee, = (knl for knl in self.kernels + if knl.subprogram_name == node.designator) + call_params = [scope.process_expression_for_loopy(self.parse_expr(node, + item)) + for item in node.items] + callee_read_vars = callee.read_vars() + callee_written_vars = callee.written_vars() + + lpy_params = [] + lpy_assignees = [] + for param in call_params: + name = _get_assignee_var_name(param) + if name in callee_read_vars: + lpy_params.append(param) + if name in callee_written_vars: + lpy_assignees.append(param) + if name not in (callee_read_vars | callee_written_vars): + lpy_params.append(param) + + # }}} + + from pymbolic import var + + from loopy.kernel.data import CallInstruction + insn = CallInstruction( + tuple(lpy_assignees), + var(node.designator)(*lpy_params), + within_inames=frozenset( + scope.active_loopy_inames), + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + self.add_instruction(insn) def map_Return(self, node): raise NotImplementedError("return") @@ -502,11 +635,6 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_ArithmeticIf(self, node): raise NotImplementedError("arithmetic-if") - def map_If(self, node): - raise NotImplementedError("if") - # node.expr - # node.content[0] - def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] @@ -535,6 +663,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_expr) + def map_If(self, node): + self.realize_conditional(node, None) + + for c in node.content: + self.rec(c) + + self.conditions_data.pop() + self.conditions.pop() + def map_IfThen(self, node): self.block_nest.append("if") self.realize_conditional(node, None) @@ -735,6 +872,10 @@ class F2LoopyTranslator(FTreeWalkerBase): for arg_name in sub.arg_names: dims 
= sub.dim_map.get(arg_name) + if sub.data_map.get(arg_name) is not None: + raise NotImplementedError( + "initializer for argument %s" % arg_name) + if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( @@ -760,15 +901,22 @@ class F2LoopyTranslator(FTreeWalkerBase): if sub.implicit_types is None and dtype is None: continue + kwargs = {} + if sub.data_map.get(var_name) is not None: + kwargs["read_only"] = True + kwargs["address_space"] = lp.AddressSpace.PRIVATE + kwargs["initializer"] = np.array( + sub.data_map[var_name], dtype=dtype) + kernel_data.append( lp.TemporaryVariable( var_name, dtype=dtype, - shape=sub.get_loopy_shape(var_name))) + shape=sub.get_loopy_shape(var_name), + **kwargs)) # }}} - from loopy.version import MOST_RECENT_LANGUAGE_VERSION - knl = lp.make_kernel( + knl = lp.make_function( sub.index_sets, sub.instructions, kernel_data, @@ -777,14 +925,11 @@ class F2LoopyTranslator(FTreeWalkerBase): index_dtype=self.index_dtype, target=self.target, seq_dependencies=seq_dependencies, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) - if self.all_names_known: - knl = specialize_fortran_division(knl) + from loopy.loop import merge_loop_domains + knl = merge_loop_domains(knl) - from loopy.loop import fuse_loop_domains - knl = fuse_loop_domains(knl) knl = lp.fold_constants(knl) result.append(knl) diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index cc41afc6c5bca7d202548490983169374dcf1535..9a703a794ebb20a1881c1ced79258fa83b06ed04 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -60,7 +60,9 @@ class FTreeWalkerBase: ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)\s*" - r"(\((?P[-+*/0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*/0-9:a-zA-Z, \t]+)\))?" + r"(\s*=\s*(?P.+))?" 
+ "$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): @@ -83,7 +85,31 @@ class FTreeWalkerBase: else: shape = None - yield name, shape + init_str = groups["initializer"] + if init_str: + init_str = init_str.replace("(/", "[") + init_str = init_str.replace("/)", "]") + init_expr = self.parse_expr(node, init_str) + + from numbers import Number + if isinstance(init_expr, Number): + initializer = init_expr + elif isinstance(init_expr, list): + for i, item in enumerate(init_expr): + if not isinstance(item, Number): + raise LoopyError("unexpected type of " + "item %d in initializer: %s" + % (i+1, type(init_expr).__name__)) + initializer = init_expr + + else: + raise LoopyError("unexpected type of initializer: %s" + % type(init_expr).__name__) + + else: + initializer = None + + yield name, shape, initializer def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index 7f9177e0ef8430cc450cb462641b12ed1a9f9b28..a469b46489786b39516ccda58a20130de4d0e7ea 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -8,9 +8,7 @@ class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): result = lp.parse_fortran(cell) - - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result @cell_magic def transformed_fortran_kernel(self, line, cell): @@ -18,8 +16,7 @@ class LoopyMagics(Magics): cell, transform_code_context=self.shell.user_ns) - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result def load_ipython_extension(ip): diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 234074d89201700d2253b2903b94c767f523548d..d67df115474ff6e97796ac44bd5ee8fa17f15e08 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -24,7 +24,7 @@ THE SOFTWARE. 
""" -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type @@ -60,7 +60,35 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop): +def make_slab(space, iname, start, stop, iname_multiplier=1): + """ + Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the + constraint ``start <= iname_multiplier*iname < stop``. + + :arg space: An instance of :class:`islpy._isl.Space`. + + :arg iname: + + Either an instance of :class:`str` as a name of the ``iname`` or a + tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. + + :arg start: + + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the lower bound of + ``iname_multiplier*iname``(inclusive). + + :arg stop: + + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the upper bound of + ``iname_multiplier*iname``. + + :arg iname_multiplier: + + A strictly positive :class:`int` denoting *iname*'s coefficient in the + above inequality expression. 
+ """ zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -89,13 +117,16 @@ def make_slab(space, iname, start, stop): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - result = (isl.BasicSet.universe(space) - # start <= iname - .add_constraint(isl.Constraint.inequality_from_aff( - iname_aff - start)) - # iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - iname_aff))) + if iname_multiplier > 0: + result = (isl.BasicSet.universe(space) + # start <= iname_multiplier*iname + .add_constraint(isl.Constraint.inequality_from_aff( + iname_multiplier*iname_aff - start)) + # iname_multiplier*iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + stop-1 - iname_multiplier*iname_aff))) + else: + raise LoopyError("iname_multiplier must be strictly positive") return result @@ -419,11 +450,16 @@ def boxify(cache_manager, domain, box_inames, context): def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + from loopy.symbolic import aff_to_expr, guarded_aff_from_expr, get_dependencies + from loopy.diagnostic import ExpressionToAffineConversionError + deps = sorted(get_dependencies(expr)) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), - expr)) + try: + return aff_to_expr(guarded_aff_from_expr( + isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), + expr)) + except ExpressionToAffineConversionError: + return expr def project_out(set, inames): @@ -570,7 +606,7 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): def set_dim_name(obj, dt, pos, name): assert isinstance(name, str) - if isinstance(obj, isl.PwQPolynomial): + if isinstance(obj, (isl.PwQPolynomial, isl.BasicSet)): return obj.set_dim_name(dt, pos, name) elif isinstance(obj, isl.PwAff): # work around missing isl_pw_aff_set_dim_name for now. 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6885e1ab596fe7f11b4a8228ef09734c87115989..38051495d7245c570b381894f4110aed1acad26e 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,11 +35,8 @@ import re from pytools import UniqueNameGenerator, generate_unique_names, natsorted -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname from warnings import warn @@ -109,8 +106,9 @@ class _deprecated_KernelState_SCHEDULED: # noqa class KernelState: # noqa INITIAL = 0 - PREPROCESSED = 1 - LINEARIZED = 2 + CALLS_RESOLVED = 1 + PREPROCESSED = 2 + LINEARIZED = 3 @_deprecated_KernelState_SCHEDULED def SCHEDULED(): # pylint:disable=no-method-argument @@ -168,8 +166,8 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances - representing the :ref:`domain-tree`. + a list of :class:`islpy.BasicSet` instances representing the + :ref:`domain-tree`. .. attribute:: instructions @@ -198,7 +196,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): :class:`loopy.TemporaryVariable` instances. - .. attribute:: function_manglers .. attribute:: symbol_manglers .. 
attribute:: substitutions @@ -267,7 +264,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): inames=None, iname_to_tags=None, substitutions=None, - function_manglers=None, symbol_manglers=[], iname_slab_increments=None, @@ -276,7 +272,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, @@ -305,16 +301,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): temporary_variables = {} if substitutions is None: substitutions = {} - if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] if iname_slab_increments is None: iname_slab_increments = {} @@ -347,6 +333,9 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): name: inames.get(name, Iname(name, frozenset())) for name in _get_inames_from_domains(domains)} + if index_dtype is None: + index_dtype = np.int32 + # }}} # {{{ process assumptions @@ -381,6 +370,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): if state not in [ KernelState.INITIAL, + KernelState.CALLS_RESOLVED, KernelState.PREPROCESSED, KernelState.LINEARIZED, ]: @@ -424,7 +414,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -439,51 +428,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # }}} - # {{{ function mangling - - def mangle_function(self, identifier, arg_dtypes, ast_builder=None): - if ast_builder is None: - ast_builder = self.target.get_device_ast_builder() - - manglers = ast_builder.function_manglers() + self.function_manglers - - 
for mangler in manglers: - mangle_result = mangler(self, identifier, arg_dtypes) - if mangle_result is not None: - from loopy.kernel.data import CallMangleInfo - if isinstance(mangle_result, CallMangleInfo): - assert len(mangle_result.arg_dtypes) == len(arg_dtypes) - return mangle_result - - assert isinstance(mangle_result, tuple) - - from warnings import warn - warn("'%s' returned a tuple instead of a CallMangleInfo instance. " - "This is deprecated." % mangler.__name__, - DeprecationWarning) - - if len(mangle_result) == 2: - result_dtype, target_name = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=None) - - elif len(mangle_result) == 3: - result_dtype, target_name, actual_arg_dtypes = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=actual_arg_dtypes) - - else: - raise ValueError("unexpected size of tuple returned by '%s'" - % mangler.__name__) - - return None - - # }}} - # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): @@ -557,6 +501,21 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): except KeyError: pass + if name in self.all_inames(): + from loopy import TemporaryVariable + return TemporaryVariable( + name=name, + dtype=self.index_dtype, + shape=()) + + try: + dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), + name) + from loopy import ValueArg + return ValueArg(name, dtype) + except TypeError: + pass + raise ValueError("nothing known about variable '%s'" % name) @property @@ -1106,21 +1065,13 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. 
- - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): + """ + Returns a tuple of (global_sizes, local_sizes), where global_sizes, + local_sizes are the grid sizes accommodating all of *insn_ids*. The grid + sizes are a dict from the axis index to the corresponding grid size. """ - - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) - all_inames_by_insns = set() for insn_id in insn_ids: all_inames_by_insns |= self.insn_inames(insn_id) @@ -1131,9 +1082,38 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): % (", ".join(sorted(all_inames_by_insns)), ", ".join(sorted(self.all_inames())))) + # {{{ include grid constraints due to callees + global_sizes = {} local_sizes = {} + from loopy.kernel.instruction import CallInstruction + from loopy.symbolic import ResolvedFunction + + for insn in self.instructions: + # TODO: This might be unsafe as call-sites must be resolved to get + # any hardware axes size constraints they might impose. However, + # transforms like 'precompute' use this method and callables might + # not be resolved by then. 
+ if (isinstance(insn, CallInstruction) + and isinstance(insn.expression.function, ResolvedFunction)): + + clbl = callables_table[insn.expression.function.name] + gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_arg(), + self.assumptions.space, + callables_table) + + for tgt_dict, tgt_size in [(global_sizes, gsize), + (local_sizes, lsize)]: + + for iaxis, size in tgt_size.items(): + if iaxis in tgt_dict: + tgt_dict[iaxis] = tgt_dict[iaxis].max(size) + else: + tgt_dict[iaxis] = size + + # }}} + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1175,22 +1155,59 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): tgt_dict[tag.axis] = size - def to_dim_tuple(size_dict, which, forced_sizes={}): - forced_sizes = forced_sizes.copy() + # {{{ override local_sizes with self.local_sizes + for i_lsize, lsize in self.local_sizes.items(): + if i_lsize <= max(local_sizes.keys()): + local_sizes[i_lsize] = lsize + else: + from warnings import warn + warn(f"Forced local sizes '{i_lsize}: {lsize}' is unused" + f" because kernel '{self.name}' uses {max(local_sizes.keys())}" + " local hardware axes.") + + # }}} + + return global_sizes, local_sizes + + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False, return_dict=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + + if self.overridden_get_grid_sizes_for_insn_ids: + gsize, lsize = self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + if return_dict: + return dict(enumerate(gsize)), dict(enumerate(lsize)) + else: + return gsize, lsize + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + + if return_dict: + return global_sizes, local_sizes + + def to_dim_tuple(size_dict, which): size_list = [] sorted_axes = sorted(size_dict.keys()) - while sorted_axes or forced_sizes: + while sorted_axes: if sorted_axes: cur_axis = sorted_axes.pop(0) else: cur_axis = None - if len(size_list) in forced_sizes: - size_list.append(forced_sizes.pop(len(size_list))) - continue - assert cur_axis is not None if cur_axis > len(size_list): @@ -1202,9 +1219,11 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return tuple(size_list) return (to_dim_tuple(global_sizes, "global"), - to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + to_dim_tuple(local_sizes, "local")) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + @memoize_method + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + callables_table, ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1215,7 +1234,15 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, callables_table, ignore_auto, return_dict) + + if return_dict: + def dict_to_exprs(d): + from loopy.symbolic import pw_aff_to_expr + return {k: pw_aff_to_expr(v, int_ok=True) + for k, v in d.items()} + + return dict_to_exprs(grid_size), dict_to_exprs(group_size) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1223,7 +1250,8 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False, + return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1231,18 +1259,20 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, callables_table, + ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. *global_size* and *local_size* are :mod:`pymbolic` expressions """ - return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) # }}} @@ -1465,14 +1495,11 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): """ Execute the :class:`LoopKernel`. 
""" - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + warn("Calling a LoopKernel is deprecated, call a TranslationUnit " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.translation_unit import make_program + program = make_program(self) + return program(*args, **kwargs) # }}} @@ -1569,18 +1596,10 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # resolve hash conflicts. "preamble_generators", - "function_manglers", "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash @memoize_method def __hash__(self): @@ -1648,11 +1667,6 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return super().copy(**kwargs) - # forward compatibility with kernel callables - @property - def default_entrypoint(self): - return self - # }}} # vim: foldmethod=marker diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 9fd166ab8f15bdc97006c94c7d03977b64c08292..8fdcb1386ecd2873d8f511095cf1914e0dff292b 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -88,6 +88,9 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): :class:`ComputedStrideArrayDimTag` instances may occur. 
""" + def depends_on(self): + raise NotImplementedError() + class FixedStrideArrayDimTag(_StrideArrayDimTagBase): """An arg dimension implementation tag for a fixed (potentially @@ -145,6 +148,14 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase): return self.copy(stride=mapper(self.stride)) + def depends_on(self): + from loopy.kernel.data import auto + from loopy.symbolic import DependencyMapper + if self.stride is auto: + return frozenset() + + return DependencyMapper(composite_leaves=auto)(self.stride) + class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): """ @@ -179,6 +190,9 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -190,6 +204,9 @@ class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class VectorArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -201,6 +218,9 @@ class VectorArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + NESTING_LEVEL_RE = re.compile(r"^N([-0-9]+)(?::(.*)|)$") PADDED_STRIDE_TAG_RE = re.compile(r"^([a-zA-Z]*)\(pad=(.*)\)$") @@ -864,6 +884,7 @@ class ArrayBase(ImmutableRecord, Taggable): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, tags=tags, **kwargs) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a3a470f7f980b39c751341fa6bd99217afbe7e26..b9cf234c6cf116d569f578d5941e66b37ebc8b38 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -23,16 +23,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript, Call from loopy.tools import intern_frozenset_of_ids, Optional -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace, ValueArg) +from loopy.translation_unit import for_each_kernel from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -530,9 +532,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1080,6 +1084,9 @@ def parse_domains(domains, defines): result.append(dom) + if result == []: + result = [isl.BasicSet("{:}")] + return result # }}} @@ -1168,8 +1175,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() import loopy as lp - - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg if arg_name in self.all_params: return ValueArg(arg_name) @@ -1720,7 +1726,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions from loopy.match import MatchExpressionBase new_deps = [] @@ -1814,6 +1820,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@for_each_kernel def apply_single_writer_depencency_heuristic(kernel, 
warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -1882,9 +1889,219 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ slice to sub array ref + +def normalize_slice_params(slice, dimension_length): + """ + Returns the normalized slice parameters ``(start, stop, step)``. + + :arg slice: An instance of :class:`pymbolic.primitives.Slice`. + :arg dimension_length: Length of the axis being sliced. + """ + from pymbolic.primitives import Slice + from numbers import Integral + + assert isinstance(slice, Slice) + start, stop, step = slice.start, slice.stop, slice.step + + # {{{ defaulting parameters + + if step is None: + step = 1 + + if step == 0: + raise LoopyError("Slice cannot have 0 step size.") + + if start is None: + if step > 0: + start = 0 + else: + start = dimension_length-1 + + if stop is None: + if step > 0: + stop = dimension_length + else: + stop = -1 + + # }}} + + if not isinstance(step, Integral): + raise LoopyError("Non-integral step sizes lead to non-affine domains =>" + " not supported") + + return start, stop, step + + +class SliceToInameReplacer(IdentityMapper): + """ + Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. + + .. attribute:: var_name_gen + + Variable name generator, in order to generate unique inames within the + kernel domain. + + .. attribute:: knl + + An instance of :class:`loopy.LoopKernel` + + .. attribute:: subarray_ref_bounds + + A :class:`list` (one entry for each :class:`SubArrayRef` to be created) + of :class:`dict` instances to store the slices enountered in the + expressions as a mapping from ``iname`` to a tuple of ``(start, stop, + step)``, which describes the boxy (i.e. affine) constraints imposed on + the ``iname`` by the corresponding slice notation its intended to + replace. 
+ """ + def __init__(self, knl): + self.subarray_ref_bounds = [] + self.knl = knl + self.var_name_gen = knl.get_var_name_generator() + + def map_subscript(self, expr): + subscript_iname_bounds = {} + + new_index = [] + swept_inames = [] + for i, index in enumerate(expr.index_tuple): + if isinstance(index, Slice): + unique_var_name = self.var_name_gen(based_on="i") + if expr.aggregate.name in self.knl.arg_dict: + shape = self.knl.arg_dict[expr.aggregate.name].shape + else: + assert expr.aggregate.name in self.knl.temporary_variables + shape = self.knl.temporary_variables[ + expr.aggregate.name].shape + if shape is None or shape[i] is None: + raise LoopyError("Slice notation is only supported for " + "variables whose shapes are known at creation time " + "-- maybe add the shape for '{}'.".format( + expr.aggregate.name)) + + domain_length = shape[i] + start, stop, step = normalize_slice_params(index, domain_length) + subscript_iname_bounds[unique_var_name] = (start, stop, step) + new_index.append(start+step*Variable(unique_var_name)) + swept_inames.append(Variable(unique_var_name)) + else: + new_index.append(index) + + if swept_inames: + self.subarray_ref_bounds.append(subscript_iname_bounds) + result = SubArrayRef(tuple(swept_inames), Subscript( + self.rec(expr.aggregate), + self.rec(tuple(new_index)))) + else: + result = super().map_subscript(expr) + + return result + + def map_call(self, expr): + + def _convert_array_to_slices(arg): + # FIXME: We do not support something like A[1] should point to the + # second row if 'A' is 3 x 3 array. + if isinstance(arg, Variable): + from loopy.kernel.data import auto + if (arg.name in self.knl.temporary_variables): + if self.knl.temporary_variables[arg.name].shape in [ + auto, None]: + # do not convert arrays with unknown shapes to slices. + # (If an array of unknown shape was passed in error, will be + # caught and raised during preprocessing). 
+ array_arg_shape = () + else: + array_arg_shape = ( + self.knl.temporary_variables[arg.name].shape) + elif arg.name in self.knl.arg_dict: + if isinstance(self.knl.arg_dict[arg.name], ValueArg): + array_arg_shape = () + else: + + if self.knl.arg_dict[arg.name].shape in [ + auto, None]: + # do not convert arrays with unknown shapes to slices. + # (If an array of unknown shape was passed in error, will + # be caught and raised during preprocessing). + array_arg_shape = () + else: + array_arg_shape = ( + self.knl.arg_dict[arg.name].shape) + else: + # arg could be either an iname or a "mangled symbol" + array_arg_shape = () + + if array_arg_shape != (): + return Subscript(arg, tuple(Slice(()) + for _ in array_arg_shape)) + return arg + + return Call(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) + for par in expr.parameters)) + + def map_call_with_kwargs(self, expr): + # See: https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + + def get_iname_domain_as_isl_set(self): + """ + Returns the extra domain constraints imposed by the slice inames, + recorded in :attr:`iname_domains`. 
+ """ + subarray_ref_domains = [] + for sar_bounds in self.subarray_ref_bounds: + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(sar_bounds.keys())) + from loopy.symbolic import get_dependencies + args_as_params_for_domains = set() + for slice_ in sar_bounds.values(): + args_as_params_for_domains |= get_dependencies(slice_) + + space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_name(dim_type.param, i, arg) + + iname_set = isl.BasicSet.universe(space) + + from loopy.isl_helpers import make_slab + for iname, (start, stop, step) in sar_bounds.items(): + if step > 0: + iname_set = iname_set & make_slab(space, iname, 0, + stop-start, step) + else: + iname_set = iname_set & make_slab(space, iname, 0, + start-stop, -step) + + subarray_ref_domains.append(iname_set) + + return subarray_ref_domains + + +def realize_slices_array_inputs_as_sub_array_refs(kernel): + """ + Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` + encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. + """ + slice_replacer = SliceToInameReplacer(kernel) + new_insns = [insn.with_transformed_expressions(slice_replacer) + for insn in kernel.instructions] + + return kernel.copy( + domains=( + kernel.domains + + slice_replacer.get_iname_domain_as_isl_set()), + instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level -def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): +def make_function(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. :arg domains: @@ -1938,9 +2155,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg default_offset: 0 or :class:`loopy.auto`. The default value of *offset* in :attr:`ArrayArg` for guessed arguments. Defaults to 0. 
- :arg function_manglers: list of functions of signature - ``(target, name, arg_dtypes)`` - returning a :class:`loopy.CallMangleInfo`. :arg symbol_manglers: list of functions of signature (name) returning a tuple (result_dtype, c_name), where c_name is the C-level symbol to be evaluated. @@ -2047,7 +2261,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # This *is* gross. But it seems like the right thing interface-wise. import inspect - caller_globals = inspect.currentframe().f_back.f_globals + if inspect.currentframe().f_back.f_code.co_name == "make_kernel": + # if caller is "make_kernel", read globals from make_kernel's caller + caller_globals = inspect.currentframe().f_back.f_back.f_globals + else: + caller_globals = inspect.currentframe().f_back.f_globals for ver_sym in LANGUAGE_VERSION_SYMBOLS: try: @@ -2064,7 +2282,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import ( MOST_RECENT_LANGUAGE_VERSION, FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " + warn("'lang_version' was not passed to make_function(). " "To avoid this warning, pass " "lang_version={ver} in this invocation. 
" "(Or say 'from loopy.version import " @@ -2184,6 +2402,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + + # convert slices to iname domains + knl = realize_slices_array_inputs_as_sub_array_refs(knl) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2221,15 +2443,25 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_args_are_input_output + knl = infer_args_are_input_output(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.translation_unit import make_program + return make_program(knl) + + +def make_kernel(*args, **kwargs): + tunit = make_function(*args, **kwargs) + name, = [name for name in tunit.callables_table] + return tunit.with_entrypoints(name) + - return knl +make_kernel.__doc__ = make_function.__doc__ # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 620211cf29b464e297119ede597bb1abadaff193..8d0f05daa6bdbc5c5a84a53cc6756e4abf3dd86c 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -68,6 +68,8 @@ __doc__ = """ .. autoclass:: UnrollTag .. autoclass:: Iname + +.. autoclass:: KernelArgument """ @@ -363,6 +365,8 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) ImmutableRecord.__init__(self, **kwargs) @@ -375,21 +379,39 @@ class ArrayArg(ArrayBase, KernelArgument): An attribute of :class:`AddressSpace` defining the address space in which the array resides. - .. 
attribute:: is_output_only + .. attribute:: is_output + + An instance of :class:`bool`. If set to *True*, the array is used to + return information to the caller. If set to *False*, the callee does not + write to the array during a call. - An instance of :class:`bool`. If set to *True*, recorded to be - returned from the kernel. + .. attribute:: is_input + + An instance of :class:`bool`. If set to *True*, expected to be provided + by the caller. If *False*, the callee does not depend on the array + at kernel entry. """) allowed_extra_kwargs = [ "address_space", - "is_output_only", + "is_output", + "is_input", "tags"] def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + + is_output_only = kwargs.pop("is_output_only", None) + if is_output_only is not None: + warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'" + " instead.", DeprecationWarning, stacklevel=2) + kwargs["is_output"] = is_output_only + kwargs["is_input"] = not is_output_only + else: + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) + super().__init__(*args, **kwargs) min_target_axes = 0 @@ -416,7 +438,8 @@ class ArrayArg(ArrayBase, KernelArgument): """ super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) - key_builder.rec(key_hash, self.is_output_only) + key_builder.rec(key_hash, self.is_output) + key_builder.rec(key_hash, self.is_input) # Making this a function prevents incorrect use in isinstance. 
@@ -433,6 +456,17 @@ def GlobalArg(*args, **kwargs): class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + + def __init__(self, *args, **kwargs): + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") + super().__init__(*args, **kwargs) + + # Constant Arg cannot be an output + is_output = False + is_input = True + address_space = AddressSpace.GLOBAL + min_target_axes = 0 max_target_axes = 1 @@ -443,9 +477,20 @@ class ConstantArg(ArrayBase, KernelArgument): class ImageArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + + def __init__(self, *args, **kwargs): + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ImageArg must be GLOBAL.") + super().__init__(*args, **kwargs) + min_target_axes = 1 max_target_axes = 3 + # ImageArg cannot be an output (for now) + is_output = False + is_input = True + address_space = AddressSpace.GLOBAL + @property def dimensions(self): return len(self.dim_tags) @@ -466,7 +511,7 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument, Taggable): def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output_only=False, tags=None): + is_output=False, is_input=True, tags=None): """ :arg tags: A an instance of or Iterable of instances of :class:`pytools.tag.Tag` intended for consumption by an @@ -477,7 +522,9 @@ class ValueArg(KernelArgument, Taggable): dtype=dtype, approximately=approximately, target=target, - is_output_only=is_output_only, tags=tags) + is_output=is_output, + is_input=is_input, + tags=tags) def __str__(self): import loopy as lp diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..55a38f3e820d62f69b1264fce97f0679363240d4 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,990 @@ 
+__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase +from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import DependencyMapper, WalkMapper + +__doc__ = """ +.. currentmodule:: loopy.kernel.function_interface + +.. autoclass:: ValueArgDescriptor + +.. autoclass:: ArrayArgDescriptor + +.. autoclass:: InKernelCallable + +.. autoclass:: CallableKernel + +.. 
autoclass:: ScalarCallable +""" + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + def map_expr(self, subst_mapper): + return self.copy() + + def depends_on(self): + return frozenset() + + update_persistent_hash = update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable. To be + passed to and returned from + :meth:`InKernelCallable.with_descrs`, used for + matching shape and address space of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.AddressSpace`. + + .. attribute:: dim_tags + + A tuple of instances of + :class:`loopy.kernel.array.ArrayDimImplementationTag` + + .. automethod:: map_expr + .. automethod:: depends_on + """ + + fields = {"shape", "address_space", "dim_tags"} + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import ArrayDimImplementationTag + from loopy.kernel.data import auto + + assert isinstance(shape, tuple) or shape in [None, auto] + assert isinstance(dim_tags, tuple) or dim_tags is None + + if dim_tags: + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in + dim_tags) + + # }}} + + super().__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + def map_expr(self, f): + """ + Returns an instance of :class:`ArrayArgDescriptor` with its shapes, strides, + mapped by *f*. 
+ """ + if self.shape is not None: + new_shape = tuple(f(axis_len) for axis_len in self.shape) + else: + new_shape = None + + if self.dim_tags is not None: + new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) + else: + new_dim_tags = None + + return self.copy(shape=new_shape, dim_tags=new_dim_tags) + + def depends_on(self): + """ + Returns class:`frozenset` of all the variable names the + :class:`ArrayArgDescriptor` depends on. + """ + from loopy.kernel.data import auto + result = set() + + if self.shape: + dep_mapper = DependencyMapper(composite_leaves=False) + for axis_len in self.shape: + if axis_len not in [None, auto]: + result |= dep_mapper(axis_len) + + if self.dim_tags: + for dim_tag in self.dim_tags: + result |= dim_tag.depends_on() + + return frozenset(var.name for var in result) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.shape) + key_builder.rec(key_hash, self.address_space) + key_builder.rec(key_hash, self.dim_tags) + + +class ExpressionIsScalarChecker(WalkMapper): + def __init__(self, kernel): + self.kernel = kernel + + def map_sub_array_ref(self, expr): + raise LoopyError("Sub-array refs can only be used as call's parameters" + f" or assignees. 
'{expr}' violates this.") + + def map_call(self, expr): + self.rec(expr.parameters) + + def map_subscript(self, expr): + for child in expr.index_tuple: + self.rec(child) + + def map_variable(self, expr): + from loopy.kernel.data import TemporaryVariable, ArrayArg, auto + if expr.name in self.kernel.all_inames(): + # inames are scalar + return + + var = self.kernel.arg_dict.get(expr.name, None) or ( + self.kernel.temporary_variables.get(expr.name, None)) + + if var is not None: + if isinstance(var, (ArrayArg, TemporaryVariable)) and ( + var.shape != () and var.shape is not auto): + raise LoopyError("Array regions can only passed as sub-array refs.") + + def map_slice(self, expr): + raise LoopyError("Array regions can only passed as sub-array refs.") + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + + +def get_arg_descriptor_for_expression(kernel, expr): + """ + :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` + describing the argument expression *expr* which occurs + in a call in the code of *kernel*. 
+ """ + from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, + SweptInameStrideCollector) + from loopy.kernel.data import TemporaryVariable, ArrayArg + + if isinstance(expr, SubArrayRef): + name = expr.subscript.aggregate.name + arg = kernel.get_var_descriptor(name) + + if not isinstance(arg, (TemporaryVariable, ArrayArg)): + raise LoopyError("unsupported argument type " + "'%s' of '%s' in call statement" + % (type(arg).__name__, expr.name)) + + aspace = arg.address_space + + from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + sub_dim_tags = [] + sub_shape = [] + + # This helps in identifying identities like + # "2*(i//2) + i%2" := "i" + # See the kernel in + # test_callables.py::test_shape_translation_through_sub_array_refs + + from loopy.symbolic import simplify_using_aff + linearized_index = simplify_using_aff( + kernel, + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, expr.subscript.index_tuple))) + + strides_as_dict = SweptInameStrideCollector( + tuple(iname.name for iname in expr.swept_inames) + )(linearized_index) + sub_dim_tags = tuple( + # Not all swept inames necessarily occur in the expression. + DimTag(strides_as_dict.get(iname, 0)) + for iname in expr.swept_inames) + sub_shape = tuple( + pw_aff_to_expr( + kernel.get_iname_bounds(iname.name).upper_bound_pw_aff + - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1 + for iname in expr.swept_inames) + + return ArrayArgDescriptor( + address_space=aspace, + dim_tags=sub_dim_tags, + shape=sub_shape) + else: + ExpressionIsScalarChecker(kernel)(expr) + return ValueArgDescriptor() + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. 
+ """ + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if arg.is_output: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + if arg.is_input: + # if an argument is both input and output then kw_to_pos is + # overwritten with its expected position in the parameters + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + + return kw_to_pos, pos_to_kw + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within expressions in + a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types of the + callable. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. + + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen + .. automethod:: get_hw_axes_sizes + .. automethod:: get_used_hw_axes + .. automethod:: get_called_callables + .. automethod:: with_name + .. automethod:: is_type_specialized + + .. note:: + + * "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. + + * Negative "arg_id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. 
+ + """ + + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): + + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + update_persistent_hash = update_persistent_hash + + def with_types(self, arg_id_to_dtype, clbl_inf_ctx): + """ + :arg arg_id_to_type: a mapping from argument identifiers (integers for + positional arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the first + returned value identified as *-1*. + + :arg clbl_inf_ctx: An instance of + :class:`loopy.translation_unit.CallablesInferenceContext`. *clbl_inf_ctx* + provides the namespace of other callables contained within *self*. + + :returns: a tuple ``(new_self, new_clbl_inf_ctx)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types. + *new_clbl_inf_ctx* is *clbl_inf_ctx*'s updated state if the + type-specialization of *self* updated other calls contained within + it. + + .. note:: + + If the :class:`InKernelCallable` does not contain any + other callables within it, then *clbl_inf_ctx* is returned as is. + """ + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr, clbl_inf_ctx): + """ + :arg arg_id_to_descr: a mapping from argument identifiers (integers for + positional arguments) to instances of :class:`ArrayArgDescriptor` + or :class:`ValueArgDescriptor`. Unspecified/unknown descriptors are + not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the first + returned value identified as *-1*. + + :arg clbl_inf_ctx: An instance of + :class:`loopy.translation_unit.CallablesInferenceContext`. *clbl_inf_ctx* + provides the namespace of other callables contained within *self*. 
+ + :returns: a tuple ``(new_self, new_clbl_inf_ctx)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given argument + descriptors. *new_clbl_inf_ctx* is the *clbl_inf_ctx*'s updated state + if descriptor-specialization of *self* updated other calls contained + within it. + + .. note:: + + If the :class:`InKernelCallable` does not contain any + other callables within it, then *clbl_inf_ctx* is returned as is. + """ + + raise NotImplementedError() + + def with_target(self, target): + """ + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise LoopyError("target cannot be None for with_target") + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype is not None: + new_arg_id_to_dtype = {id: with_target_if_not_None(dtype) + for id, dtype in self.arg_id_to_dtype.items()} + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table): + """ + Returns ``gsizes, lsizes``, where *gsizes* and *lsizes* are mappings + from axis indices to corresponding group or local hw axis sizes. The hw + axes sizes are represented as instances of :class:`islpy.PwAff` on the + given *space*. + + :arg arg_id_to_arg: A mapping from the passed argument *id* to the + arguments at a call-site. + :arg space: An instance of :class:`islpy.Space`. 
+ """ + raise NotImplementedError + + def get_used_hw_axes(self, callables_table): + """ + Returns a tuple ``group_axes_used, local_axes_used``, where + ``(group|local)_axes_used`` are :class:`frozenset` of hardware axes + indices used by the callable. + """ + raise NotImplementedError + + def generate_preambles(self, target): + """ + Yields the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. ``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ + + raise NotImplementedError() + + def __hash__(self): + return hash(self.hash_fields) + + def with_added_arg(self, arg_dtype, arg_descr): + """ + Registers a new argument to the callable and returns the name of the + argument in the callable's namespace. + """ + raise NotImplementedError() + + def get_called_callables(self, callables_table, recursive=True): + """ + Returns a :class:`frozenset` of callable ids called by *self* that are + resolved via *callables_table*. + + :arg callables_table: Similar to + :attr:`loopy.TranslationUnit.callables_table`. + :arg recursive: If *True* recursively searches for all the called + callables, else only returns the callables directly called by + *self*. 
+ """ + raise NotImplementedError + + def with_name(self, name): + """ + Returns a copy of *self* so that it could be referred by *name* + in a :attr:`loopy.TranslationUnit.callables_table`'s namespace. + """ + raise NotImplementedError + + def is_type_specialized(self): + """ + Returns *True* iff *self*'s type signature is known, else returns + *False*. + """ + raise NotImplementedError + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstract interface the to a scalar callable encountered in a kernel. + + .. attribute:: name_in_target + + A :class:`str` to denote the name of the function in a + :class:`loopy.target.TargetBase` for which the callable is specialized. + *None* if the callable is not specialized enough to know its name + in target. + + .. automethod:: with_types + + .. automethod:: with_descrs + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the function and sub-classes must define it. + """ + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} + hash_fields = InKernelCallable.hash_fields + ("name_in_target",) + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + self.name_in_target = name_in_target + + def with_types(self, arg_id_to_dtype, callables_table): + raise LoopyError("No type inference information present for " + "the function %s." 
% (self.name)) + + def with_descrs(self, arg_id_to_descr, clbl_inf_ctx): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return (self.copy(arg_id_to_descr=arg_id_to_descr), + clbl_inf_ctx) + + def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table): + return {}, {} + + def get_used_hw_axes(self, callables_table): + return frozenset(), frozenset() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + + :returns: A tuple of the call to be generated and an instance of + :class:`bool` whether the first assignee is a part of the LHS in + the assignment instruction. + + .. 
note:: + + The default implementation returns the first assignees and the + references of the rest of the assignees are appended to the + arguments of the call. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + """ + from loopy.target.c import CFamilyTarget + if not isinstance(target, CFamilyTarget): + raise NotImplementedError() + + from loopy.kernel.instruction import CallInstruction + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + assert isinstance(insn, CallInstruction) + assert self.is_ready_for_codegen() + + ecm = expression_to_code_mapper + parameters = insn.expression.parameters + assignees = insn.assignees[1:] + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) + for par in parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] + for i, _ in enumerate(parameters)) + + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] + for i, _ in enumerate(assignees)) + + tgt_parameters = [ecm(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip(parameters, + par_dtypes, + arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismatch in function %s. Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + tgt_parameters.append(var("&")(ecm(a, PREC_NONE, + dtype_to_type_context(target, + tgt_dtype), + tgt_dtype).expr)) + + # assignee is returned whenever the size of assignees is non zero. 
+ first_assignee_is_returned = len(insn.assignees) > 0 + + return var(self.name_in_target)(*tgt_parameters), first_assignee_is_returned + + def generate_preambles(self, target): + return + yield + + # }}} + + def with_added_arg(self, arg_dtype, arg_descr): + raise LoopyError("Cannot add args to scalar callables.") + + def get_called_callables(self, callables_table, recursive=True): + """ + Returns a :class:`frozenset` of callable ids called by *self*. + """ + return frozenset() + + def with_name(self, name): + return self + + def is_type_specialized(self): + return (self.arg_id_to_dtype is not None + and all(dtype is not None + for dtype in self.arg_id_to_dtype.values())) + +# }}} + + +# {{{ callable kernel + +class CallableKernel(InKernelCallable): + """ + Records informations about a callee kernel. Also provides interface through + member methods to make the callee kernel compatible to be called from a + caller kernel. + + :meth:`CallableKernel.with_types` should be called in order to match + the ``dtypes`` of the arguments that are shared between the caller and the + callee kernel. + + :meth:`CallableKernel.with_descrs` should be called in order to match + the arguments' shapes/strides across the caller and the callee kernel. + + .. attribute:: subkernel + + :class:`~loopy.LoopKernel` which is being called. + + .. automethod:: with_descrs + .. 
    """

    fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"}
    hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr")

    def __init__(self, subkernel, arg_id_to_dtype=None,
            arg_id_to_descr=None):
        assert isinstance(subkernel, LoopKernel)
        super().__init__(name=subkernel.name,
                arg_id_to_dtype=arg_id_to_dtype,
                arg_id_to_descr=arg_id_to_descr)
        self.subkernel = subkernel

    def copy(self, subkernel=None, arg_id_to_dtype=None,
            arg_id_to_descr=None):
        if subkernel is None:
            subkernel = self.subkernel
        if arg_id_to_descr is None:
            arg_id_to_descr = self.arg_id_to_descr
        if arg_id_to_dtype is None:
            arg_id_to_dtype = self.arg_id_to_dtype

        return CallableKernel(subkernel, arg_id_to_dtype, arg_id_to_descr)

    def with_types(self, arg_id_to_dtype, callables_table):
        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)

        new_args = []
        for arg in self.subkernel.args:
            kw = arg.name
            if kw in arg_id_to_dtype:
                # id exists as kw
                new_args.append(arg.copy(dtype=arg_id_to_dtype[kw]))
            elif kw_to_pos[kw] in arg_id_to_dtype:
                # id exists as positional argument
                new_args.append(arg.copy(
                    dtype=arg_id_to_dtype[kw_to_pos[kw]]))
            else:
                new_args.append(arg)

        from loopy.type_inference import (
                infer_unknown_types_for_a_single_kernel)
        pre_specialized_subkernel = self.subkernel.copy(
                args=new_args)

        # infer the types of the written variables based on the knowledge
        # of the types of the arguments supplied
        specialized_kernel, callables_table = (
                infer_unknown_types_for_a_single_kernel(
                    pre_specialized_subkernel,
                    callables_table))

        new_arg_id_to_dtype = {}
        for pos, kw in pos_to_kw.items():
            arg = specialized_kernel.arg_dict[kw]
            if arg.dtype:
                new_arg_id_to_dtype[kw] = arg.dtype
                new_arg_id_to_dtype[pos] = arg.dtype

        # Return the kernel call with specialized subkernel and the
        # corresponding new arg_id_to_dtype
        return self.copy(subkernel=specialized_kernel,
                arg_id_to_dtype=new_arg_id_to_dtype), callables_table

    def with_descrs(self, arg_id_to_descr, clbl_inf_ctx):

        # arg_id_to_descr expressions provided are from the caller's
        # namespace, need to register

        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)

        kw_to_callee_idx = {arg.name: i
                for i, arg in enumerate(self.subkernel.args)}

        new_args = self.subkernel.args[:]

        for arg_id, descr in arg_id_to_descr.items():
            if isinstance(arg_id, int):
                arg_id = pos_to_kw[arg_id]

            callee_arg = new_args[kw_to_callee_idx[arg_id]]

            # {{{ checks

            if isinstance(callee_arg, ValueArg) and (
                    isinstance(descr, ArrayArgDescriptor)):
                raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' "
                        "expected to be a scalar, got an array region.")

            if isinstance(callee_arg, ArrayArg) and (
                    isinstance(descr, ValueArgDescriptor)):
                raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' "
                        "expected to be an array, got a scalar.")

            if (isinstance(descr, ArrayArgDescriptor)
                    and isinstance(callee_arg.shape, tuple)
                    and len(callee_arg.shape) != len(descr.shape)):
                raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}'"
                        " has a dimensionality mismatch, expected "
                        f"{len(callee_arg.shape)}, got {len(descr.shape)}")

            # }}}

            if isinstance(descr, ArrayArgDescriptor):
                callee_arg = callee_arg.copy(shape=descr.shape,
                        dim_tags=descr.dim_tags,
                        address_space=descr.address_space)
            else:
                # do nothing for a scalar arg.
                assert isinstance(descr, ValueArgDescriptor)

            new_args[kw_to_callee_idx[arg_id]] = callee_arg

        subkernel = self.subkernel.copy(args=new_args)

        from loopy.preprocess import traverse_to_infer_arg_descr
        subkernel, clbl_inf_ctx = traverse_to_infer_arg_descr(subkernel,
                clbl_inf_ctx)

        # {{{ update the arg descriptors

        for arg in subkernel.args:
            kw = arg.name
            if isinstance(arg, ArrayBase):
                arg_id_to_descr[kw] = (
                        ArrayArgDescriptor(shape=arg.shape,
                            dim_tags=arg.dim_tags,
                            address_space=arg.address_space))
            else:
                assert isinstance(arg, ValueArg)
                arg_id_to_descr[kw] = ValueArgDescriptor()

            arg_id_to_descr[kw_to_pos[kw]] = arg_id_to_descr[kw]

        # }}}

        return (self.copy(subkernel=subkernel,
                arg_id_to_descr=arg_id_to_descr),
                clbl_inf_ctx)

    def with_added_arg(self, arg_dtype, arg_descr):
        var_name = self.subkernel.get_var_name_generator()(based_on="_lpy_arg")

        if isinstance(arg_descr, ValueArgDescriptor):
            subknl = self.subkernel.copy(
                    args=self.subkernel.args+[
                        ValueArg(var_name, arg_dtype, self.subkernel.target)])

            kw_to_pos, pos_to_kw = get_kw_pos_association(subknl)

            if self.arg_id_to_dtype is None:
                arg_id_to_dtype = {}
            else:
                arg_id_to_dtype = self.arg_id_to_dtype.copy()
            if self.arg_id_to_descr is None:
                arg_id_to_descr = {}
            else:
                arg_id_to_descr = self.arg_id_to_descr.copy()

            arg_id_to_dtype[var_name] = arg_dtype
            arg_id_to_descr[var_name] = arg_descr
            arg_id_to_dtype[kw_to_pos[var_name]] = arg_dtype
            arg_id_to_descr[kw_to_pos[var_name]] = arg_descr

            return (self.copy(subkernel=subknl,
                    arg_id_to_dtype=arg_id_to_dtype,
                    arg_id_to_descr=arg_id_to_descr),
                    var_name)

        else:
            # don't think this should ever be needed
            raise NotImplementedError("with_added_arg not implemented for array"
                    " types arguments.")

    def with_packing_for_args(self):
        from loopy.kernel.data import AddressSpace
        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)

        arg_id_to_descr = {}

        for pos, kw in pos_to_kw.items():
            arg = self.subkernel.arg_dict[kw]
            arg_id_to_descr[pos] = ArrayArgDescriptor(
                    shape=arg.shape,
                    dim_tags=arg.dim_tags,
                    address_space=AddressSpace.GLOBAL)

        return self.copy(subkernel=self.subkernel,
                arg_id_to_descr=arg_id_to_descr)

    def get_used_hw_axes(self, callables_table):
        gsize, lsize = self.subkernel.get_grid_size_upper_bounds(
                callables_table,
                return_dict=True)

        return frozenset(gsize.keys()), frozenset(lsize.keys())

    def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table):
        from loopy.isl_helpers import subst_into_pwaff
        _, pos_to_kw = get_kw_pos_association(self.subkernel)
        gsize, lsize = self.subkernel.get_grid_size_upper_bounds(
                callables_table,
                return_dict=True)

        subst_dict = {i: val
                for i, val in arg_id_to_arg.items()
                if isinstance(self.subkernel.arg_dict[pos_to_kw[i]],
                    ValueArg)}

        gsize = {iaxis: subst_into_pwaff(space, size, subst_dict)
                for iaxis, size in gsize.items()}
        lsize = {iaxis: subst_into_pwaff(space, size, subst_dict)
                for iaxis, size in lsize.items()}

        return gsize, lsize

    def is_ready_for_codegen(self):
        return (self.arg_id_to_dtype is not None
                and self.arg_id_to_descr is not None)

    def generate_preambles(self, target):
        """ Yields the *target* specific preambles.
        """
        return
        yield

    def emit_call_insn(self, insn, target, expression_to_code_mapper):
        from loopy.target.c import CFamilyTarget
        if not isinstance(target, CFamilyTarget):
            raise NotImplementedError()

        from loopy.kernel.instruction import CallInstruction

        assert self.is_ready_for_codegen()
        assert isinstance(insn, CallInstruction)

        ecm = expression_to_code_mapper
        parameters = insn.expression.parameters
        assignees = insn.assignees

        parameters = list(parameters)
        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)

        # insert the assignees at the required positions
        assignee_write_count = -1
        for i, arg in enumerate(self.subkernel.args):
            if arg.is_output:
                if not arg.is_input:
                    assignee = assignees[-assignee_write_count-1]
                    parameters.insert(i, assignee)
                    par_dtypes.insert(i, self.arg_id_to_dtype[
                        assignee_write_count])

                assignee_write_count -= 1

        # no type casting in array calls
        from loopy.expression import dtype_to_type_context
        from pymbolic.mapper.stringifier import PREC_NONE
        from pymbolic import var

        tgt_parameters = [ecm(par, PREC_NONE, dtype_to_type_context(target,
                par_dtype),
                par_dtype).expr
                for par, par_dtype in zip(parameters, par_dtypes)]

        return var(self.subkernel.name)(*tgt_parameters), False

    def get_called_callables(self, callables_table, recursive=True):
        from loopy.kernel.tools import get_resolved_callable_ids_called_by_knl
        return get_resolved_callable_ids_called_by_knl(self.subkernel,
                callables_table,
                recursive=recursive)

    def with_name(self, name):
        new_knl = self.subkernel.copy(name=name)
        return self.copy(subkernel=new_knl)

    def is_type_specialized(self):
        from loopy.kernel.data import auto
        return (self.arg_id_to_dtype is not None
                and all(arg.dtype not in [None, auto]
                    for arg in self.subkernel.args)
                and all(tv.dtype not in [None, auto]
                    for tv in
self.subkernel.temporary_variables.values())) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 5c525da06275afc281096139e11edf148dded3f6..e561dd0305df0adfb584be78cb19dd386252f51c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -324,6 +324,9 @@ class InstructionBase(ImmutableRecord, Taggable): def reduction_inames(self): raise NotImplementedError + def sub_array_ref_inames(self): + raise NotImplementedError + def assignee_var_names(self): """Return a tuple of assignee variable names, one for each quantity being assigned to. @@ -469,7 +472,7 @@ class InstructionBase(ImmutableRecord, Taggable): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -488,13 +491,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -505,6 +515,9 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.subscript.index) - ( + frozenset(iname.name for iname in expr.swept_inames)) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -799,6 +812,11 @@ class MultiAssignmentBase(InstructionBase): from loopy.symbolic import 
get_reduction_inames return get_reduction_inames(self.expression) + @memoize_method + def sub_array_ref_inames(self): + from loopy.symbolic import get_sub_array_ref_swept_inames + return get_sub_array_ref_swept_inames((self.assignees, self.expression)) + # }}} @@ -1037,7 +1055,8 @@ class CallInstruction(MultiAssignmentBase): from pymbolic.primitives import Call from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + if not isinstance(expression, (Call, Reduction)) and ( + expression is not None): raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -1053,9 +1072,10 @@ class CallInstruction(MultiAssignmentBase): expression = parse(expression) from pymbolic.primitives import Variable, Subscript - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef for assignee in assignees: - if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + if not isinstance(assignee, (Variable, Subscript, LinearSubscript, + SubArrayRef)): raise LoopyError("invalid lvalue '%s'" % assignee) self.assignees = assignees @@ -1124,6 +1144,17 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result + def arg_id_to_arg(self): + """:returns: a :class:`dict` mapping argument identifiers (non-negative numbers + for positional arguments and negative numbers + for assignees) to their respective values + """ + arg_id_to_arg = dict(enumerate(self.expression.parameters)) + for i, arg in enumerate(self.assignees): + arg_id_to_arg[-i-1] = arg + + return arg_id_to_arg + @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment @@ -1134,34 +1165,118 @@ class CallInstruction(MultiAssignmentBase): # }}} +def subscript_contains_slice(subscript): + """Return *True* if the *subscript* contains an instance of + 
:class:`pymbolic.primitives.Slice` as of its indices. + """ + from pymbolic.primitives import Subscript, Slice + assert isinstance(subscript, Subscript) + return any(isinstance(index, Slice) for index in subscript.index_tuple) + + +def is_array_call(assignees, expression): + """ + Returns *True* is the instruction is an array call. + + An array call is a function call applied to array type objects. If any of + the arguemnts or assignees to the function is an array, + :meth:`is_array_call` will return *True*. + """ + from pymbolic.primitives import Call, Subscript + from loopy.symbolic import SubArrayRef + + if not isinstance(expression, Call): + return False + + for par in expression.parameters+assignees: + if isinstance(par, SubArrayRef): + return True + elif isinstance(par, Subscript): + if subscript_contains_slice(par): + return True + + # did not encounter SubArrayRef/Slice, hence must be a normal call + return False + + +def modify_assignee_for_array_call(assignee): + """ + Converts the assignee subscript or variable as a SubArrayRef. 
+ """ + from pymbolic.primitives import Subscript, Variable + from loopy.symbolic import SubArrayRef + if isinstance(assignee, SubArrayRef): + return assignee + elif isinstance(assignee, Subscript): + if subscript_contains_slice(assignee): + # Slice subscripted array are treated as SubArrayRef in the kernel + # Hence, making the behavior similar to that of `SubArrayref` + return assignee + else: + return SubArrayRef((), assignee) + elif isinstance(assignee, Variable): + return SubArrayRef((), Subscript(assignee, 0)) + else: + raise LoopyError("ArrayCall only takes Variable, Subscript or " + "SubArrayRef as its inputs") + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): + if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - if len(assignees) == 1: + if len(assignees) != 1 or is_array_call(assignees, expression): + atomicity = kwargs.pop("atomicity", ()) + if atomicity: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + + from pymbolic.primitives import Call + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction, got: '%s'" % expression) + + if not is_array_call(assignees, expression): + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + # In the case of an array call, it is important to have each + # assignee as an instance of SubArrayRef. 
If not given as a + # SubArrayRef + return CallInstruction( + assignees=tuple(modify_assignee_for_array_call( + assignee) for assignee in assignees), + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + def _is_array(expr): + from loopy.symbolic import SubArrayRef + from pymbolic.primitives import (Subscript, Slice) + if isinstance(expr, SubArrayRef): + return True + if isinstance(expr, Subscript): + return any(isinstance(idx, Slice) for idx in + expr.index_tuple) + return False + + from loopy.symbolic import DependencyMapper + if any(_is_array(dep) for dep in DependencyMapper()((assignees, + expression))): + raise LoopyError("Array calls only supported as instructions" + " with function call as RHS for now.") + return Assignment( assignee=assignees[0], expression=expression, temp_var_type=temp_var_types[0], **kwargs) - atomicity = kwargs.pop("atomicity", ()) - if atomicity: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) - # {{{ c instruction @@ -1290,6 +1405,9 @@ class CInstruction(InstructionBase): def reduction_inames(self): return set() + def sub_array_ref_inames(self): + return frozenset() + def assignee_var_names(self): return tuple(_get_assignee_var_name(expr) for expr in self.assignees) @@ -1337,6 +1455,9 @@ class _DataObliviousInstruction(InstructionBase): def reduction_inames(self): return frozenset() + def sub_array_ref_inames(self): + return frozenset() + def assignee_var_names(self): return frozenset() diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 
f660ed4e23d57555817a7a7a4ddb7a7fce6ab95b..2b859c99ef79f00f2474a66a44600c3123c975bf 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -32,19 +32,42 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted - +from loopy.kernel import LoopKernel +from loopy.translation_unit import (TranslationUnit, + for_each_kernel) +from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import CombineMapper +from functools import reduce import logging logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(kernel, dtype_dict): +def add_dtypes(prog_or_kernel, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ + if isinstance(prog_or_kernel, TranslationUnit): + kernel_names = [clbl.subkernel.name for clbl in + prog_or_kernel.callables_table.values() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError("add_dtypes may not take a TranslationUnit with more" + " than one callable kernels. 
Please provide individual kernels" + " instead.") + + kernel_name, = kernel_names + + return prog_or_kernel.with_kernel( + add_dtypes(prog_or_kernel[kernel_name], dtype_dict)) + + assert isinstance(prog_or_kernel, LoopKernel) + processed_dtype_dict = {} for k, v in dtype_dict.items(): @@ -54,13 +77,13 @@ def add_dtypes(kernel, dtype_dict): processed_dtype_dict[subkey] = v dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( - kernel, processed_dtype_dict) + prog_or_kernel, processed_dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - return kernel.copy(args=new_args, temporary_variables=new_temp_vars) + return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars) def _add_dtypes_overdetermined(kernel, dtype_dict): @@ -112,11 +135,22 @@ def get_arguments_with_incomplete_dtype(kernel): if arg.dtype is None] -def add_and_infer_dtypes(kernel, dtype_dict, expect_completion=False): - kernel = add_dtypes(kernel, dtype_dict) +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False, + kernel_name=None): + assert isinstance(prog, TranslationUnit) + if kernel_name is None: + kernel_names = [clbl.subkernel.name for clbl in + prog.callables_table.values() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError("Provide 'kernel_name' argument.") + + kernel_name, = kernel_names + + prog = prog.with_kernel(add_dtypes(prog[kernel_name], dtype_dict)) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(kernel, expect_completion=expect_completion) + return infer_unknown_types(prog, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(kernel, dtype_dict): @@ -464,8 +498,10 @@ class DomainChanger: # {{{ graphviz / dot export -def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): - """Return a string in the `dot `__ language depicting +@for_each_kernel +def 
get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, + use_insn_id=False): + """Return a string in the `dot `_ language depicting dependencies among kernel instructions. """ @@ -476,7 +512,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_linearized_kernel - kernel = get_one_linearized_kernel(kernel) + kernel = get_one_linearized_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn @@ -757,7 +793,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -771,7 +807,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -794,6 +830,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if not isinstance(tag, AutoLocalIndexTagBase))) return assign_automatic_axes( kernel.copy(inames=new_inames), + callables_table, axis=recursion_axis) if axis is None: @@ -833,7 +870,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
@@ -844,6 +882,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -860,7 +899,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): new_inames = kernel.inames.copy() new_inames[iname] = kernel.inames[iname].copy(tags=new_tags) return assign_automatic_axes(kernel.copy(inames=new_inames), - axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -928,7 +967,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1858,35 +1898,155 @@ def find_aliasing_equivalence_classes(kernel): # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_input_output(kernel): """ - Returns a copy of *kernel* with the attribute ``is_output_only`` set. + Returns a copy of *kernel* with the attributes ``is_input`` and + ``is_output`` of the arguments set. .. note:: - If the attribute ``is_output_only`` is not supplied from an user, then - infers it as an output argument if it is written at some point in the - kernel. + If the :attr:`~loopy.ArrayArg.is_output` is not supplied from a user, + then the array is inferred as an output argument if it is written at + some point in the kernel. + + If the :attr:`~loopy.ArrayArg.is_input` is not supplied from a user, + then the array is inferred as an input argument if it is either read at + some point in the kernel or it is neither read nor written. 
""" from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, ArrayArg): - if arg.is_output_only is not None: - assert isinstance(arg.is_output_only, bool) - new_args.append(arg) + if arg.is_output is not None: + assert isinstance(arg.is_output, bool) else: if arg.name in kernel.get_written_variables(): - new_args.append(arg.copy(is_output_only=True)) + arg = arg.copy(is_output=True) else: - new_args.append(arg.copy(is_output_only=False)) + arg = arg.copy(is_output=False) + + if arg.is_input is not None: + assert isinstance(arg.is_input, bool) + else: + if arg.name in kernel.get_read_variables() or ( + (arg.name not in kernel.get_read_variables()) and ( + arg.name not in kernel.get_written_variables())): + arg = arg.copy(is_input=True) + else: + arg = arg.copy(is_input=False) elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): - new_args.append(arg) + pass else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + if not (arg.is_input or arg.is_output): + raise LoopyError("Kernel argument must be either input or output." + " '{}' in '{}' does not follow it.".format(arg.name, + kernel.name)) + + new_args.append(arg) + return kernel.copy(args=new_args) # }}} + +# {{{ CallablesIDCollector + +class CallablesIDCollector(CombineMapper): + """ + Mapper to collect function identifiers of all resolved callables in an + expression. 
+ """ + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + def map_kernel(self, kernel): + callables_in_insn = frozenset() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_in_insn = callables_in_insn | ( + self(insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn).__name__) + + for rule in kernel.substitutions.values(): + callables_in_insn = callables_in_insn | ( + self(rule.expression)) + + return callables_in_insn + + def map_type_cast(self, expr): + return self.rec(expr.child) + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + + +def get_resolved_callable_ids_called_by_knl(knl, callables, recursive=True): + clbl_id_collector = CallablesIDCollector() + callables_called_by_kernel = clbl_id_collector.map_kernel(knl) + + if not recursive: + # => do not recurse into the callees + return callables_called_by_kernel + + callables_called_by_called_callables = frozenset().union(*( + callables[clbl_id].get_called_callables(callables) + for clbl_id in callables_called_by_kernel)) + return callables_called_by_kernel | callables_called_by_called_callables + +# }}} + + +# {{{ get_call_graph + +def get_call_graph(t_unit, only_kernel_callables=False): + """ + Returns a mapping from a callable name to the calls seen in it. + + :arg t_unit: An instance of :class:`TranslationUnit`. 
+ """ + from pyrsistent import pmap + from loopy.kernel import KernelState + + if t_unit.state < KernelState.CALLS_RESOLVED: + raise LoopyError("TranslationUnit must have calls resolved in order to" + " compute its call graph.") + + knl_callables = frozenset(name for name, clbl in t_unit.callables_table.items() + if isinstance(clbl, CallableKernel)) + + # stores a mapping from caller -> "direct"" callees + call_graph = {} + + for name, clbl in t_unit.callables_table.items(): + if (not isinstance(clbl, CallableKernel) + and only_kernel_callables): + pass + else: + if only_kernel_callables: + call_graph[name] = (clbl.get_called_callables(t_unit.callables_table, + recursive=False) + & knl_callables) + else: + call_graph[name] = clbl.get_called_callables(t_unit.callables_table, + recursive=False) + + return pmap(call_graph) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/library/function.py b/loopy/library/function.py index 99af08169c0ea053a1671e0ab087f24a86c16e3b..d7558960ab0c7e2c4f045655a068fc67d0785797 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -20,38 +20,109 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None +from loopy.kernel.function_interface import ScalarCallable +from loopy.diagnostic import LoopyError +from loopy.types import NumpyType +import numpy as np + + +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), callables_table) + + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} + + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + callables_table) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = {i: dtype + for i, dtype in arg_id_to_dtype.items() + if dtype is not None} + new_arg_id_to_dtype[-1] = NumpyType(np.int32) + + return 
(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + callables_table) + + def emit_call(self, expression_to_code_mapper, expression, target): + from pymbolic.primitives import Subscript + + if len(expression.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % self.name) + arg, = expression.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % self.name) + + ary = expression_to_code_mapper.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(expression_to_code_mapper.kernel.target, + ary, arg.index, lambda expr: evaluate(expr, + expression_to_code_mapper.codegen_state.var_subst_map), + expression_to_code_mapper.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % self.name) + + if self.name == "indexof": + return access_info.subscripts[0] + elif self.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + return self.emit_call( + expression_to_code_mapper, + insn.expression, + target), True + + +def get_loopy_callables(): + """ + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for functions + whose interface is provided by :mod:`loopy`. Callables that fall in this + category are -- + + - reductions leading to function calls like ``argmin``, ``argmax``. 
+ - callables that have a predefined meaning in :mod:`loo.py` like + ``make_tuple``, ``index_of``, ``indexof_vec``. + """ + known_callables = { + "make_tuple": MakeTupleCallable(name="make_tuple"), + "indexof": IndexOfCallable(name="indexof"), + "indexof_vec": IndexOfCallable(name="indexof_vec"), + } + + return known_callables # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 7f24dd3a0e3699fb0bb55ac1d4022645dedac854..8978f44192791839dfc52689d56d8e74fad5031a 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -26,6 +26,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -162,60 +163,85 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. 
+ """ + fields = ScalarCallable.fields | {"target"} + hash_fields = ScalarCallable.hash_fields + ("target",) + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None, target=None): + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + self.target = target + + def with_types(self, arg_id_to_dtype, callables_table): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return (self.copy(), + callables_table) + + name = self.name + target = self.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return ( + self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen"), + callables_table) + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), callables_table + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), callables_table + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import 
PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def get_random123_callables(target): + return {id_: Random123Callable(id_, target=target) for id_ in FUNC_NAMES_TO_RNG} # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ca763442d3bb7e4f9044b738cb67e70aca703b1..67043e1af50a5b7ce6dc16d4f5349cca8ee32dba 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -22,11 +22,14 @@ THE SOFTWARE. 
from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.tools import update_persistent_hash __doc__ = """ .. currentmodule:: loopy.library.reduction @@ -42,6 +45,8 @@ __doc__ = """ .. autoclass:: MaxReductionOperation .. autoclass:: MinReductionOperation + +.. autoclass:: ReductionOpFunction """ @@ -50,7 +55,7 @@ class ReductionOperation: equality-comparable. """ - def result_dtypes(self, target, *arg_dtypes): + def result_dtypes(self, *arg_dtypes): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -62,7 +67,7 @@ class ReductionOperation: def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -97,68 +102,66 @@ class ReductionOperation: class ScalarReductionOperation(ReductionOperation): - def __init__(self, forced_result_type=None): - """ - :arg forced_result_type: Force the reduction result to be of this type. - May be a string identifying the type for the backend under - consideration. 
- """ - self.forced_result_type = forced_result_type - @property def arg_count(self): return 1 - def result_dtypes(self, kernel, arg_dtype): - if self.forced_result_type is not None: - return (self.parse_result_type( - kernel.target, self.forced_result_type),) - + def result_dtypes(self, arg_dtype): if arg_dtype is None: return None return (arg_dtype,) def __hash__(self): - return hash((type(self), self.forced_result_type)) + return hash((type(self),)) def __eq__(self, other): - return (type(self) == type(other) - and self.forced_result_type == other.forced_result_type) + return type(self) == type(other) def __str__(self): result = type(self).__name__.replace("ReductionOperation", "").lower() - if self.forced_result_type is not None: - result = "{}<{}>".format(result, str(self.forced_result_type)) - return result class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. - return 0 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f": + return 0.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 + operand2 + return 0, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 + operand2, callables_table class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
- return 1 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f": + return 1.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 * operand2 + return 1, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 * operand2, callables_table def get_le_neutral(dtype): """Return a number y that satisfies (x <= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return var("HUGE_VAL") + elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -175,8 +178,13 @@ def get_ge_neutral(dtype): """Return a number y that satisfies (x >= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return -var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return -var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return -var("HUGE_VAL") elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -190,19 +198,47 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_ge_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_ge_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + from loopy.translation_unit import add_callable_to_table + + # getting the callable 'max' from target + max_scalar_callable = target.get_device_ast_builder().known_callables["max"] + + # type specialize the callable + max_scalar_callable, 
callables_table = max_scalar_callable.with_types( + {0: dtype, 1: dtype}, callables_table) + + # populate callables_table + func_id, callables_table = add_callable_to_table(callables_table, "max", + max_scalar_callable) + + return ResolvedFunction(func_id)(operand1, operand2), callables_table class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_le_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_le_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + from loopy.translation_unit import add_callable_to_table + + # getting the callable 'min' from target + min_scalar_callable = target.get_device_ast_builder().known_callables["min"] + + # type specialize the callable + min_scalar_callable, callables_table = min_scalar_callable.with_types( + {0: dtype, 1: dtype}, callables_table) + + # populate callables_table + func_id, callables_table = add_callable_to_table(callables_table, "min", + min_scalar_callable) + + return ResolvedFunction(func_id)(operand1, operand2), callables_table # {{{ base class for symbolic reduction ops @@ -226,6 +262,10 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = update_persistent_hash # }}} @@ -257,13 +297,30 @@ class _SegmentedScalarReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) - def neutral_element(self, scalar_dtype, segment_flag_dtype): - scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, - segment_flag_dtype.numpy_dtype.type(0)) + def neutral_element(self, scalar_dtype, segment_flag_dtype, + callables_table, target): + from 
loopy.library.function import MakeTupleCallable + from loopy.translation_unit import add_callable_to_table + + scalar_neutral_element, calables_table = ( + self.inner_reduction.neutral_element( + scalar_dtype, callables_table, target)) + + make_tuple_callable = MakeTupleCallable( + name="make_tuple") + + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, segment_flag_dtype])), + callables_table) - def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): - return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + func_id, callables_table = add_callable_to_table( + callables_table, "make_tuple", make_tuple_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)), callables_table + + def result_dtypes(self, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(scalar_dtype) + (segment_flag_dtype,)) def __str__(self): @@ -273,10 +330,25 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return hash(type(self)) def __eq__(self, other): - return type(self) == type(other) + return type(self) == type(other) and (self.inner_reduction == + other.inner_reduction) + + def __call__(self, dtypes, operand1, operand2, callables_table, target): + segmented_scalar_callable = SegmentOpCallable(SegmentedOp(self)) - def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + # type specialize the callable + segmented_scalar_callable, callables_table = ( + segmented_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + callables_table)) + + # populate callables_table + from loopy.translation_unit import add_callable_to_table + func_id, callables_table = add_callable_to_table( + callables_table, SegmentedOp(self), segmented_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class 
SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -284,34 +356,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = update_persistent_hash # }}} @@ -337,15 +399,31 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, scalar_dtype, index_dtype): + def result_dtypes(self, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.library.function import MakeTupleCallable + from loopy.translation_unit import add_callable_to_table + make_tuple_callable = MakeTupleCallable( + name="make_tuple") + + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), + callables_table) + + # populate callables_table + func_id, callables_table = add_callable_to_table(callables_table, + "make_tuple", + make_tuple_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -360,8 +438,22 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + arg_ext_scalar_callable = ArgExtOpCallable(ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + 
arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + callables_table)) + + # populate callables_table + from loopy.translation_unit import add_callable_to_table + func_id, callables_table = add_callable_to_table( + callables_table, ArgExtOp(self), arg_ext_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -369,43 +461,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + update_persistent_hash = update_persistent_hash # }}} @@ -442,10 +514,13 @@ def parse_reduction_op(name): red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name) if red_op_match: op_name = red_op_match.group(1) - op_type = red_op_match.group(2) if op_name in _REDUCTION_OPS: - return _REDUCTION_OPS[op_name](op_type) + from warnings import warn + warn("Reductions with forced result types are no longer supported. 
" + f"Encountered '{name}', which might be one.", + DeprecationWarning) + return None if name in _REDUCTION_OPS: return _REDUCTION_OPS[name]() @@ -460,70 +535,94 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CFamilyTarget - if not isinstance(kernel.target, CFamilyTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CFamilyTarget - if not isinstance(kernel.target, CFamilyTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) 
+ "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), callables_table + + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table) + + +class ArgExtOpCallable(ReductionCallable): + + def generate_preambles(self, target): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {index_t} index1, + {scalar_t} op2, {index_t} index2, + {index_t} *index_out) + {{ + if (op2 {comp} op1) + {{ + *index_out = index2; + return op2; + }} + else + {{ + *index_out = index1; + return op1; + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + + return + + +class SegmentOpCallable(ReductionCallable): + + def generate_preambles(self, target): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : {combined}; + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return - return None - - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 7f5744b482fa2fb6cfbed64ee27486af9cb36e40..af61b7db5e37ebdaafc9849c5141142d909ee7a5 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -22,13 +22,15 @@ THE SOFTWARE. import islpy as isl +from loopy.translation_unit import for_each_kernel def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. 
- :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.find_loop_nest_around_map` """ result = {} @@ -52,7 +54,9 @@ def potential_loop_nest_map(kernel): return result -def fuse_loop_domains(kernel): +@for_each_kernel +def merge_loop_domains(kernel): + # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames while True: @@ -60,11 +64,13 @@ def fuse_loop_domains(kernel): parents_per_domain = kernel.parents_per_domain() all_parents_per_domain = kernel.all_parents_per_domain() + iname_to_insns = kernel.iname_to_insns() + new_domains = None for inner_iname, outer_inames in lnm.items(): for outer_iname in outer_inames: - # {{{ check if it's safe to fuse + # {{{ check if it's safe to merge inner_domain_idx = kernel.get_home_domain_index(inner_iname) outer_domain_idx = kernel.get_home_domain_index(outer_iname) @@ -72,12 +78,28 @@ def fuse_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if (not iname_to_insns[inner_iname] + or not iname_to_insns[outer_iname]): + # Inames without instructions occur when used in + # a SubArrayRef. We don't want monster SubArrayRef domains, + # so refuse to merge those. + continue + + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: + # The two inames are imperfectly nested. Domain fusion + # might be invalid when the inner loop is empty, leading to + # the outer loop also being empty. + + # FIXME: Not fully correct, does not consider reductions + # https://gitlab.tiker.net/inducer/loopy/issues/172 + continue + if ( outer_domain_idx in all_parents_per_domain[inner_domain_idx] and not outer_domain_idx == parents_per_domain[inner_domain_idx]): # Outer domain is not a direct parent of the inner - # domain. Unable to fuse. + # domain. Unable to merge. 
continue outer_dom = kernel.domains[outer_domain_idx] @@ -87,7 +109,7 @@ def fuse_loop_domains(kernel): if is_domain_dependent_on_inames(kernel, inner_domain_idx, outer_inames): # Bounds of inner domain depend on outer domain. - # Unable to fuse. + # Unable to merge. continue # }}} diff --git a/loopy/match.py b/loopy/match.py index 9160402b48c81e4126f0f73f8fde6f6f5406e8b4..7ecbfcfaef925890f2de9951e70feb9bf3fbbf6f 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -50,6 +50,7 @@ Match expressions .. autoclass:: Tagged .. autoclass:: Writes .. autoclass:: Reads +.. autoclass:: InKernel .. autoclass:: Iname """ @@ -74,6 +75,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") +_in_kernel = intern("_in_kernel") _iname = intern("_iname") _whitespace = intern("_whitespace") @@ -93,13 +95,14 @@ _LEX_TABLE = [ (_tag, RE(r"tag:([\w?*]+)")), (_writes, RE(r"writes:([\w?*]+)")), (_reads, RE(r"reads:([\w?*]+)")), + (_in_kernel, RE(r"in_kernel:([\w?*]+)")), (_iname, RE(r"iname:([\w?*]+)")), (_whitespace, RE("[ \t]+")), ] -_TERMINALS = ([_id, _tag, _writes, _reads, _iname]) +_TERMINALS = ([_id, _tag, _writes, _reads, _in_kernel, _iname]) # {{{ operator precedence @@ -293,6 +296,11 @@ class Reads(GlobMatchExpressionBase): for name in matchable.read_dependency_names()) +class InKernel(GlobMatchExpressionBase): + def __call__(self, kernel, matchable): + return self.re.match(kernel.name) + + class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) @@ -330,6 +338,10 @@ def parse_match(expr): result = Reads(pstate.next_match_obj().group(1)) pstate.advance() return result + elif next_tag is _in_kernel: + result = InKernel(pstate.next_match_obj().group(1)) + pstate.advance() + return result elif next_tag is _iname: result = Iname(pstate.next_match_obj().group(1)) pstate.advance() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 
e59c275d29c96775c143942e6c2477b78a8a2c07..c01e7f27abccdcafaee4c4554a12572059db90b5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -35,23 +36,34 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type from loopy.kernel.tools import kernel_has_global_barriers # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.transform.iname import remove_any_newly_unused_inames +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper +# from loopy.transform.iname import remove_any_newly_unused_inames -import logging -logger = logging.getLogger(__name__) +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.kernel import LoopKernel +from loopy.translation_unit import TranslationUnit +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger +from functools import partial # {{{ prepare for caching -def prepare_for_caching(kernel): +def prepare_for_caching_inner(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -70,6 +82,32 @@ def prepare_for_caching(kernel): return kernel + +def prepare_for_caching(program): + if isinstance(program, 
LoopKernel): + return prepare_for_caching_inner(program) + + assert isinstance(program, TranslationUnit) + tgt = program.target + + new_clbls = {} + for name, clbl in program.callables_table.items(): + if clbl.arg_id_to_dtype is not None: + arg_id_to_dtype = {id: dtype.with_target(tgt) + for id, dtype in clbl.arg_id_to_dtype.items()} + clbl = clbl.copy(arg_id_to_dtype=arg_id_to_dtype) + if isinstance(clbl, ScalarCallable): + pass + elif isinstance(clbl, CallableKernel): + subknl = prepare_for_caching_inner(clbl.subkernel) + clbl = clbl.copy(subkernel=subknl) + else: + raise NotImplementedError(type(clbl)) + + new_clbls[name] = clbl + + return program.copy(callables_table=new_clbls) + # }}} @@ -244,15 +282,11 @@ def find_temporary_address_space(kernel): desired_aspace_per_insn.append(desired_aspace) if not desired_aspace_per_insn: - if temp_var.initializer is None: - warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, - "temporary variable '%s' never written, eliminating" - % temp_var.name, LoopyAdvisory) - else: - raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine address space" - % temp_var.name) + warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, + "cannot automatically determine address space of '%s'" + % temp_var.name, LoopyAdvisory) + new_temp_vars[temp_var.name] = temp_var continue overall_aspace = max(desired_aspace_per_insn) @@ -741,7 +775,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): # }}} - from loopy.kernel.instruction import CallInstruction + from loopy.kernel.instruction import CallInstruction, is_array_call for insn in kernel.instructions: if not isinstance(insn, CallInstruction): continue @@ -749,6 +783,9 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if len(insn.assignees) <= 1: continue + if is_array_call(insn.assignees, insn.expression): + continue + assignees = insn.assignees assignee_var_names = 
insn.assignee_var_names() @@ -882,10 +919,21 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -@remove_any_newly_unused_inames -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super().__init__(callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + +# @remove_any_newly_unused_inames +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. 
@@ -1005,7 +1053,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = insn.within_inames @@ -1037,13 +1085,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "{}_{}_init".format(insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes) + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way @@ -1087,13 +1138,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1160,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1139,7 +1194,7 @@ def realize_reduction(kernel, insn_id_filter=None, 
unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1190,7 +1245,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") init_insn = make_assignment( id=init_id, @@ -1250,17 +1306,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr = expr.expr transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1289,22 +1348,26 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + 
var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1325,9 +1388,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1386,7 +1450,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = insn.within_inames @@ -1423,6 +1487,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1430,7 +1497,7 @@ def realize_reduction(kernel, insn_id_filter=None, 
unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way # of figuring out what the dependencies of the accumulator @@ -1456,13 +1523,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | {track_iname} + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1476,25 +1547,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = 
insn.within_inames @@ -1552,7 +1623,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1660,19 +1732,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1693,16 +1769,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, 
going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. @@ -1710,7 +1787,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, infer_arg_and_reduction_dtypes_for_reduction_expression) arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = insn.within_inames bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1790,7 +1867,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1819,15 +1896,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1846,17 +1925,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, callables_table, nresults, arg_dtypes, + 
reduction_dtypes) # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1885,9 +1964,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1967,13 +2049,32 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - kernel = lp.tag_inames(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_tags) kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table + + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, TranslationUnit) + + callables_table = dict(program.callables_table) + kernels_to_scan = [in_knl_callable.subkernel + for in_knl_callable in program.callables_table.values() + if isinstance(in_knl_callable, CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + callables_table[knl.name] = in_knl_callable + + return program.copy(callables_table=callables_table) # }}} @@ -2043,37 +2144,250 
@@ def check_atomic_loads(kernel): # }}} -preprocess_cache = WriteOncePersistentDict( - "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +# {{{ arg_descr_inference + +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Infers :attr:`~loopy.kernel.function_interface.arg_id_to_descr` of + callables visited in an expression. + """ + def __init__(self, rule_mapping_context, caller_kernel, clbl_inf_ctx): + super().__init__(rule_mapping_context) + self.caller_kernel = caller_kernel + self.clbl_inf_ctx = clbl_inf_ctx + + def map_call(self, expr, expn_state, assignees=None): + from pymbolic.primitives import Call, Variable + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction + from loopy.kernel.array import ArrayBase + from loopy.kernel.data import ValueArg + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + from loopy.kernel.function_interface import get_arg_descriptor_for_expression -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super().map_call(expr, expn_state) - from loopy.kernel import KernelState - if kernel.state >= KernelState.PREPROCESSED: - return kernel + arg_id_to_arg = dict(enumerate(expr.parameters)) - # {{{ cache retrieval + if assignees is not None: + # If supplied with assignees then this is a CallInstruction + for i, arg in enumerate(assignees): + arg_id_to_arg[-i-1] = arg - from loopy import CACHING_ENABLED - if CACHING_ENABLED: - input_kernel = kernel + arg_id_to_descr = { + arg_id: get_arg_descriptor_for_expression(self.caller_kernel, arg) + for arg_id, arg in arg_id_to_arg.items()} + clbl = self.clbl_inf_ctx[expr.function.name] - 
try: - result = preprocess_cache[kernel] - logger.debug("%s: preprocess cache hit" % kernel.name) - return result - except KeyError: - pass + # {{{ translating descriptor expressions to the callable's namespace - # }}} + deps_as_params = [] + subst_map = {} + + deps = frozenset().union(*(descr.depends_on() + for descr in arg_id_to_descr.values())) + + assert deps <= self.caller_kernel.all_variable_names() + + for dep in deps: + caller_arg = self.caller_kernel.arg_dict.get(dep, (self.caller_kernel + .temporary_variables + .get(dep))) + if not (isinstance(caller_arg, ValueArg) + or (isinstance(caller_arg, ArrayBase) + and caller_arg.shape == ())): + raise NotImplementedError(f"Obtained '{dep}' as a dependency for" + f" call '{expr.function.name}' which is not a scalar.") + + clbl, callee_name = clbl.with_added_arg(caller_arg.dtype, + ValueArgDescriptor()) + + subst_map[dep] = Variable(callee_name) + deps_as_params.append(Variable(dep)) + + mapper = SubstitutionMapper(make_subst_func(subst_map)) + arg_id_to_descr = {id_: descr.map_expr(mapper) + for id_, descr in arg_id_to_descr.items()} + + # }}} + + # specializing the function according to the parameter description + new_clbl, self.clbl_inf_ctx = clbl.with_descrs(arg_id_to_descr, + self.clbl_inf_ctx) + + self.clbl_inf_ctx, new_func_id = (self.clbl_inf_ctx + .with_callable(expr.function.function, + new_clbl)) + + return Call(ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters) + + tuple(deps_as_params)) + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + + def __call__(self, expr, kernel, insn, assignees=None): + from loopy.kernel.data import InstructionBase + from loopy.symbolic import IdentityMapper, ExpansionState + assert insn is None or isinstance(insn, InstructionBase) + + return IdentityMapper.__call__(self, expr, + ExpansionState( + kernel=kernel, + instruction=insn, + stack=(), + 
arg_context={}), assignees=assignees) + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_descr + mapper = partial(self, kernel=kernel, insn=insn, + assignees=insn.assignees) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, MultiAssignmentBase): + mapper = partial(self, kernel=kernel, insn=insn) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, callables_table): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + resolved functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, callables_table) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.clbl_inf_ctx + + +def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. 
+ """ + from loopy.translation_unit import make_clbl_inf_ctx, resolve_callables + from loopy.kernel.array import ArrayBase + from loopy.kernel.function_interface import (ArrayArgDescriptor, + ValueArgDescriptor) + from loopy import auto, ValueArg + + program = resolve_callables(program) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + for e in program.entrypoints: + def _tuple_or_None(s): + if isinstance(s, tuple): + return s + elif s in [None, auto]: + return s + else: + return s, + + arg_id_to_descr = {} + for arg in program[e].args: + if isinstance(arg, ArrayBase): + if arg.shape not in (None, auto): + arg_id_to_descr[arg.name] = ArrayArgDescriptor( + _tuple_or_None(arg.shape), arg.address_space, + arg.dim_tags) + elif isinstance(arg, ValueArg): + arg_id_to_descr[arg.name] = ValueArgDescriptor() + else: + raise NotImplementedError() + new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( + arg_id_to_descr, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, + is_entrypoint=True) + + return clbl_inf_ctx.finish_program(program) + +# }}} + + +# {{{ inline_kernels_with_gbarriers + +def inline_kernels_with_gbarriers(program): + from loopy.kernel.instruction import BarrierInstruction + from loopy.transform.callable import inline_callable_kernel + from loopy.kernel.tools import get_call_graph + from pytools.graph import compute_topological_order + + def has_gbarrier(knl): + return any((isinstance(insn, BarrierInstruction) + and insn.synchronization_kind == "global") + for insn in knl.instructions) + + call_graph = get_call_graph(program, only_kernel_callables=True) + + # traverse the kernel calls in a reverse topological sort so that barriers + # are rightly passed to the entrypoints. 
+ toposort = compute_topological_order(call_graph, + # pass key to have deterministic codegen + key=lambda x: x + ) + + for name in toposort[::-1]: + if has_gbarrier(program[name]): + program = inline_callable_kernel(program, name) + + return program + +# }}} + + +def filter_reachable_callables(t_unit): + from loopy.translation_unit import get_reachable_resolved_callable_ids + reachable_function_ids = get_reachable_resolved_callable_ids(t_unit + .callables_table, + t_unit.entrypoints) + new_callables = {name: clbl for name, clbl in t_unit.callables_table.items() + if name in (reachable_function_ids | t_unit.entrypoints)} + return t_unit.copy(callables_table=new_callables) + + +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + +def _preprocess_single_kernel(kernel, callables_table, device=None): + from loopy.kernel import KernelState - logger.info("%s: preprocess start" % kernel.name) + prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -2089,20 +2403,84 @@ def preprocess_kernel(kernel, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. - kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) + # Ordering restriction: + # add_axes_to_temporaries_for_ilp because reduction accumulators + # need to be duplicated by this. 
+ + kernel = realize_ilp(kernel) + + kernel = find_temporary_address_space(kernel) + + # check for atomic loads, much easier to do here now that the dependencies + # have been established + kernel = check_atomic_loads(kernel) + + kernel = kernel.target.preprocess(kernel) + + kernel = kernel.copy( + state=KernelState.PREPROCESSED) + + prepro_logger.done() + + return kernel + + +def preprocess_program(program, device=None): + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + if CACHING_ENABLED: + input_program = program + + try: + result = preprocess_cache[program] + logger.debug(f"program with entrypoints: {program.entrypoints}" + " preprocess cache hit") + return result + except KeyError: + pass + + # }}} + + from loopy.kernel import KernelState + if program.state >= KernelState.PREPROCESSED: + return program + + if len([clbl for clbl in program.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + program = program.with_entrypoints(",".join(clbl.name for clbl in + program.callables_table.values() if isinstance(clbl, + CallableKernel))) + + if not program.entrypoints: + raise LoopyError("Translation unit did not receive any entrypoints") + + from loopy.translation_unit import resolve_callables + program = resolve_callables(program) + + program = filter_reachable_callables(program) + + if device is not None: + # FIXME: Time to remove this? 
(Git blame shows 5 years ago) + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + from loopy.transform.subst import expand_subst + program = expand_subst(program) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) + program = apply_single_writer_depencency_heuristic(program) # Ordering restrictions: # @@ -2113,26 +2491,43 @@ def preprocess_kernel(kernel, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, unknown_types_ok=False) + program = realize_reduction(program, unknown_types_ok=False) - # Ordering restriction: - # add_axes_to_temporaries_for_ilp because reduction accumulators - # need to be duplicated by this. + # {{{ preprocess callable kernels - kernel = realize_ilp(kernel) + # Callable editing restrictions: + # + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] + # + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = _preprocess_single_kernel( + in_knl_callable.subkernel, program.callables_table, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable).__name__)) - kernel = find_temporary_address_space(kernel) + new_callables[func_id] = in_knl_callable - # check for atomic loads, much easier to do here now that the dependencies - # have been established - kernel = check_atomic_loads(kernel) + program = program.copy(callables_table=new_callables) - kernel = kernel.target.preprocess(kernel) + # }}} - logger.info("%s: preprocess done" % kernel.name) + # infer arg descrs of the callables + program = infer_arg_descr(program) - kernel = kernel.copy( - state=KernelState.PREPROCESSED) + # Ordering restriction: + # callees with gbarrier in them must be inlined after inferrring arg_descr. + program = inline_kernels_with_gbarriers(program) # {{{ prepare for caching @@ -2142,15 +2537,20 @@ def preprocess_kernel(kernel, device=None): # this target information. if CACHING_ENABLED: - input_kernel = prepare_for_caching(input_kernel) + input_program = prepare_for_caching(input_program) - kernel = prepare_for_caching(kernel) + program = prepare_for_caching(program) # }}} if CACHING_ENABLED: - preprocess_cache.store_if_not_present(input_kernel, kernel) + preprocess_cache.store_if_not_present(input_program, program) + + return program + + +# FIXME: Do we add a deprecation warning? 
+preprocess_kernel = preprocess_program - return kernel # vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index a854ce04edb736a7067d31dc4b0e7da5955c38bd..e3fdb030f35fe676c83e9d295ee767cbedb1e162 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1748,16 +1748,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " + "%s: Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " "instruction option to state that no " "synchronization is needed)" % ( + kernel_name, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1828,7 +1829,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 for dep in chain.from_iterable( dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is # executed unconditionally and so kills all sources before # the loop. 
@@ -1860,7 +1862,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) @@ -1926,7 +1929,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. warning:: @@ -1939,17 +1942,18 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - yield from generate_loop_schedules_inner(kernel, debug_args=debug_args) + yield from generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args) -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -2061,7 +2065,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2118,7 +2123,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, callables_table): 
# This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2128,22 +2133,24 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, callables_table): warn_with_kernel( kernel, "get_one_scheduled_kernel_deprecated", "get_one_scheduled_kernel is deprecated. " "Use get_one_linearized_kernel instead.", - DeprecationWarning) - return get_one_linearized_kernel(kernel) + DeprecationWarning, stacklevel=2) + return get_one_linearized_kernel(kernel, callables_table) -def get_one_linearized_kernel(kernel): +def get_one_linearized_kernel(kernel, callables_table): from loopy import CACHING_ENABLED - sched_cache_key = kernel + # must include *callables_table* within the cache key as the preschedule + # checks depend on it. 
+ sched_cache_key = (kernel, callables_table) from_cache = False if CACHING_ENABLED: @@ -2158,7 +2165,8 @@ def get_one_linearized_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index ef335abced5b568b8b6319e5d82d1b315cfb2dca..9f0bdfbcb4451e593965f56af07d76013d4e0caf 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,4 +1,9 @@ -__copyright__ = "Copyright (C) 2015 James Stevens" +__copyright__ = """ +Copyright (C) 2015 James Stevens +Copyright (C) 2018 Kaushik Kulkarni +Copyright (C) 2019 Andreas Kloeckner +""" + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,12 +29,14 @@ import loopy as lp from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper -from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method +from loopy.kernel.function_interface import CallableKernel +from loopy.translation_unit import TranslationUnit +from functools import partial __doc__ = """ @@ -37,6 +44,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: ToCountPolynomialMap .. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -56,6 +64,19 @@ __doc__ = """ """ +# FIXME: +# - The SUBGROUP granularity is completely broken if the root kernel +# contains the grid and the operations get counted in the callee. 
+# To test, most of those are set to WORKITEM instead below (marked +# with FIXMEs). This leads to value mismatches and key errors in +# the tests. +# - Currently, nothing prevents summation across different +# granularities, which is guaranteed to yield bogus results. +# - AccessFootprintGatherer needs to be redone to match get_op_map and +# get_mem_access_map style +# - Test for the subkernel functionality need to be written + + def get_kernel_parameter_space(kernel): return isl.Space.create_from_names(kernel.isl_context, set=[], params=sorted(list(kernel.outer_params()))).params() @@ -69,11 +90,25 @@ def get_kernel_zero_pwqpolynomial(kernel): # {{{ GuardedPwQPolynomial +def _get_param_tuple(obj): + return tuple( + obj.get_dim_name(dim_type.param, i) + for i in range(obj.dim(dim_type.param))) + + class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): + assert isinstance(pwqpolynomial, isl.PwQPolynomial) self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain + assert (_get_param_tuple(pwqpolynomial.space) + == _get_param_tuple(valid_domain.space)) + + @property + def space(self): + return self.valid_domain.space + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( @@ -122,7 +157,7 @@ class GuardedPwQPolynomial: return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -130,28 +165,43 @@ class GuardedPwQPolynomial: # {{{ ToCountMap class ToCountMap: - """Maps any type of key to an arithmetic type. + """A map from work descriptors like :class:`Op` and :class:`MemAccess` + to any arithmetic type. + + .. automethod:: __getitem__ + .. automethod:: __str__ + .. automethod:: __repr__ + .. automethod:: __len__ + .. automethod:: get + .. automethod:: items + .. automethod:: keys + .. automethod:: values + + .. automethod:: copy + .. automethod:: with_set_attributes .. automethod:: filter_by .. 
automethod:: filter_by_func .. automethod:: group_by .. automethod:: to_bytes .. automethod:: sum - .. automethod:: eval_and_sum """ - def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): - if init_dict is None: - init_dict = {} - self.count_map = init_dict - self.val_type = val_type + def __init__(self, count_map=None): + if count_map is None: + count_map = {} + + self.count_map = count_map + + def _zero(self): + return 0 def __add__(self, other): result = self.count_map.copy() for k, v in other.count_map.items(): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result, self.val_type) + return self.copy(count_map=result) def __radd__(self, other): if other != 0: @@ -159,13 +209,14 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) + return self def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): - return ToCountMap({ - index: self.count_map[index]*other - for index in self.keys()}) + return self.copy({ + index: other*value + for index, value in self.count_map.items()}) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {} {}." @@ -174,21 +225,17 @@ class ToCountMap: __rmul__ = __mul__ def __getitem__(self, index): - try: - return self.count_map[index] - except KeyError: - #TODO what is the best way to handle this? 
- if self.val_type is GuardedPwQPolynomial: - return GuardedPwQPolynomial.zero() - else: - return 0 - - def __setitem__(self, index, value): - self.count_map[index] = value + return self.count_map[index] def __repr__(self): return repr(self.count_map) + def __str__(self): + return "\n".join( + f"{k}: {v}" + for k, v in sorted(self.count_map.items(), + key=lambda k: str(k))) + def __len__(self): return len(self.count_map) @@ -201,17 +248,19 @@ class ToCountMap: def keys(self): return self.count_map.keys() - def pop(self, item): - return self.count_map.pop(item) + def values(self): + return self.count_map.values() - def copy(self): - return ToCountMap(dict(self.count_map), self.val_type) + def copy(self, count_map=None): + if count_map is None: + count_map = self.count_map + + return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return ToCountMap({ + return self.copy(count_map={ key.copy(**kwargs): val - for key, val in self.count_map.items()}, - self.val_type) + for key, val in self.count_map.items()}) def filter_by(self, **kwargs): """Remove items without specified key fields. 
@@ -238,28 +287,25 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - from loopy.types import to_loopy_type - if "dtype" in kwargs.keys(): - kwargs["dtype"] = [to_loopy_type(d) for d in kwargs["dtype"]] + class _Sentinel: + pass - # for each item in self.count_map - for self_key, self_val in self.items(): - try: - # check to see if key attribute values match all filters - for arg_field, allowable_vals in kwargs.items(): - attr_val = getattr(self_key, arg_field) - # see if the value is in the filter list - if attr_val not in allowable_vals: - break - else: # loop terminated without break or error - result_map[self_key] = self_val - except(AttributeError): - # the field passed is not a field of this key - continue + new_kwargs = {} + for arg_field, allowable_vals in kwargs.items(): + if arg_field == "dtype": + from loopy.types import to_loopy_type + allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] + + new_kwargs[arg_field] = allowable_vals + + for key, val in self.count_map.items(): + if all(getattr(key, arg_field, _Sentinel) in allowable_vals + for arg_field, allowable_vals in new_kwargs.items()): + new_count_map[key] = val - return result_map + return self.copy(count_map=new_count_map) def filter_by_func(self, func): """Keep items that pass a test. 
@@ -286,14 +332,13 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - # for each item in self.count_map, call func on the key - for self_key, self_val in self.items(): + for self_key, self_val in self.count_map.items(): if func(self_key): - result_map[self_key] = self_val + new_count_map[self_key] = self_val - return result_map + return self.copy(count_map=new_count_map) def group_by(self, *args): """Group map items together, distinguishing by only the key fields @@ -341,7 +386,7 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} # make sure all item keys have same type if self.count_map: @@ -350,22 +395,17 @@ class ToCountMap: raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") else: - return result_map + return self - # for each item in self.count_map - for self_key, self_val in self.items(): - new_key = key_type() + for self_key, self_val in self.count_map.items(): + new_key = key_type( + **{ + field: getattr(self_key, field) + for field in args}) - # set all specified fields - for field in args: - setattr(new_key, field, getattr(self_key, field)) + new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val - if new_key in result_map.keys(): - result_map[new_key] += self_val - else: - result_map[new_key] = self_val - - return result_map + return self.copy(count_map=new_count_map) def to_bytes(self): """Convert counts to bytes using data type in map key. @@ -398,48 +438,76 @@ class ToCountMap: """ - result = self.copy() + new_count_map = {} - for key, val in self.items(): - bytes_processed = int(key.dtype.itemsize) * val - result[key] = bytes_processed + for key, val in self.count_map.items(): + new_count_map[key] = int(key.dtype.itemsize) * val - #TODO again, is this okay? - result.val_type = int - - return result + return self.copy(new_count_map) def sum(self): - """Add all counts in ToCountMap. 
+ """:return: A sum of the values of the dictionary.""" - :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the - sum of counts. + total = self._zero() - """ + for k, v in self.count_map.items(): + total = v + total - if self.val_type is GuardedPwQPolynomial: - total = GuardedPwQPolynomial.zero() - else: - total = 0 - - for k, v in self.items(): - total += v return total - #TODO test and document - def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result +# }}} + + +# {{{ ToCountPolynomialMap + +class ToCountPolynomialMap(ToCountMap): + """Maps any type of key to a :class:`islpy.PwQPolynomial` or a + :class:`~loopy.statistics.GuardedPwQPolynomial`. + + .. automethod:: eval_and_sum + """ + + def __init__(self, space, count_map=None): + if not isinstance(space, isl.Space): + raise TypeError( + "first argument to ToCountPolynomialMap must be " + "of type islpy.Space") + + assert space.is_params() + self.space = space + + space_param_tuple = _get_param_tuple(space) + + for key, val in count_map.items(): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) == 1 + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) == 1 + else: + raise TypeError("unexpected value type") + + assert _get_param_tuple(val.space) == space_param_tuple + + super().__init__(count_map) + + def _zero(self): + space = self.space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) - def eval_and_sum(self, params): - """Add all counts in :class:`ToCountMap` and evaluate with provided - parameter dict. + def copy(self, count_map=None, space=None): + if count_map is None: + count_map = self.count_map - :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. 
+ if space is None: + space = self.space + + return type(self)(space, count_map) + + def eval_and_sum(self, params=None): + """Add all counts and evaluate with provided parameter dict *params* + + :return: An :class:`int` containing the sum of all counts + evaluated with the parameters provided. Example usage:: @@ -454,18 +522,69 @@ class ToCountMap: # (now use these counts to, e.g., predict performance) """ + if params is None: + params = {} + return self.sum().eval_with_dict(params) # }}} +# {{{ subst_into_to_count_map + +def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial, get_param_subst_domain + + poly = subst_into_pwqpolynomial( + new_space, guarded_poly.pwqpolynomial, subst_dict) + + valid_domain = guarded_poly.valid_domain + i_begin_subst_space = valid_domain.dim(dim_type.param) + + valid_domain, subst_domain, _ = get_param_subst_domain( + new_space, guarded_poly.valid_domain, subst_dict) + + valid_domain = valid_domain & subst_domain + valid_domain = valid_domain.project_out(dim_type.param, 0, i_begin_subst_space) + return GuardedPwQPolynomial(poly, valid_domain) + + +def subst_into_to_count_map(space, tcm, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial + new_count_map = {} + for key, value in tcm.count_map.items(): + if isinstance(value, GuardedPwQPolynomial): + new_count_map[key] = subst_into_guarded_pwqpolynomial( + space, value, subst_dict) + + elif isinstance(value, isl.PwQPolynomial): + new_count_map[key] = subst_into_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, int): + new_count_map[key] = value + + else: + raise ValueError("unexpected value type") + + return tcm.copy(space=space, count_map=new_count_map) + +# }}} + + def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." 
+ " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + result = "" for key in sorted(m.keys(), key=lambda k: str(k)): result += ("{} : {}\n".format(key, m[key])) return result +# {{{ CountGranularity + class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -492,10 +611,12 @@ class CountGranularity: WORKGROUP = "workgroup" ALL = [WORKITEM, SUBGROUP, WORKGROUP] +# }}} + # {{{ Op descriptor -class Op(Record): +class Op(ImmutableRecord): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -521,34 +642,41 @@ class Op(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ - def __init__(self, dtype=None, name=None, count_granularity=None): + def __init__(self, dtype=None, name=None, count_granularity=None, + kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. 
count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity) - else: + + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity) + dtype = to_loopy_type(dtype) - def __hash__(self): - return hash(repr(self)) + super().__init__(dtype=dtype, name=name, + count_granularity=count_granularity, + kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return f"Op({self.dtype}, {self.name}, {self.count_granularity})" + if self.kernel_name is not None: + return (f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}",' + f' "{self.kernel_name}")') + else: + return f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}")' # }}} # {{{ MemAccess descriptor -class MemAccess(Record): +class MemAccess(ImmutableRecord): """A descriptor for a type of memory access. .. attribute:: mtype @@ -608,12 +736,15 @@ class MemAccess(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
""" def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, direction=None, variable=None, *, variable_tags=None, variable_tag=None, - count_granularity=None): + count_granularity=None, kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -638,18 +769,16 @@ class MemAccess(Record): # }}} - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, + if dtype is not None: + from loopy.types import to_loopy_type + dtype = to_loopy_type(dtype) + + ImmutableRecord.__init__(self, mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tags=variable_tags, - count_granularity=count_granularity) - else: - from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tags=variable_tags, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) @property def variable_tag(self): @@ -666,13 +795,12 @@ class MemAccess(Record): return tag def __hash__(self): - # Note that this means lid_strides and gid_strides must be sorted - # in self.__repr__() + # dicts in gid_strides and lid_strides aren't natively hashable return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess({}, {}, {}, {}, {}, {}, {}, {})".format( + return "MemAccess({}, {}, {}, {}, {}, {}, {}, {}, {})".format( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -682,33 +810,101 @@ class MemAccess(Record): self.direction, self.variable, self.variable_tags, - self.count_granularity) + self.count_granularity, + self.kernel_name) +# }}} + + +# {{{ Sync descriptor + +class Sync(ImmutableRecord): + """A descriptor for a type of 
synchronization. + + .. attribute:: kind + + A string describing the synchronization kind, e.g. ``"barrier_global"`` or + ``"barrier_local"`` or ``"kernel_launch"``. + + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ + + def __init__(self, kind=None, kernel_name=None): + super().__init__(kind=kind, kernel_name=kernel_name) + + def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness + return f"Sync({self.kind}, {self.kernel_name})" # }}} -# {{{ counter base +# {{{ CounterBase class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, callables_table, kernel_rec): self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.callables_table = callables_table + self.kernel_rec = kernel_rec + + from loopy.type_inference import TypeReader + self.type_inf = TypeReader(knl, callables_table) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 + + @property + @memoize_method + def param_space(self): + return get_kernel_parameter_space(self.knl) + + def new_poly_map(self, count_map): + return ToCountPolynomialMap(self.param_space, count_map) + + def new_zero_poly_map(self): + return self.new_poly_map({}) def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() def map_call(self, expr): - return self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.kernel.data import ValueArg + if isinstance(clbl, CallableKernel): + sub_result = self.kernel_rec(clbl.subkernel) + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + + subst_dict = { + pos_to_kw[i]: param + for i, param 
in enumerate(expr.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + return subst_into_to_count_map( + self.param_space, + sub_result, subst_dict) \ + + self.rec(expr.parameters) + + else: + raise NotImplementedError() + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) else: - return ToCountMap() + return self.new_zero_poly_map() map_product = map_sum @@ -737,8 +933,8 @@ class CounterBase(CombineMapper): map_derivative = map_common_subexpression map_slice = map_common_subexpression - # preprocessing should have removed these def map_reduction(self, expr): + # preprocessing should have removed these raise RuntimeError("%s encountered %s--not supposed to happen" % (type(self).__name__, type(expr).__name__)) @@ -748,60 +944,81 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, count_within_subscripts=True): - self.knl = knl + def __init__(self, knl, callables_table, kernel_rec, + count_within_subscripts=True): + super().__init__( + knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() map_tagged_variable = map_constant map_variable = map_constant def map_call(self, expr): - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name="func:"+str(expr.function), - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = 
self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.new_poly_map( + {Op(dtype=self.type_inf(expr), + name="func:"+clbl.name, + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.parameters) + else: + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: return self.rec(expr.index) else: - return ToCountMap() + return self.new_zero_poly_map() + + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() def map_sum(self, expr): assert expr.children - return ToCountMap( + return self.new_poly_map( {Op(dtype=self.type_inf(expr), name="add", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), + return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): 1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): self.one}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), + self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): -1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="div", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + 
count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -809,32 +1026,36 @@ class ExpressionOpCounter(CounterBase): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="pow", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="shift", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)}) \ + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -855,9 +1076,10 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="maxmin", - 
count_granularity=CountGranularity.SUBGROUP): + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -898,6 +1120,8 @@ class _IndexStrideCoefficientCollector(CoefficientCollector): # }}} +# {{{ _get_lid_and_gid_strides + def _get_lid_and_gid_strides(knl, array, index): # find all local and global index tags and corresponding inames from loopy.symbolic import get_dependencies @@ -982,28 +1206,49 @@ def _get_lid_and_gid_strides(knl, array, index): return get_iname_strides(lid_to_iname), get_iname_strides(gid_to_iname) +# }}} + -class MemAccessCounter(CounterBase): - pass +# {{{ MemAccessCounterBase + +class MemAccessCounterBase(CounterBase): + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() + + def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.rec(expr.parameters) + else: + return super().map_call(expr) + +# }}} # {{{ LocalMemAccessCounter -class LocalMemAccessCounter(MemAccessCounter): +class LocalMemAccessCounter(MemAccessCounterBase): + local_mem_count_granularity = CountGranularity.SUBGROUP + def count_var_access(self, dtype, name, index): - sub_map = ToCountMap() + count_map = {} if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.address_space == AddressSpace.LOCAL): if index is None: # no subscript - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, - count_granularity=CountGranularity.SUBGROUP) - ] = 1 - return sub_map + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one + return 
self.new_poly_map(count_map) array = self.knl.temporary_variables[name] @@ -1015,15 +1260,16 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, - count_granularity=CountGranularity.SUBGROUP)] = 1 + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one - return sub_map + return self.new_poly_map(count_map) def map_variable(self, expr): return self.count_var_access( @@ -1042,7 +1288,7 @@ class LocalMemAccessCounter(MemAccessCounter): # {{{ GlobalMemAccessCounter -class GlobalMemAccessCounter(MemAccessCounter): +class GlobalMemAccessCounter(MemAccessCounterBase): def map_variable(self, expr): name = expr.name @@ -1050,17 +1296,18 @@ class GlobalMemAccessCounter(MemAccessCounter): array = self.knl.arg_dict[name] else: # this is a temporary variable - return ToCountMap() + return self.new_zero_poly_map() if not isinstance(array, lp.ArrayArg): # this array is not in global memory - return ToCountMap() + return self.new_zero_poly_map() - return ToCountMap({MemAccess(mtype="global", - dtype=self.type_inf(expr), lid_strides={}, - gid_strides={}, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + return self.new_poly_map({MemAccess(mtype="global", + dtype=self.type_inf(expr), lid_strides={}, + gid_strides={}, variable=name, + count_granularity=CountGranularity.WORKITEM, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.index) def map_subscript(self, expr): name = expr.aggregate.name @@ -1086,19 +1333,27 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - count_granularity = CountGranularity.WORKITEM if ( - 0 in lid_strides and lid_strides[0] 
!= 0 - ) else CountGranularity.SUBGROUP + global_access_count_granularity = CountGranularity.SUBGROUP - return ToCountMap({MemAccess( + # Account for broadcasts once per subgroup + count_granularity = CountGranularity.WORKITEM if ( + # if the stride in lid.0 is known + 0 in lid_strides + and + # it is nonzero + lid_strides[0] != 0 + ) else global_access_count_granularity + + return self.new_poly_map({MemAccess( mtype="global", dtype=self.type_inf(expr), lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, variable_tags=var_tags, - count_granularity=count_granularity - ): 1} + count_granularity=count_granularity, + kernel_name=self.knl.name, + ): self.one} ) + self.rec(expr.index_tuple) # }}} @@ -1174,10 +1429,19 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count def add_assumptions_guard(kernel, pwqpolynomial): - return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + return GuardedPwQPolynomial( + pwqpolynomial, + kernel.assumptions.align_params(pwqpolynomial.space)) def count(kernel, set, space=None): + if isinstance(kernel, TranslationUnit): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return count(kernel[kernel_names[0]], set, space) + try: if space is not None: set = set.align_params(space) @@ -1186,7 +1450,7 @@ def count(kernel, set, space=None): except AttributeError: pass - count = isl.PwQPolynomial.zero( + total_count = isl.PwQPolynomial.zero( set.space .drop_dims(dim_type.set, 0, set.dim(dim_type.set)) .add_dims(dim_type.set, 1)) @@ -1248,7 +1512,7 @@ def count(kernel, set, space=None): # }}} if bset_count is not None: - count += bset_count + total_count += bset_count is_subset = bset <= bset_rebuilt is_superset = bset >= bset_rebuilt @@ -1273,12 +1537,12 @@ def count(kernel, set, space=None): "number of integer points in your loop " "domain.") - return 
add_assumptions_guard(kernel, count) + return add_assumptions_guard(kernel, total_count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes): +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1327,29 +1591,29 @@ def count_inames_domain(knl, inames): return count(knl, domain, space=space) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, callables_table, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = insn.within_inames if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) c = count_inames_domain(knl, insn_inames) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes) + unused_fac = get_unused_hw_axes_factor(knl, callables_table, + insn, disregard_local_axes=disregard_local_axes) return c * unused_fac else: return c -@memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1361,19 +1625,21 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, callables_table, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) 
ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: - # get the group size + # {{{ compute workgroup_size + from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1393,15 +1659,18 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, % (CountGranularity.SUBGROUP, local_size)) workgroup_size *= s + # }}} + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for work-group size " "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" + "multiple device programs present, actual sub-group count may be " "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: # this should not happen since this is enforced in Op/MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" @@ -1413,17 +1682,52 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, - count_within_subscripts=True, subgroup_size=None): +def _get_op_map_for_single_kernel(knl, callables_table, + count_redundant_work, + count_within_subscripts, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_op_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) + + op_counter = ExpressionOpCounter(knl, callables_table, kernel_rec, + count_within_subscripts) + op_map = op_counter.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignees) + op_counter(insn.expression) + for key, val in ops.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + op_map = op_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, + count_within_subscripts=True, subgroup_size=None, + entrypoint=None): """Count the number of operations in a loopy kernel. 
:arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1474,53 +1778,37 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - subgroup_size = _process_subgroup_size(knl, subgroup_size) + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + entrypoint = list(program.entrypoints)[0] - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, count_within_subscripts) + assert entrypoint in program.entrypoints - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = preprocess_program(program) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - for key, val in ops.count_map.items(): - op_map = ( - op_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): - pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - if numpy_types: - return ToCountMap( - init_dict={ - Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity): - ct - for op, ct in op_map.count_map.items()}, - val_type=op_map.val_type - ) - else: - return op_map + return _get_op_map_for_single_kernel( + program[entrypoint], program.callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) # }}} +# {{{ subgoup size finding + def _find_subgroup_size_for_knl(knl): from loopy.target.pyopencl import PyOpenCLTarget if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: @@ -1572,20 +1860,66 @@ def _process_subgroup_size(knl, subgroup_size_requested): "must be integer, 'guess', or, if you're feeling " "lucky, None." 
% (subgroup_size_requested)) +# }}} + # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): +def _get_mem_access_map_for_single_kernel(knl, callables_table, + count_redundant_work, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_mem_access_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + + access_counter_g = GlobalMemAccessCounter( + knl, callables_table, kernel_rec) + access_counter_l = LocalMemAccessCounter( + knl, callables_table, kernel_rec) + access_map = access_counter_g.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + insn_access_map = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + for assignee in insn.assignees: + insn_access_map = insn_access_map + ( + access_counter_g(assignee) + + access_counter_l(assignee) + ).with_set_attributes(direction="store") + + for key, val in insn_access_map.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + access_map = access_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return access_map + + +def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, + subgroup_size=None, entrypoint=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. 
- :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1662,72 +1996,86 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ - subgroup_size = _process_subgroup_size(knl, subgroup_size) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + entrypoint = list(program.entrypoints)[0] - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + assert entrypoint in program.entrypoints - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - insn_access_map = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") + from loopy.preprocess import preprocess_program, infer_unknown_types - for assignee in insn.assignees: - insn_access_map += ( - access_counter_g(assignee) - + access_counter_l(assignee) - ).with_set_attributes(direction="store") + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) - for key, val in insn_access_map.count_map.items(): - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): - pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - if numpy_types: - return ToCountMap( - init_dict={ - MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tags=mem_access.variable_tags, - count_granularity=mem_access.count_granularity): - ct - for mem_access, ct in access_map.count_map.items()}, - val_type=access_map.val_type - ) - else: - return access_map + return _get_mem_access_map_for_single_kernel( + program[entrypoint], program.callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def _get_synchronization_map_for_single_kernel(knl, callables_table, + subgroup_size=None): + + knl = lp.get_one_linearized_kernel(knl, callables_table) + + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + CallKernel, ReturnFromKernel, RunInstruction) + kernel_rec = partial(_get_synchronization_map_for_single_kernel, + callables_table=callables_table, + subgroup_size=subgroup_size) + + sync_counter = CounterBase(knl, callables_table, kernel_rec) + sync_map = sync_counter.new_zero_poly_map() + + iname_list = [] + + for sched_item in knl.schedule: + if isinstance(sched_item, EnterLoop): + if sched_item.iname: # (if not 
empty) + iname_list.append(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + if sched_item.iname: # (if not empty) + iname_list.pop() + + elif isinstance(sched_item, Barrier): + sync_map = sync_map + ToCountMap( + {Sync( + "barrier_%s" % sched_item.synchronization_kind, + knl.name): count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, RunInstruction): + pass + + elif isinstance(sched_item, CallKernel): + sync_map = sync_map + ToCountMap( + {Sync("kernel_launch", knl.name): + count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, ReturnFromKernel): + pass + + else: + raise LoopyError("unexpected schedule item: %s" + % type(sched_item).__name__) + + return sync_map + + +def get_synchronization_map(program, subgroup_size=None, entrypoint=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -1763,79 +2111,30 @@ def get_synchronization_map(knl, subgroup_size=None): # (now use this count to, e.g., predict performance) """ + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, - CallKernel, ReturnFromKernel, RunInstruction) - from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_linearized_kernel(knl) - iname_list = [] + entrypoint = list(program.entrypoints)[0] - result = ToCountMap() + assert entrypoint in program.entrypoints + from loopy.preprocess import preprocess_program, infer_unknown_types - one = isl.PwQPolynomial("{ 1 }") + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) - def get_count_poly(iname_list): - if iname_list: # (if iname_list is not empty) - ct = (count(knl, ( - knl.get_inames_domain(iname_list). - project_out_except(iname_list, [dim_type.set]) - )), ) - return reduce(mul, ct) - else: - return one - - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - if sched_item.iname: # (if not empty) - iname_list.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - if sched_item.iname: # (if not empty) - iname_list.pop() - - elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: - get_count_poly(iname_list)}) - - elif isinstance(sched_item, CallKernel): - result = result + ToCountMap( - {"kernel_launch": get_count_poly(iname_list)}) - - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): - pass - - else: - raise LoopyError("unexpected schedule item: %s" - % type(sched_item).__name__) - - return result + return _get_synchronization_map_for_single_kernel( + program[entrypoint], program.callables_table, + subgroup_size=subgroup_size) # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` to - :class:`islpy.Set` instances capturing which indices of each the array - *var_name* are read/written (where *direction* is either ``read`` or - ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for accesses - on which the footprint cannot be determined (e.g. 
data-dependent or - nonlinear indices) - """ - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - +def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): write_footprints = [] read_footprints = [] @@ -1858,6 +2157,48 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False, entrypoint=None): + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. + + :arg ignore_uncountable: If *False*, an error will be raised for accesses + on which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) + """ + + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") + + entrypoint = list(program.entrypoints)[0] + + assert entrypoint in program.entrypoints + + # FIMXE: works only for one callable kernel till now. + if len([in_knl_callable for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. 
+ program = infer_unknown_types(program, expect_completion=True) + + write_footprints = [] + read_footprints = [] + + write_footprints, read_footprints = _gather_access_footprints_for_single_kernel( + program[entrypoint], ignore_uncountable) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1872,7 +2213,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1883,12 +2224,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 15570c4607f90fa3fc03ea6d2cb3c4ab11458326..9917de098786966a702b8948728f6aae33d835d3 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase - +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl @@ -89,6 +89,10 @@ __doc__ = """ .. autoclass:: ExpansionState .. 
autoclass:: RuleAwareIdentityMapper + +.. autoclass:: ResolvedFunction + +.. autoclass:: SubArrayRef """ @@ -138,7 +142,14 @@ class IdentityMapperMixin: return expr def map_type_annotation(self, expr, *args, **kwargs): - return type(expr)(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + + def map_sub_array_ref(self, expr, *args, **kwargs): + return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), + self.rec(expr.subscript, *args, **kwargs)) + + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -202,17 +213,36 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_sub_array_ref(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + map_fortran_division = WalkMapperBase.map_quotient class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): def map_reduction(self, expr, *args, **kwargs): return self.rec(expr.expr, *args, **kwargs) + def map_sub_array_ref(self, expr): + return self.combine(( + self.rec(expr.subscript), + self.combine(tuple(self.rec(idx) for idx in expr.swept_inames)))) + map_linear_subscript = CombineMapperBase.map_subscript map_fortran_division = CombineMapperBase.map_quotient @@ -274,6 +304,16 @@ class StringifyMapper(StringifyMapperBase): return "cast({}, {})".format( repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + # underlining a resolved call + return "\u0332".join(str(expr.function)) + + def map_sub_array_ref(self, expr, prec): + return "[{inames}]: {subscr}".format( + 
inames=",".join(self.rec(iname, prec) for iname in + expr.swept_inames), + subscr=self.rec(expr.subscript, prec)) + def map_fortran_division(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE result = self.map_quotient(expr, PREC_NONE) @@ -318,7 +358,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation)): # noqa + or type(expr.function) != type(other.function)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -350,8 +390,7 @@ class DependencyMapper(DependencyMapperBase): def map_call(self, expr, *args, **kwargs): # Loopy does not have first-class functions. Do not descend # into 'function' attribute of Call. - return self.combine( - self.rec(child, *args, **kwargs) for child in expr.parameters) + return self.rec(expr.parameters, *args, **kwargs) def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) @@ -363,14 +402,25 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() + def map_sub_array_ref(self, expr, *args, **kwargs): + deps = self.rec(expr.subscript, *args, **kwargs) + return deps - set(expr.swept_inames) + map_linear_subscript = DependencyMapperBase.map_subscript def map_type_cast(self, expr, *args, **kwargs): return self.rec(expr.child, *args, **kwargs) + def map_resolved_function(self, expr): + return self.rec(expr.function) + def map_literal(self, expr): return set() + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + map_fortran_division = DependencyMapperBase.map_quotient @@ -637,7 +687,6 @@ class Reduction(LoopyExpressionBase): Represents a reduction operation on :attr:`expr` across :attr:`inames`. .. 
attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` .. attribute:: inames @@ -765,6 +814,171 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") +class ResolvedFunction(LoopyExpressionBase): + """ + A function identifier whose definition is known in a :mod:`loopy` program. + A function is said to be *known* in a :class:`~loopy.TranslationUnit` if its + name maps to an :class:`~loopy.kernel.function_interface.InKernelCallable` + in :attr:`loopy.TranslationUnit.callables_table`. Refer to :ref:`func-interface`. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ReductionOpFunction + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, ReductionOpFunction): + return self.function + else: + raise LoopyError("Unexpected function type %s in ResolvedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() + + mapper_method = intern("map_resolved_function") + + +class EvaluatorWithDeficientContext(PartialEvaluationMapper): + """Evaluation Mapper that does not need values of all the variables + involved in the expression. + + Returns the expression with the values mapped from :attr:`context`. 
+ """ + def map_variable(self, expr): + if expr.name in self.context: + return self.context[expr.name] + else: + return expr + + +class VariableInAnExpression(CombineMapper): + def __init__(self, variables_to_search): + assert(all(isinstance(variable, p.Variable) for variable in + variables_to_search)) + self.variables_to_search = variables_to_search + + def combine(self, values): + return any(values) + + def map_variable(self, expr): + return expr in self.variables_to_search + + def map_constant(self, expr): + return False + + +class SweptInameStrideCollector(CoefficientCollectorBase): + """ + Mapper to compute the coefficient swept inames for :class:`SubArrayRef`. + """ + def map_algebraic_leaf(self, expr): + # subscripts that are not involved in :attr:`target_names` are treated + # as constants. + if isinstance(expr, p.Subscript) and (self.target_names is None + or expr.aggregate.name not in self.target_names): + return {1: expr} + + return super().map_algebraic_leaf(expr) + + +def get_start_subscript_from_sar(sar, kernel): + """ + Returns an instance of :class:`pymbolic.primitives.Subscript`, the + beginning subscript of the array swept by the *SubArrayRef*. + + **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning + subscript would be ``a[0, j, 0, l]`` + """ + + def _get_lower_bound(iname): + pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff + return int(pw_aff_to_expr(pwaff)) + + swept_inames_to_zeros = { + swept_iname.name: _get_lower_bound(swept_iname.name) for + swept_iname in sar.swept_inames} + + return EvaluatorWithDeficientContext(swept_inames_to_zeros)( + sar.subscript) + + +class SubArrayRef(LoopyExpressionBase): + """ + An algebraic expression to map an affine memory layout pattern (known as + sub-arary) as consecutive elements of the sweeping axes which are defined + using :attr:`SubArrayRef.swept_inames`. + + .. 
attribute:: swept_inames + + An instance of :class:`tuple` denoting the axes to which the sub array + is supposed to be mapped to. + + .. attribute:: subscript + + An instance of :class:`pymbolic.primitives.Subscript` denoting the + array in the kernel. + + .. automethod:: is_equal + """ + + init_arg_names = ("swept_inames", "subscript") + + def __init__(self, swept_inames, subscript): + + # {{{ sanity checks + + if not isinstance(swept_inames, tuple): + assert isinstance(swept_inames, p.Variable) + swept_inames = (swept_inames,) + + assert isinstance(swept_inames, tuple) + + for iname in swept_inames: + assert isinstance(iname, p.Variable) + assert isinstance(subscript, p.Subscript) + + # }}} + + self.swept_inames = swept_inames + self.subscript = subscript + + def __getinitargs__(self): + return (self.swept_inames, self.subscript) + + def get_hash(self): + return hash((self.__class__, self.swept_inames, self.subscript)) + + def is_equal(self, other): + """ + Returns *True* iff the sub-array refs have identical expressions. 
+ """ + return (other.__class__ == self.__class__ + and other.subscript == self.subscript + and other.swept_inames == self.swept_inames) + + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() + + mapper_method = intern("map_sub_array_ref") + + class FortranDivision(p.QuotientBase, LoopyExpressionBase): """This exists for the benefit of the Fortran frontend, which specializes to floating point division for floating point inputs and round-to-zero @@ -807,12 +1021,37 @@ def get_reduction_inames(expr): return _get_dependencies_and_reduction_inames(expr)[1] +class SubArrayRefSweptInamesCollector(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_sub_array_ref(self, expr): + return frozenset({iname.name for iname in expr.swept_inames}) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + map_resolved_function = map_constant + + +def get_sub_array_ref_swept_inames(expr): + return SubArrayRefSweptInamesCollector()(expr) + + # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tags - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -1011,12 +1250,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tags = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - 
return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tags, (), expn_state) + return self.map_substitution(name, tags, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1068,6 +1309,10 @@ class RuleAwareIdentityMapper(IdentityMapper): return sym def __call__(self, expr, kernel, insn): + """ + :arg insn: A :class:`~loopy.kernel.InstructionBase` of which *expr* is + a part of, or *None* if *expr*'s source is not an instruction. + """ from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1348,8 +1593,10 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) import loopy as lp + if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1365,6 +1612,26 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + + elif pstate.is_next(_openbracket): + rollback_pstate = pstate.copy() + pstate.advance() + pstate.expect_not_end() + if pstate.is_next(_closebracket): + swept_inames = () + else: + swept_inames = self.parse_expression(pstate) + + pstate.expect(_closebracket) + pstate.advance() + if pstate.is_next(_colon): + # pstate.expect(_colon): + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return SubArrayRef(swept_inames, subscript) + else: + pstate = rollback_pstate + return super().parse_prefix(rollback_pstate) else: return super().parse_prefix(pstate) @@ -2066,7 +2333,7 @@ def get_access_map(domain, subscript, assumptions=None, shape=None, except ExpressionToAffineConversionError as err: 
shape_aff = None - if shape is not None: + if shape is not None and shape[idim] is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) except ExpressionToAffineConversionError: @@ -2200,6 +2467,33 @@ class BatchedAccessMapMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + arg_name = expr.subscript.aggregate.name + if arg_name not in self._var_names: + return + + if arg_name in self.bad_subscripts: + return + + total_inames = inames | {iname.name for iname in expr.swept_inames} + assert total_inames not in self.access_maps[arg_name] + + self.rec(expr.subscript, total_inames) + + # {{{ project out swept_inames as within inames they are swept locally + + amap = self.access_maps[arg_name].pop(total_inames) + for iname in expr.swept_inames: + dt, pos = amap.get_var_dict()[iname.name] + amap = amap.project_out(dt, pos, 1) + + # }}} + + if self.access_maps[arg_name][inames] is None: + self.access_maps[arg_name][inames] = amap + else: + self.access_maps[arg_name][inames] |= amap + class AccessRangeMapper: """**IMPORTANT** diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 8af47c41222416fbd2dbe3dc5a88d4090a4a06f0..a6357a12b61d2b837ab1cd016554c2c5af100024 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -87,7 +87,10 @@ class TargetBase: def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_entrypoint_check(self, kernel, callables_table): + pass + + def pre_codegen_callable_check(self, kernel, callables_table): pass # }}} @@ -157,8 +160,15 @@ class ASTBuilderBase: # {{{ library - def function_manglers(self): - return [] + @property + def known_callables(self): + """ + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for the + function ids known to *self.target*. 
+ """ + # FIXME: @inducer: Do we need to move this to TargetBase? + return {} def symbol_manglers(self): return [] @@ -170,6 +180,10 @@ class ASTBuilderBase: # {{{ code generation guts + @property + def ast_module(self): + raise NotImplementedError() + def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): raise NotImplementedError diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 92b94d658fd24f44ff4b8b0ba748f3cd5212617a..31d53ebf48c903298b3e8a7bc8a36c9e57f3c9aa 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -24,7 +24,6 @@ THE SOFTWARE. """ import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -32,6 +31,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from loopy.tools import remove_common_indentation @@ -72,11 +72,13 @@ class DTypeRegistryWrapper: return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" @@ -91,7 +93,8 @@ def c99_preamble_generator(preamble_info): if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes): yield("10_stdint", "#include ") if any(dtype.numpy_dtype == np.dtype("bool") - for dtype in preamble_info.seen_dtypes): + 
for dtype in preamble_info.seen_dtypes + if isinstance(dtype, NumpyType)): yield("10_stdbool", "#include ") if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes): yield("10_complex", "#include ") @@ -421,10 +424,10 @@ class CFamilyTarget(TargetBase): return self.get_dtype_registry().dtype_to_ctype(dtype) def get_kernel_executor_cache_key(self, *args, **kwargs): - return None # TODO: ??? + raise NotImplementedError def get_kernel_executor(self, knl, *args, **kwargs): - raise NotImplementedError() + raise NotImplementedError # }}} @@ -447,42 +450,60 @@ def c_symbol_mangler(kernel, name): # float NAN as defined in C99 standard if name == "NAN": return NumpyType(np.dtype(np.float32)), name + + if name in ["INT_MAX", "INT_MIN"]: + return NumpyType(np.dtype(np.int32)), name + return None # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. 
+ """ - # {{{ (abs|max|min) -> (fabs|fmax|fmin) + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name - if name in ["abs", "min", "max"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype.kind == "f": - name = "f" + name + # {{{ (abs|max|min) -> (fabs|fmax|fmin) - # }}} + if name in ["abs", "min", "max"]: + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_id_to_dtype.values()]) + if dtype.kind == "f": + name = "f" + name + + # }}} + + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc", "abs", "real", "imag", "conj"]: - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind in "fc"): + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") - dtype = arg_dtypes[0].numpy_dtype - real_dtype = np.empty(0, dtype=dtype).real.dtype + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if modify_name: + dtype = arg_id_to_dtype[0].numpy_dtype + real_dtype = np.empty(0, dtype=dtype).real.dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + + # for CUDA, C Targets the name must be modified if real_dtype == np.float64: pass # fabs elif real_dtype == np.float32: @@ -491,29 +512,46 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): and real_dtype == np.float128): # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError(f"{name} does not support type {real_dtype}") - - if dtype.kind == "c": - name = "c" + name - 
- return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - # binary functions - if (name in ["fmax", "fmin", "copysign", "pow"] - and len(arg_dtypes) == 2): - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - real_dtype = np.empty(0, dtype=dtype).real.dtype - - if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c": - raise LoopyTypeError(f"{name} does not support complex numbers") - - elif real_dtype.kind in "fc": - if modify_name: + raise LoopyTypeError("{} does not support type {}".format(name, + dtype)) + + if name in ["abs", "real", "imag"]: + dtype = real_dtype + + if dtype.kind == "c" or name in ["real", "imag", "abs"]: + if name != "conj": + name = "c" + name + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + + # binary functions + elif name in ["fmax", "fmin", "pow", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." 
% name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + real_dtype = np.empty(0, dtype=dtype).real.dtype + + if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support complex numbers") + + elif real_dtype.kind in "fc": if real_dtype == np.float64: pass # fmin elif real_dtype == np.float32: @@ -523,50 +561,93 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): name = name + "l" # fminl else: raise LoopyTypeError("%s does not support type %s" - % (name, real_dtype)) - - if dtype.kind == "c": - name = "c" + name # cpow - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - # complex functions - if (name in ["abs", "real", "imag"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "c"): - dtype = arg_dtypes[0].numpy_dtype - real_dtype = np.empty(0, dtype=dtype).real.dtype - - if modify_name: - if real_dtype == np.float64: - pass # fabs - elif real_dtype == np.float32: - name = name + "f" # fabsf - elif (hasattr(np, "float128") - and real_dtype == np.float128): # pylint:disable=no-member - name = name + "l" # fabsl - else: - raise LoopyTypeError(f"{name} does not support type {real_dtype}") + % (name, dtype)) + if dtype.kind == "c": + name = "c" + name # cpow + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + elif name in ["max", "min"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." 
% name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't resolved enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + if dtype.kind not in "iu": + # only support integers for now to avoid having to deal with NaNs + raise LoopyError(f"{name} does not support '{dtype}' arguments.") + + return ( + self.copy(name_in_target=f"lpy_{name}_{dtype.name}", + arg_id_to_dtype={-1: NumpyType(dtype), + 0: NumpyType(dtype), + 1: NumpyType(dtype)}), + callables_table) + elif name == "isnan": + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + return ( + self.copy( + name_in_target=name, + arg_id_to_dtype={ + 0: NumpyType(dtype), + -1: NumpyType(np.int32)}), + callables_table) + + def generate_preambles(self, target): + if self.name_in_target.startswith("lpy_max"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + + yield ("40_lpy_max", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a > b ? a : b); + }}""") - name = "c" + name + if self.name_in_target.startswith("lpy_min"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + yield ("40_lpy_min", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a < b ? 
a : b); + }}""") - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(real_dtype),), - arg_dtypes=arg_dtypes) - if (name == "isnan" and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(np.int32),), - arg_dtypes=arg_dtypes) +def get_c_callables(): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. + """ + cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc", "isnan", "real", "imag", + "conj"] - return None + return {id_: CMathCallable(id_) for id_ in cmath_ids} # }}} @@ -574,12 +655,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super().symbol_manglers() + [ @@ -592,6 +667,12 @@ class CFamilyASTBuilder(ASTBuilderBase): _preamble_generator, ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_c_callables()) + return callables + # }}} # {{{ code generation @@ -678,9 +759,13 @@ class CFamilyASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" + if codegen_state.is_entrypoint: + name = Value("void", name) + else: + name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( - Value("void", name), + name, [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) @@ -709,8 +794,8 @@ class CFamilyASTBuilder(ASTBuilderBase): temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( - 
temporaries_read_in_subkernel(kernel, subkernel) | - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( kernel.temporary_variables.values(), @@ -831,6 +916,11 @@ class CFamilyASTBuilder(ASTBuilderBase): # {{{ code generation guts + @property + def ast_module(self): + import cgen + return cgen + def get_expression_to_code_mapper(self, codegen_state): return self.get_expression_to_c_expression_mapper(codegen_state) @@ -993,83 +1083,33 @@ class CFamilyASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.callables_table[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
+ if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes, - mangle_result.result_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + # takes "is_returned" to infer whether insn.assignees[0] is a part of + # LHS. 
+ in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast_lazy( - lambda: mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): @@ -1207,15 +1247,20 @@ class ExecutableCTarget(CTarget): """ An executable CFamilyTarget that uses (by default) JIT compilation of C-code """ - def __init__(self, compiler=None, fortran_abi=False): super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler self.compiler = compiler or CCompiler() - def get_kernel_executor(self, knl, *args, **kwargs): + def get_kernel_executor_cache_key(self, *args, **kwargs): + # This is for things like the context in OpenCL. There is no such + # thing that CPU JIT is specific to. 
+ return None + + def get_kernel_executor(self, t_unit, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, compiler=self.compiler) + return CKernelExecutor(t_unit, entrypoint=kwargs.pop("entrypoint"), + compiler=self.compiler) def get_host_ast_builder(self): # enable host code generation diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index f664e3ee1d1e8544edb218e93acc0b4ee32d1112..ed31fad29ee7e9dc3a96e0d80680346ce53b3441 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -166,7 +166,8 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join(f'"{arg.name}": {arg.name}' for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info @@ -407,7 +408,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, entrypoint, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). 
If the @@ -416,54 +417,57 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super().__init__(kernel) + super().__init__(program, entrypoint) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, kernel, entrypoint, codegen_result): generator = CExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(kernel, entrypoint, codegen_result) def get_wrapper_generator(self): return CExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = "\n".join([dev_code, "", host_code]) - if self.kernel.options.write_cl: + if self.program[entrypoint].options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program[entrypoint].options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] + for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, 
self.kernel.target, - self.compiler)) + codegen_result.implemented_data_infos[entrypoint], all_code, + self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) # }}} @@ -480,7 +484,9 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 94486cdbe89f67824f066082a4504615459175ba..7a1fa6d895ed9826b20e22804989a077bfde9bc7 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -35,9 +35,9 @@ from pymbolic import var from loopy.expression import dtype_to_type_context -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType from loopy.target.c import CExpression @@ -62,7 +62,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -176,6 +177,11 @@ class 
ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + from loopy.symbolic import get_start_subscript_from_sar + return var("&")(self.rec(get_start_subscript_from_sar(expr, self.kernel), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] @@ -442,104 +448,12 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function - - # {{{ implement indexof, indexof_vec - - if identifier.name in ["indexof", "indexof_vec"]: - if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) - arg, = expr.parameters - if not isinstance(arg, Subscript): - raise LoopyError( - "argument to %s must be a subscript" % identifier.name) - - ary = self.find_array(arg) - - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) - - from loopy.kernel.data import ImageArg - if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) - - if identifier.name == "indexof": - return access_info.subscripts[0] - elif identifier.name == "indexof_vec": - from loopy.kernel.array import VectorArrayDimTag - ivec = None - for iaxis, dim_tag in enumerate(ary.dim_tags): - if isinstance(dim_tag, VectorArrayDimTag): - ivec = iaxis - - if ivec is None: - return access_info.subscripts[0] - else: - return ( - access_info.subscripts[0]*ary.shape[ivec] - + access_info.vector_index) - - else: - raise RuntimeError("should not get here") - - # }}} - - if 
isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. 
- processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + return ( + self.codegen_state.callables_table[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables @@ -566,6 +480,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_power(self, expr, type_context): tgt_dtype = self.infer_type(expr) + base_dtype = self.infer_type(expr.base) exponent_dtype = self.infer_type(expr.exponent) from pymbolic.primitives import is_constant, is_zero @@ -587,10 +502,21 @@ class ExpressionToCExpressionMapper(IdentityMapper): "int_pow", func_name, (tgt_dtype, exponent_dtype), (tgt_dtype, ))) + # FIXME: This need some more callables to be registered. 
return var(func_name)(self.rec(expr.base, type_context), self.rec(expr.exponent, type_context)) else: - return self.rec(var("pow")(expr.base, expr.exponent), type_context) + from loopy.codegen import SeenFunction + clbl = self.codegen_state.ast_builder.known_callables["pow"] + clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, + self.codegen_state.callables_table)[0] + self.codegen_state.seen_functions.add( + SeenFunction( + clbl.name, clbl.name_in_target, + (base_dtype, exponent_dtype), + (tgt_dtype,))) + return var(clbl.name_in_target)(self.rec(expr.base, type_context), + self.rec(expr.exponent, type_context)) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 67dc1fe249af91d9b73a7162867dcd98c7ef6bc7..63018189e7aaa729f6a008b4768d479f78e3cfeb 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -29,10 +29,11 @@ from pytools import memoize_method from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -110,43 +111,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, callables_table): - return dtype, name + name = self.name - if name in ["pow"] and 
len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] - if dtype == np.float64: - pass # pow - elif dtype == np.float32: - name = name + "f" # powf - else: - raise RuntimeError(f"{name} does not support type {dtype}") + # {{{ sanity checks + + for id, dtype in arg_id_to_dtype.items(): + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + if dtype is not None and dtype.kind == "c": + raise LoopyTypeError( + f"'{name}' does not support complex arguments.") + + # }}} + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + updated_arg_id_to_dtype = {id: NumpyType(dtype) + for id in range(-1, num_args)} + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) - return dtype, name + if name == "dot": + # CUDA dot function: + # Performs dot product. Input types: vector and return type: scalar. 
+ for i in range(2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + input_dtype = arg_id_to_dtype[0] - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + scalar_dtype, offset, field_name = input_dtype.fields["x"] + return_dtype = scalar_dtype + return self.copy(arg_id_to_dtype={0: input_dtype, 1: input_dtype, + -1: return_dtype}) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - return None + +def get_cuda_callables(): + cuda_func_ids = {"dot"} | set(_CUDA_SPECIFIC_FUNCTIONS) + return {id_: CudaCallable(name=id_) for id_ in cuda_func_ids} # }}} @@ -192,6 +232,9 @@ class CudaTarget(CFamilyTarget): super().__init__() + def split_kernel_at_global_barriers(self): + return True + def get_device_ast_builder(self): return CUDACASTBuilder(self) @@ -225,16 +268,51 @@ class CudaTarget(CFamilyTarget): # }}} +# {{{ preamable generator + +def cuda_preamble_generator(preamble_info): + from loopy.types import AtomicNumpyType + seen_64_bit_atomics = any( + isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8 + for dtype in preamble_info.seen_atomic_dtypes) + + if seen_64_bit_atomics: + # Source: + # docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + yield ("00_enable_64bit_atomics", """ + #if __CUDA_ARCH__ < 600 + __device__ double atomicAdd(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); + } + 
#endif + """) + +# }}} + + # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - cuda_function_mangler - ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_cuda_callables()) + return callables # }}} @@ -260,7 +338,8 @@ class CUDACASTBuilder(CFamilyASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -273,6 +352,12 @@ class CUDACASTBuilder(CFamilyASTBuilder): return FunctionDeclarationWrapper(fdecl) + def preamble_generators(self): + + return ( + super().preamble_generators() + [ + cuda_preamble_generator]) + # }}} # {{{ code generation guts @@ -350,6 +435,97 @@ class CUDACASTBuilder(CFamilyASTBuilder): return CudaConstant(arg_decl) + # {{{ code generation for atomic update + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum + from cgen import Statement + from pymbolic.mapper.stringifier import PREC_NONE + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ + np.int32, np.int64, np.float32, np.float64]: + # atomicAdd + if isinstance(rhs_expr, Sum): + ecm = self.get_expression_to_code_mapper(codegen_state) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Statement("atomicAdd(&{}, {})".format( + lhs_expr_code, rhs_expr_code)) + else: + from cgen import Block, DoWhile, Assign + from loopy.target.c import POD + old_val_var = codegen_state.var_name_generator("loopy_old_val") + new_val_var = 
codegen_state.var_name_generator("loopy_new_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + }) + + lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) + + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + from loopy.symbolic import SubstitutionMapper + + subst = SubstitutionMapper( + make_subst_func({lhs_expr: var(old_val_var)})) + rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + cast_str = "" + old_val = old_val_var + new_val = new_val_var + + if lhs_dtype.numpy_dtype.kind == "f": + if lhs_dtype.numpy_dtype == np.float32: + ctype = "int" + elif lhs_dtype.numpy_dtype == np.float64: + ctype = "long" + else: + assert False + + old_val = "*(%s *) &" % ctype + old_val + new_val = "*(%s *) &" % ctype + new_val + cast_str = "(%s *) " % (ctype) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), + DoWhile( + "atomicCAS(" + "%(cast_str)s&(%(lhs_expr)s), " + "%(old_val)s, " + "%(new_val)s" + ") != %(old_val)s" + % { + "cast_str": cast_str, + "lhs_expr": lhs_expr_code, + "old_val": old_val, + "new_val": new_val, + }, + Block([ + Assign(old_val_var, lhs_expr_code), + Assign(new_val_var, rhs_expr_code), + ]) + ) + ]) + else: + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + + # }}} + # }}} # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 222839e56fce5742b6d128c35397d3a7f9900d21..68a60f28d48d9a20d6a17c4724bf6b215a80ff0a 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -59,12 +59,13 @@ class SeparateArrayPackingController: It also repacks outgoing arrays of this type back into an 
object array. """ - def __init__(self, kernel): + def __init__(self, program, entrypoint): + # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program[entrypoint].args: if not isinstance(arg, ArrayBase): continue @@ -80,7 +81,8 @@ class SeparateArrayPackingController: name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program[entrypoint].get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -158,7 +160,7 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -183,7 +185,8 @@ class ExecutionWrapperGeneratorBase(ABC): if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -230,7 +233,7 @@ class ExecutionWrapperGeneratorBase(ABC): # {{{ integer arg finding from offsets def generate_integer_arg_finding_from_offsets(self, gen, kernel, - implemented_data_info): + implemented_data_info): options = kernel.options gen("# {{{ find integer arguments from offsets") @@ -634,7 +637,7 @@ class ExecutionWrapperGeneratorBase(ABC): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -646,12 +649,12 @@ class ExecutionWrapperGeneratorBase(ABC): 
kernel """ - options = kernel.options - implemented_data_info = codegen_result.implemented_data_info + options = program[entrypoint].options + implemented_data_info = codegen_result.implemented_data_infos[entrypoint] from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % entrypoint, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -666,21 +669,24 @@ class ExecutionWrapperGeneratorBase(ABC): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) - + gen, program[entrypoint], implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program[entrypoint], implemented_data_info, options) + + #FIXME: should we make this as a dict as well. + host_program_name = codegen_result.host_programs[entrypoint].name - self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + self.generate_invocation(gen, host_program_name, args, + program[entrypoint], implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program[entrypoint], + implemented_data_info) if options.write_wrapper: output = gen.get() @@ -728,64 +734,66 @@ class KernelExecutorBase: .. 
automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program, entrypoint): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program + self.entrypoint = entrypoint - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program, + entrypoint) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program[entrypoint].args + if arg.is_output) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program[entrypoint].args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes + from loopy.kernel import KernelState + from loopy.translation_unit import resolve_callables - kernel = self.kernel + program = resolve_callables(self.program) if arg_to_dtype_set: var_to_dtype = {} + entry_knl = program[entrypoint] for var, dtype in arg_to_dtype_set: - try: - dest_name = kernel.impl_arg_to_arg[var].name - except KeyError: + if var in entry_knl.impl_arg_to_arg: + dest_name = entry_knl.impl_arg_to_arg[var].name + else: dest_name = var - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) + var_to_dtype[dest_name] = dtype - kernel = add_dtypes(kernel, var_to_dtype) + program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.state < 
KernelState.LINEARIZED: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_linearized_kernel - kernel = get_one_linearized_kernel(kernel) + for e in program.entrypoints: + program = program.with_kernel( + get_one_linearized_kernel(program[e], program.callables_table)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, entrypoint, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. - cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -793,9 +801,11 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % + self.program.entrypoints) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(entrypoint, + arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -803,10 +813,13 @@ class KernelExecutorBase: return kernel def arg_to_dtype_set(self, kwargs): + kwargs = kwargs.copy() if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + entrypoint = kwargs.pop("entrypoint") + + impl_arg_to_arg = self.program[entrypoint].impl_arg_to_arg arg_to_dtype = {} for arg_name, val in kwargs.items(): arg = impl_arg_to_arg.get(arg_name, None) @@ -827,18 +840,18 @@ class KernelExecutorBase: # {{{ debugging aids - def get_highlighted_code(self, 
arg_to_dtype=None, code=None): + def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None): if code is None: - code = self.get_code(arg_to_dtype) + code = self.get_code(entrypoint, arg_to_dtype) return get_highlighted_code(code) - def get_code(self, arg_to_dtype=None): + def get_code(self, entrypoint, arg_to_dtype=None): def process_dtype(dtype): if isinstance(dtype, type) and issubclass(dtype, np.generic): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -846,22 +859,19 @@ class KernelExecutorBase: arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in arg_to_dtype.items()) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) return code.device_code() - def get_invoker_uncached(self, kernel, *args): - raise NotImplementedError() - - def get_wrapper_generator(self): + def get_invoker_uncached(self, program, entrypoint, *args): raise NotImplementedError() - def get_invoker(self, kernel, *args): + def get_invoker(self, program, entrypoint, *args): from loopy import CACHING_ENABLED - cache_key = (self.__class__.__name__, kernel) + cache_key = (self.__class__.__name__, (program, entrypoint)) if CACHING_ENABLED: try: @@ -869,9 +879,9 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: invoker cache miss" % kernel.name) + logger.debug("%s: invoker cache miss" % entrypoint) - invoker = self.get_invoker_uncached(kernel, *args) + invoker = self.get_invoker_uncached(program, entrypoint, *args) if CACHING_ENABLED: invoker_cache.store_if_not_present(cache_key, invoker) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index d047b6464c1fc86010d1943527af68be73278bbb..67af90a24f7d9a2b22c0715e3d09d27b2a572b2a 100644 --- 
a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -171,8 +171,9 @@ class ISPCTarget(CFamilyTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_entrypoint_check(self, kernel, callables_table): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d9afebea98754372dcab3ebd573a241a080ad535..3f3da4a3242774fe1a4bf743e419a593cc7b4955 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -30,11 +30,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -180,77 +180,225 @@ VECTOR_LITERAL_FUNCS = { } -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. 
+ """ - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "pow" and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype == np.float64: - name = "powf64" - elif dtype == np.float32: - name = "powf32" - else: - raise LoopyTypeError(f"'pow' does not support type {dtype}.") - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) - - return None + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name + 
+ # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + # binary functions + elif name in ["fmax", "fmin", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + #FIXME: Do we need to raise here?: + # The pattern we generally follow is that if we don't find + # a function, then we just return None + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + elif name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + common_dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if common_dtype.kind in ["u", "i", "f"]: + if common_dtype.kind == "f": + name = "f"+name + + dtype = NumpyType(common_dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, common_dtype)) + + elif name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError(f"'{name}' can take only 2 arguments.") + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}), + callables_table) + + elif name == "pow": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError(f"'{name}' can take only 2 arguments.") + + common_dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if common_dtype == np.float64: + name = "powf64" + elif common_dtype == np.float32: + name = "powf32" + else: + raise LoopyTypeError(f"'pow' does not support type {dtype}.") + + result_dtype = NumpyType(common_dtype) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: result_dtype, + 0: common_dtype, 1: common_dtype}), + callables_table) + + elif name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = 
_CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + elif name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in + range(count)} + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return ( + self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + +def get_opencl_callables(): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = ( + {"max", "min", "dot", "pow", "abs", "acos", "asin", + "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", + "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc"} + | set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) + | set(VECTOR_LITERAL_FUNCS)) + + return {id_: OpenCLCallable(name=id_) for id_ in + opencl_function_ids} # }}} @@ -274,6 +422,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None @@ -310,6 +460,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -426,13 +577,11 @@ class OpenCLTarget(CFamilyTarget): class OpenCLCASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super().function_manglers()) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_opencl_callables()) + return callables def symbol_manglers(self): return ( @@ -441,13 +590,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super().preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -460,6 +606,11 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.is_entrypoint: + # auxiliary kernels need not mention opencl speicific qualifiers + # for a functions 
signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize @@ -468,7 +619,8 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 40f13a57a0cfafbed95df72f003eb1dd57d3fb8d..d389a434a145615dd34f7e3a259e077e60175349 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -25,13 +25,13 @@ THE SOFTWARE. import numpy as np import pymbolic.primitives as p -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -130,7 +130,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -147,7 +147,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > 
device.max_work_item_dimensions): @@ -195,36 +196,86 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, callables_table): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + callables_table) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. 
+ if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) + # function calls for floating-point parameters. + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ("u", "i"): + dtype = NumpyType(np.float32) + if name == "abs": + name = "fabs" + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) +def get_pyopencl_callables(): + pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"] + return {id_: PyOpenCLCallable(name=id_) for id_ in pyopencl_ids} - return None +# }}} # {{{ preamble generator @@ -555,8 +606,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_entrypoint_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -619,9 +670,10 @@ class PyOpenCLTarget(OpenCLTarget): def get_kernel_executor_cache_key(self, queue, **kwargs): return queue.context - def get_kernel_executor(self, 
kernel, queue, **kwargs): + def get_kernel_executor(self, program, queue, **kwargs): from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor - return PyOpenCLKernelExecutor(queue.context, kernel) + return PyOpenCLKernelExecutor(queue.context, program, + entrypoint=kwargs.pop("entrypoint")) def with_device(self, device): return type(self)(device) @@ -954,21 +1006,20 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler - return ( - [ - pyopencl_function_mangler, - random123_function_mangler - # order matters: e.g. prefer our abs() over that of the - # superclass - ] + super().function_manglers()) + @property + def known_callables(self): + from loopy.library.random123 import get_random123_callables + + # order matters: e.g. prefer our abs() over that of the + # superclass + callables = super().known_callables + callables.update(get_pyopencl_callables()) + callables.update(get_random123_callables(self.target)) + return callables def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super().preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 0a9bafde9608624e7285363e4338f96b551307ea..92b0982c53b355bcf0c0d88732e4a6a49e414200 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -257,7 +257,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program, entrypoint): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -266,62 +266,69 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super().__init__(kernel) + super().__init__(program, entrypoint) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=( - kernel.target.with_device(context.devices[0]))) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=( + program.target.with_device(context.devices[0]))) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, program, entrypoint, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(program, entrypoint, codegen_result) def get_wrapper_generator(self): return PyOpenCLExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, + arg_to_dtype_set) + # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if program[entrypoint].options.write_cl: + #FIXME: redirect to "translation unit" level option as well. 
output = dev_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if program[entrypoint].options.edit_cl: + #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") import pyopencl as cl + #FIXME: redirect to "translation unit" level option as well. cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + for dp in cl_program.kernel_names.split(";"): + setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -356,10 +363,12 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git 
a/loopy/target/python.py b/loopy/target/python.py index 0aa0cf572a33fbc58a79471bdf7b09b12a0bf3a2..15ddc4679a3e1392118853a577411236ac10e7b3 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -23,15 +23,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import numpy as np - from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite +from genpy import Suite, Collection # {{{ expression to code @@ -42,7 +40,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -80,48 +79,30 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.callables_table[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + clbl = self.codegen_state.callables_table[ + expr.function.name] str_parameters = None + number_of_assignees = len([key for key in + clbl.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - 
identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - return "{}({})".format(mangle_result.target_name, ", ".join(str_parameters)) + return "{}({})".format(clbl.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -147,33 +128,8 @@ class ExpressionToPythonMapper(StringifyMapper): # }}} -# {{{ genpy extensions - -class Collection(Suite): - def generate(self): - for item in self.contents: - yield from item.generate() - -# }}} - - # {{{ ast builder -def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): - if (not isinstance(name, str) - or not hasattr(np, name) - or len(arg_dtypes) != 1): - return None - - arg_dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="_lpy_np."+name, - result_dtypes=(arg_dtype,), - arg_dtypes=arg_dtypes) - - def _base_python_preamble_generator(preamble_info): yield ("00_future", "from __future__ import division, print_function\n") yield ("05_numpy_import", """ @@ -185,13 +141,12 @@ class PythonASTBuilderBase(ASTBuilderBase): """A Python host AST builder for integration with PyOpenCL. 
""" - # {{{ code generation guts - - def function_manglers(self): - return ( - super().function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + @property + def known_callables(self): + from loopy.target.c import get_c_callables + callables = super().known_callables + callables.update(get_c_callables()) + return callables def preamble_generators(self): return ( @@ -199,6 +154,13 @@ class PythonASTBuilderBase(ASTBuilderBase): _base_python_preamble_generator ]) + # {{{ code generation guts + + @property + def ast_module(self): + import genpy + return genpy + def get_function_declaration(self, codegen_state, codegen_result, schedule_index): return None diff --git a/loopy/tools.py b/loopy/tools.py index 5be4ca6b58f0a2e0dd5907eacf4749dd3aaf927b..644082ed61143798f3c01e5af820092aabd665af 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -35,6 +35,17 @@ def is_integer(obj): return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
+ """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -52,6 +63,13 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) self.rec(expr.expr, *args) + def map_foreign(self, expr, *args, **kwargs): + """Mapper method dispatch for non-:mod:`pymbolic` objects.""" + if expr is None: + self.key_hash.update(b"") + else: + PersistentHashWalkMapperBase.map_foreign(self, expr, *args, **kwargs) + class LoopyKeyBuilder(KeyBuilderBase): """A custom :class:`pytools.persistent_dict.KeyBuilder` subclass @@ -72,6 +90,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) @@ -99,6 +122,8 @@ class LoopyKeyBuilder(KeyBuilderBase): else: PersistentHashWalkMapper(key_hash)(key) + update_for_PMap = update_for_dict # noqa: N815 + class PymbolicExpressionHashWrapper: def __init__(self, expression): diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index f4f3f2d3a5be6b75454bdac893275ded59fd5406..7a220418f73d88b3388d96d3f8b3cfe778fe15c4 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -24,6 +24,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.translation_unit import for_each_kernel +from loopy.kernel import LoopKernel __doc__ = """ .. 
currentmodule:: loopy @@ -34,6 +36,7 @@ __doc__ = """ # {{{ add_barrier +@for_each_kernel def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, tags=None, synchronization_kind="global", mem_kind=None, within_inames=None): @@ -59,6 +62,8 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, """ + assert isinstance(kernel, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 0a38790152f9e1325733a8bdc47d13f05d400c39..8203f0d528ebc5dfc2a7681ac0e50285c95a1ab9 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -23,9 +23,13 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError +from loopy.translation_unit import for_each_kernel +from loopy.kernel import LoopKernel + # {{{ fold constants +@for_each_kernel def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -49,7 +53,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@for_each_kernel def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index e27e5902644205e8a1643b4c243ba8ae6532fafa..536a7a82620ddcadc027e33aa7d51ce277bdaf08 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.translation_unit import for_each_kernel + + __doc__ = """ .. 
currentmodule:: loopy @@ -98,6 +101,7 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) +@for_each_kernel def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 84503a618b067a147dc5181c2251d17d8b83eb44..400be5554e8d56184cba3435ad80884adfeb02e3 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -30,6 +30,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.translation_unit import TranslationUnit +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var @@ -127,10 +130,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -166,6 +169,20 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
""" + if isinstance(kernel, TranslationUnit): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(buffer_array(kernel[kernel_names[0]], + var_name, buffer_inames, init_expression, store_expression, within, + default_tag, temporary_scope, temporary_is_local, + fetch_bounding_box, kernel.callables_table)) + + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -237,7 +254,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -525,7 +543,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -534,4 +552,25 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, TranslationUnit) + + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + clbl = clbl.copy( + subkernel=buffer_array_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return 
program.copy(callables_table=new_callables) + + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 0000000000000000000000000000000000000000..0180fe208cb1f611dff986b924aec9ba94782f37 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,570 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import islpy as isl + +from pytools import UniqueNameGenerator + +from loopy.kernel import LoopKernel +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + Assignment, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import ( + RuleAwareIdentityMapper, + RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.translation_unit import (TranslationUnit, + for_each_kernel) + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable + +.. autofunction:: inline_callable_kernel + +.. autofunction:: merge +""" + + +def register_callable(translation_unit, function_identifier, callable_, + redefining_not_ok=True): + """ + :param translation_unit: A :class:`loopy.TranslationUnit`. + :param callable_: A :class:`loopy.InKernelCallable`. + """ + + if isinstance(callable_, LoopKernel): + callable_ = CallableKernel(callable_) + + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(callable_, InKernelCallable) + + if (function_identifier in translation_unit.callables_table) and ( + translation_unit.callables_table[function_identifier] != callable_ + and redefining_not_ok): + raise LoopyError("Redefining function identifier not allowed. Set the" + " option 'redefining_not_ok=False' to bypass this error.") + + new_callables = translation_unit.callables_table.set(function_identifier, + callable_) + + return translation_unit.copy( + callables_table=new_callables) + + +def merge(translation_units): + """ + :param translation_units: A sequence of :class:`loopy.TranslationUnit`. + + :returns: An instance of :class:`loopy.TranslationUnit` which contains all the + callables from each of the *translation_units. 
+ """ + + for i in range(1, len(translation_units)): + if translation_units[i].target != translation_units[i-1].target: + raise LoopyError("translation units to be merged should have the" + " same target.") + + # {{{ check for callable collision + + for i, prg_i in enumerate(translation_units): + for prg_j in translation_units[i+1:]: + for clbl_name in (set(prg_i.callables_table) + & set(prg_j.callables_table)): + if (prg_i.callables_table[clbl_name] + != prg_j.callables_table[clbl_name]): + # TODO: generate unique names + rename for the colliding + # callables (if entrypoints are colliding that shuold still + # be an error) + raise NotImplementedError("Translation units to be merged" + " must have different callable names" + " for now.") + + # }}} + + callables_table = {} + for trans_unit in translation_units: + callables_table.update(trans_unit.callables_table.copy()) + + return TranslationUnit( + entrypoints=frozenset().union(*( + t.entrypoints or frozenset() for t in translation_units)), + callables_table=callables_table, + target=translation_units[0].target) + + +# {{{ kernel inliner mapper + +class KernelArgumentSubstitutor(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, caller_knl, + callee_knl, callee_arg_to_call_param): + super().__init__(rule_mapping_context) + self.caller_knl = caller_knl + self.callee_knl = callee_knl + self.callee_arg_to_call_param = callee_arg_to_call_param + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name in self.callee_knl.arg_dict: + from loopy.symbolic import get_start_subscript_from_sar + from loopy.isl_helpers import simplify_via_aff + from pymbolic.primitives import Subscript, Variable + + sar = self.callee_arg_to_call_param[expr.aggregate.name] # SubArrayRef + + callee_arg = self.callee_knl.arg_dict[expr.aggregate.name] + if sar.subscript.aggregate.name in self.caller_knl.arg_dict: + caller_arg = self.caller_knl.arg_dict[sar.subscript.aggregate.name] + else: + caller_arg = 
self.caller_knl.temporary_variables[ + sar.subscript.aggregate.name] + + flatten_index = 0 + for i, idx in enumerate(get_start_subscript_from_sar(sar, + self.caller_knl).index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(expr.index_tuple, callee_arg.dim_tags)) + + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return Subscript(Variable(sar.subscript.aggregate.name), new_indices) + else: + return super().map_subscript(expr, expn_state) + + def map_variable(self, expr, expn_state): + from loopy.kernel.data import ArrayArg, ValueArg + from loopy.symbolic import SubArrayRef + if expr.name in self.callee_knl.arg_dict: + arg = self.callee_knl.arg_dict[expr.name] + par = self.callee_arg_to_call_param[expr.name] + if isinstance(arg, ArrayArg): + assert arg.shape == () + assert isinstance(par, SubArrayRef) and par.swept_inames == () + return par.subscript.aggregate + else: + assert isinstance(arg, ValueArg) + return par + else: + return super().map_variable(expr, expn_state) + +# }}} + + +# {{{ inlining of a single call instruction + +def substitute_into_domain(domain, param_name, expr, allowed_param_dims): + """ + :arg allowed_deps: A :class:`list` of :class:`str` that are + """ + import pymbolic.primitives as prim + from loopy.symbolic import get_dependencies, isl_set_from_expr + if param_name not in domain.get_var_dict(): + # param_name not in domain => domain will be unchanged + return domain + + # {{{ rename 'param_name' to avoid namespace pollution with allowed_param_dims + + dt, pos = domain.get_var_dict()[param_name] + domain = domain.set_dim_name(dt, pos, UniqueNameGenerator( + set(allowed_param_dims))(param_name)) + + # }}} + + for dep in 
get_dependencies(expr): + if dep in allowed_param_dims: + domain = domain.add_dims(isl.dim_type.param, 1) + domain = domain.set_dim_name( + isl.dim_type.param, + domain.dim(isl.dim_type.param)-1, + dep) + else: + raise ValueError("Augmenting caller's domain " + f"with '{dep}' is not allowed.") + + set_ = isl_set_from_expr(domain.space, + prim.Comparison(prim.Variable(param_name), + "==", + expr)) + + bset, = set_.get_basic_sets() + domain = domain & bset + + return domain.project_out(dt, pos, 1) + + +def rename_iname(domain, old_iname, new_iname): + if old_iname not in domain.get_var_dict(): + return domain + + dt, pos = domain.get_var_dict()[old_iname] + return domain.set_dim_name(dt, pos, new_iname) + + +def get_valid_domain_param_names(knl): + from loopy.kernel.data import ValueArg + return ([arg.name for arg in knl.args if isinstance(arg, ValueArg)] + + [tv.name + for tv in knl.temporary_variables.values() + if tv.shape == ()] + + list(knl.all_inames()) + ) + + +def _inline_call_instruction(caller_knl, callee_knl, call_insn): + """ + Returns a copy of *caller_knl* with the *call_insn* in the *kernel* + replaced by inlining *callee_knl* into it within it. + + :arg call_insn: An instance of `loopy.CallInstruction` of the call-site. + """ + import pymbolic.primitives as prim + from pymbolic.mapper.substitutor import make_subst_func + from loopy.kernel.data import ValueArg + + # {{{ sanity checks + + assert call_insn.expression.function.name == callee_knl.name + + # }}} + + callee_label = callee_knl.name[:4] + "_" + vng = caller_knl.get_var_name_generator() + ing = caller_knl.get_instruction_id_generator() + + # {{{ construct callee->caller name mappings + + # name_map: Mapping[str, str] + # A mapping from variable names in the callee kernel's namespace to + # the ones they would be referred by in the caller's namespace post inlining. 
+ name_map = {} + + # only consider temporary variables and inames, arguments would be mapping + # according to the invocation in call_insn. + for name in (callee_knl.all_inames() + | set(callee_knl.temporary_variables.keys())): + new_name = vng(callee_label+name) + name_map[name] = new_name + + # }}} + + # {{{ iname_to_tags + + # new_inames: caller's inames post inlining + new_inames = caller_knl.inames + + for old_name, callee_iname in callee_knl.inames.items(): + new_name = name_map[old_name] + new_inames[new_name] = callee_iname.copy(name=new_name) + + # }}} + + # {{{ register callee's temps as caller's + + # new_temps: caller's temps post inlining + new_temps = caller_knl.temporary_variables.copy() + + for name, tv in callee_knl.temporary_variables.items(): + new_temps[name_map[name]] = tv.copy(name=name_map[name]) + + # }}} + + # {{{ get callee args -> parameters passed to the call + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = call_insn.assignees # writes + parameters = call_insn.expression.parameters # reads + + from loopy.kernel.function_interface import get_kw_pos_association + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) + + for i, par in enumerate(parameters): + arg_map[pos_to_kw[i]] = par + + for i, assignee in enumerate(assignees): + arg_map[pos_to_kw[-i-1]] = assignee + + # }}} + + # {{{ process domains/assumptions + + # rename inames + new_domains = callee_knl.domains.copy() + for old_iname in callee_knl.all_inames(): + new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) + for dom in new_domains] + + # realize domains' dim params in terms of caller's variables + new_assumptions = callee_knl.assumptions + for callee_arg_name, param_expr in arg_map.items(): + if isinstance(callee_knl.arg_dict[callee_arg_name], + ValueArg): + new_domains = [ + substitute_into_domain( + dom, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + for dom in new_domains] + + 
new_assumptions = substitute_into_domain( + new_assumptions, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + + # }}} + + # {{{ rename inames/temporaries in the program + + rule_mapping_context = SubstitutionRuleMappingContext(callee_knl.substitutions, + vng) + subst_func = make_subst_func({old_name: prim.Variable(new_name) + for old_name, new_name in name_map.items()}) + inames_temps_renamer = RuleAwareSubstitutionMapper(rule_mapping_context, + subst_func, + within=lambda *args: True) + + callee_knl = rule_mapping_context.finish_kernel(inames_temps_renamer + .map_kernel(callee_knl)) + + # }}} + + # {{{ map callee's expressions to get expressions after inlining + + rule_mapping_context = SubstitutionRuleMappingContext(callee_knl.substitutions, + vng) + smap = KernelArgumentSubstitutor(rule_mapping_context, caller_knl, + callee_knl, arg_map) + + callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel(callee_knl)) + + # }}} + + # {{{ generate new ids for instructions + + insn_id_map = {} + for insn in callee_knl.instructions: + insn_id_map[insn.id] = ing(callee_label+insn.id) + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=call_insn.within_inames, + depends_on=call_insn.depends_on + ) + noop_end = NoOpInstruction( + id=call_insn.id, + within_inames=call_insn.within_inames, + depends_on=frozenset(insn_id_map.values()) + ) + + # }}} + + # {{{ map callee's instruction ids + + inlined_insns = [noop_start] + + for insn in callee_knl.instructions: + new_within_inames = (frozenset(name_map[iname] + for iname in insn.within_inames) + | call_insn.within_inames) + new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on) + | {noop_start.id}) + new_no_sync_with = frozenset((insn_id_map[id], scope) + for id, scope in insn.no_sync_with) + new_id = insn_id_map[insn.id] 
+ + if isinstance(insn, Assignment): + new_atomicity = tuple(type(atomicity)(name_map[atomicity.var_name]) + for atomicity in insn.atomicity) + insn = insn.copy( + id=insn_id_map[insn.id], + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + atomicity=new_atomicity, + no_sync_with=new_no_sync_with + ) + else: + insn = insn.copy( + id=new_id, + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + no_sync_with=new_no_sync_with + ) + inlined_insns.append(insn) + + inlined_insns.append(noop_end) + + # }}} + + # {{{ swap out call_insn with inlined_instructions + + idx = caller_knl.instructions.index(call_insn) + new_insns = (caller_knl.instructions[:idx] + + inlined_insns + + caller_knl.instructions[idx+1:]) + + # }}} + + old_assumptions, new_assumptions = isl.align_two( + caller_knl.assumptions, new_assumptions) + + return caller_knl.copy(instructions=new_insns, + temporary_variables=new_temps, + domains=caller_knl.domains+new_domains, + assumptions=(old_assumptions.params() + & new_assumptions.params()), + inames=new_inames) + +# }}} + + +# {{{ inline callable kernel + +@for_each_kernel +def _inline_single_callable_kernel(caller_kernel, callee_kernel): + from loopy.symbolic import ResolvedFunction + + # sub-array refs might be removed during inlining + # => remove their swept inames from domains + inames_to_remove = frozenset() + + for insn in caller_kernel.instructions: + if (isinstance(insn, CallInstruction) + and isinstance(insn.expression.function, ResolvedFunction)): + if insn.expression.function.name == callee_kernel.name: + caller_kernel = _inline_call_instruction(caller_kernel, + callee_kernel, insn) + inames_to_remove |= insn.sub_array_ref_inames() + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn)) + + from loopy.transform.iname import remove_unused_inames + return 
remove_unused_inames(caller_kernel, inames_to_remove) + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(translation_unit, function_name): + """ + Returns a copy of *translation_unit* with the callable kernel + named *function_name* inlined at all call-sites. + """ + from loopy.preprocess import infer_arg_descr, filter_reachable_callables + from loopy.translation_unit import resolve_callables + + # {{{ must have argument shape information at call sites to inline + + translation_unit = resolve_callables(translation_unit) + translation_unit = infer_arg_descr(translation_unit) + + # }}} + + callee = translation_unit[function_name] + + return filter_reachable_callables( + _inline_single_callable_kernel(translation_unit, + callee)) + +# }}} + + +# {{{ rename_callable + +def rename_callable(program, old_name, new_name=None, existing_ok=False): + """ + :arg program: An instance of :class:`loopy.TranslationUnit` + :arg old_name: The callable to be renamed + :arg new_name: New name for the callable to be renamed + :arg existing_ok: An instance of :class:`bool` + """ + from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) + from pymbolic import var + + assert isinstance(program, TranslationUnit) + assert isinstance(old_name, str) + + if (new_name in program.callables_table) and not existing_ok: + raise LoopyError(f"callables named '{new_name}' already exists") + + if new_name is None: + namegen = UniqueNameGenerator(program.callables_table.keys()) + new_name = namegen(old_name) + + assert isinstance(new_name, str) + + new_callables_table = {} + + for name, clbl in program.callables_table.items(): + if name == old_name: + name = new_name + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + smap = 
RuleAwareSubstitutionMapper(rule_mapping_context, + {var(old_name): var(new_name)}.get, + within=lambda *args: True) + knl = rule_mapping_context.finish_kernel(smap.map_kernel(knl)) + clbl = clbl.copy(subkernel=knl.copy(name=name)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + new_callables_table[name] = clbl + + new_entrypoints = program.entrypoints.copy() + if old_name in new_entrypoints: + new_entrypoints = ((new_entrypoints | frozenset([new_name])) + - frozenset([old_name])) + + return program.copy(callables_table=new_callables_table, + entrypoints=new_entrypoints) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 4851ffdece47dc092011991c5b7218d96ea953c0..d866f8a5e6e8064e4685c68451f3fa85b32e402c 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -26,6 +26,10 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.translation_unit import (TranslationUnit, + for_each_kernel) +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -136,7 +140,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. default_tag=_not_provided, @@ -235,6 +240,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. 
""" + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -327,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # precompute module, but precompute acutally uses that to adjust its # warning message. - from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, callables_table, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=var_descr.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -362,11 +368,35 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, TranslationUnit) + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + # }}} # {{{ change variable kinds +@for_each_kernel def change_arg_to_image(kernel, name): new_args = [] for arg in kernel.args: @@ -384,6 +414,7 @@ def change_arg_to_image(kernel, name): # {{{ tag array axes +@for_each_kernel def tag_array_axes(kernel, ary_names, dim_tags): """ :arg dim_tags: a tuple of @@ -422,13 +453,15 @@ def tag_array_axes(kernel, ary_names, dim_tags): return kernel -tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes) +tag_data_axes = ( + MovedFunctionDeprecationWrapper(tag_array_axes)) # }}} # {{{ set_array_axis_names +@for_each_kernel def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -453,13 +486,15 @@ def set_array_axis_names(kernel, ary_names, dim_names): return kernel -set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names) +set_array_dim_names = (MovedFunctionDeprecationWrapper( + set_array_axis_names)) # }}} # {{{ remove_unused_arguments +@for_each_kernel def remove_unused_arguments(kernel): new_args = [] @@ -501,6 +536,7 @@ def remove_unused_arguments(kernel): # {{{ alias_temporaries +@for_each_kernel def alias_temporaries(kernel, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of @@ -585,11 +621,14 @@ def alias_temporaries(kernel, names, base_name_prefix=None, # {{{ set argument order +@for_each_kernel def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ + #FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # take a within? 
if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -618,6 +657,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@for_each_kernel def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 @@ -691,6 +731,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@for_each_kernel def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -732,6 +773,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@for_each_kernel def reduction_arg_to_subst_rule( kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 59428cde258436c3e30f4f82b23d9c6b423605b8..124568f4512340a812d6fd366318cceb0fea2591 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -31,6 +31,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -348,6 +349,8 @@ class DifferentiationContext: arg.dtype, shape=shape, dim_tags=dim_tags, + is_input=arg.is_input, + is_output=arg.is_output )) elif var_name in self.kernel.temporary_variables: @@ -377,6 +380,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. 
""" + assert isinstance(kernel, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 7a4f372f5d30a32638343d3a929447c4edd93c06..6e28d9e7b969372a714af78a3b772f0052347e39 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -27,6 +27,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.translation_unit import TranslationUnit +from loopy.kernel.function_interface import CallableKernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -126,9 +130,6 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion def _fuse_two_kernels(kernela, kernelb): - from loopy.kernel import KernelState - if kernela.state != KernelState.INITIAL or kernelb.state != KernelState.INITIAL: - raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains @@ -252,9 +253,6 @@ def _fuse_two_kernels(kernela, kernelb): "substitution", kernela.substitutions, kernelb.substitutions), - function_manglers=_ordered_merge_lists( - kernela.function_manglers, - kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( kernela.symbol_manglers, kernelb.symbol_manglers), @@ -327,6 +325,47 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + # FIXME: This should take in inputs as (prog1, knlname1) and (prog2, + # knlname2). if prog1 == prog2 then the callable names belong to the same + # namespace, otherwise the kernel names should be uniquified. + # We should also somehow be able to know that callables like "sin"/"cos" + # belong to the global namespace and need not be uniquified. 
+ + if all(isinstance(kernel, TranslationUnit) for kernel in kernels): + # {{{ sanity checks + + for knl in kernels: + nkernels = len([i for i, clbl in knl.callables_table.items() + if isinstance(clbl, CallableKernel)]) + if nkernels != 1: + raise NotImplementedError("Translation unit with more than one" + " callable kernel not allowed for now.") + + # }}} + + # {{{ "merge" the callable namespace + + from loopy.transform.callable import rename_callable + loop_kernels_to_be_fused = [] + new_callables = {} + + for t_unit in kernels: + for name in set(t_unit.callables_table) & set(new_callables): + t_unit = rename_callable(t_unit, name) + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + loop_kernels_to_be_fused.append(clbl.subkernel) + else: + new_callables[name] = clbl + + # }}} + + kernels = loop_kernels_to_be_fused[:] + else: + assert all(isinstance(knl, LoopKernel) for knl in kernels) + new_callables = {} + kernels = list(kernels) if data_flow is None: @@ -405,6 +444,11 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # }}} - return result + new_callables[result.name] = CallableKernel(result) + + return TranslationUnit(callables_table=new_callables, + target=result.target, + entrypoints=frozenset([result.name])) + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index d3a0ed41ed0342fa281eb3bb62a971344cda1fc7..8cb649b91ad1cf05d427309d76e8d511fed93df3 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -29,6 +29,11 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.translation_unit import (TranslationUnit, + for_each_kernel) +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel + __doc__ = """ .. 
currentmodule:: loopy @@ -74,6 +79,7 @@ __doc__ = """ # {{{ set loop priority +@for_each_kernel def set_loop_priority(kernel, loop_priority): from warnings import warn warn("set_loop_priority is deprecated. Use prioritize_loops instead. " @@ -88,6 +94,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@for_each_kernel def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -102,6 +109,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -330,6 +339,7 @@ def _split_iname_backend(kernel, iname_to_split, # {{{ split iname +@for_each_kernel def split_iname(kernel, split_iname, inner_length, *, outer_iname=None, inner_iname=None, @@ -356,6 +366,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -372,6 +384,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@for_each_kernel def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -506,6 +519,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super().map_reduction(expr, expn_state) +@for_each_kernel def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """In a sense, the inverse of :func:`split_iname`. 
Takes in inames, finds their bounds (all but the first have to be bounded), and combines @@ -606,8 +620,8 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - within_inames=subst_within_inames(insn.within_inames)) - for insn in kernel.instructions] + within_inames=subst_within_inames(insn.within_inames)) if + within(kernel, insn) else insn for insn in kernel.instructions] kernel = (kernel .copy( @@ -632,7 +646,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) - return kernel + return remove_unused_inames(kernel, inames) # }}} @@ -662,7 +676,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@for_each_kernel +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given @@ -832,6 +848,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) +@for_each_kernel def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ @@ -1022,6 +1039,13 @@ def get_iname_duplication_options(kernel, use_boostable_into=None): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if isinstance(kernel, TranslationUnit): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] + + assert isinstance(kernel, LoopKernel) + if use_boostable_into: raise LoopyError("'use_boostable_into=True' is no longer supported.") @@ -1069,6 +1093,10 @@ def has_schedulable_iname_nesting(kernel): :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
""" + if isinstance(kernel, TranslationUnit): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] return not bool(next(get_iname_duplication_options(kernel), False)) # }}} @@ -1076,6 +1104,7 @@ def has_schedulable_iname_nesting(kernel): # {{{ rename_inames +@for_each_kernel def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1176,7 +1205,8 @@ def get_used_inames(kernel): for insn in exp_kernel.instructions: used_inames.update( insn.within_inames - | insn.reduction_inames()) + | insn.reduction_inames() + | insn.sub_array_ref_inames()) return used_inames @@ -1325,6 +1355,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@for_each_kernel def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1344,6 +1375,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@for_each_kernel def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1367,6 +1399,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@for_each_kernel def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. 
@@ -1698,6 +1731,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@for_each_kernel def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1744,6 +1778,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@for_each_kernel def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the @@ -1782,6 +1817,7 @@ def add_inames_to_insn(kernel, inames, insn_match): # }}} +@for_each_kernel def add_inames_for_unused_hw_axes(kernel, within=None): """ Returns a kernel with inames added to each instruction diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 045850651f94ebed65afc24b0008a712b047dd20..287321e3e57dc2cb80c2b8442c5ad2699acd6853 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -21,15 +21,39 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.translation_unit import (TranslationUnit, + for_each_kernel) # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + if isinstance(program, LoopKernel): + return find_instructions_in_single_kernel(program, insn_match) + + assert isinstance(program, TranslationUnit) + insns = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + 
else: + raise NotImplementedError("Unknown callable type %s." % ( + type(in_knl_callable))) + + return insns + # }}} @@ -54,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@for_each_kernel def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -71,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@for_each_kernel def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -88,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -119,6 +146,7 @@ def add_dependency(kernel, insn_match, depends_on): # {{{ remove_instructions +@for_each_kernel def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. @@ -209,6 +237,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@for_each_kernel def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -231,6 +260,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@for_each_kernel def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -263,18 +293,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. 
""" + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -327,6 +360,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@for_each_kernel def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 0000000000000000000000000000000000000000..9335bb0bb49ac17190460efb2fa127bef8ebd8f2 --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,340 @@ +__copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.translation_unit import TranslationUnit +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: pack_and_unpack_args_for_call +""" + + +def pack_and_unpack_args_for_call_for_single_kernel(kernel, + callables_table, call_name, args_to_pack=None, + args_to_unpack=None): + """ + Returns a a copy of *kernel* with instructions appended to copy the + arguments in *args* to match the alignment expected by the *call_name* in + the kernel. The arguments are copied back to *args* with the appropriate + data layout. + + :arg call_name: An instance of :class:`str` denoting the function call in + the *kernel*. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` which + must be packed. If set *None*, it is interpreted that all the array + arguments would be packed. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` + which must be unpacked. If set *None*, it is interpreted that + all the array arguments should be unpacked. + """ + assert isinstance(kernel, LoopKernel) + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + old_insn_to_new_insns = {} + + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + # pack and unpack call only be done for CallInstructions. 
+ continue + if insn.expression.function.name not in callables_table: + continue + + in_knl_callable = callables_table[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args_to_pack is None: + args_to_pack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + if args_to_unpack is None: + args_to_unpack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + + # {{{ sanity checks for args + + assert isinstance(args_to_pack, list) + assert isinstance(args_to_unpack, list) + + for arg in args_to_pack: + found_sub_array_ref = False + + for par in parameters + insn.assignees: + # checking that the given args is a sub array ref + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." % (arg, call_name, insn.id)) + for arg in args_to_unpack: + if arg not in args_to_pack: + raise LoopyError("Argument %s should be packed in order to be " + "unpacked." 
% arg) + + # }}} + + packing_insns = [] + unpacking_insns = [] + + # {{{ handling ilp tags + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = {iname for iname in insn.within_inames + if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) + for tag in kernel.iname_to_tags.get(iname, []))} + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + # }}} + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + # dict to store the new assignees and parameters, the mapping pattern + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype + id_to_parameters = tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} + + for arg_id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in + args_to_pack): + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} + + # Updating the domains corresponding to the new inames. 
+ for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=arg_in_caller.dtype, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) + + # {{{ getting the lhs for packing and rhs for unpacking + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) + 
+ # }}} + + packing_insns.append(Assignment( + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack"), + depends_on_is_final=True + )) + + if p.subscript.aggregate.name in args_to_unpack: + unpacking_insns.append(Assignment( + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), + depends_on_is_final=True + )) + + # {{{ creating the sweep inames for the new sub array refs + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[arg_id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[arg_id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[arg_id] = p + + if packing_insns: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_call_insn = insn.with_transformed_expressions(subst_mapper) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in + enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) + new_call_insn = new_call_insn.copy( + depends_on=new_call_insn.depends_on | { + 
pack.id for pack in packing_insns}, + within_inames=new_call_insn.within_inames - ilp_inames | ( + new_ilp_inames), + expression=new_call_insn.expression.function(*new_params), + assignees=new_assignees) + old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] + + unpacking_insns) + + if old_insn_to_new_insns: + new_instructions = [] + for insn in kernel.instructions: + if insn.id in old_insn_to_new_insns: + # Replacing the current instruction with the group of + # instructions including the packing and unpacking instructions + new_instructions.extend(old_insn_to_new_insns[insn.id]) + else: + # for the instructions that depend on the call instruction that + # are to be packed and unpacked, we need to add the complete + # instruction block as a dependency for them. + new_depends_on = insn.depends_on + if insn.depends_on & set(old_insn_to_new_insns): + # need to add the unpack instructions on dependencies. + for old_insn_id in insn.depends_on & set(old_insn_to_new_insns): + new_depends_on |= frozenset(i.id for i + in old_insn_to_new_insns[old_insn_id]) + new_instructions.append(insn.copy(depends_on=new_depends_on)) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + instructions=new_instructions, + temporary_variables=new_tmps + ) + + return kernel + + +def pack_and_unpack_args_for_call(program, *args, **kwargs): + assert isinstance(program, TranslationUnit) + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + +# vim: foldmethod=marker diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 390fafb2b71b705acaa990e45d2d4d4b9fc59cbe..44b2bbf33a40939f78d3bd6b1bb3ed5b63a463a2 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,6 +24,12 @@ THE SOFTWARE. from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.translation_unit import (for_each_kernel, + TranslationUnit) +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.diagnostic import LoopyError + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -40,7 +46,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@for_each_kernel +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -242,7 +250,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, return kernel -split_arg_axis = MovedFunctionDeprecationWrapper(split_array_dim) +split_arg_axis = (MovedFunctionDeprecationWrapper(split_array_dim)) # }}} @@ -366,7 +374,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@for_each_kernel +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. 
@@ -384,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): ``loopy.split_array_dim`` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -399,6 +410,15 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): # {{{ find_padding_multiple def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): + if isinstance(kernel, TranslationUnit): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return find_padding_multiple(kernel[kernel_names[0]], variable, axis, + align_bytes, allowed_waste) + assert isinstance(kernel, LoopKernel) + arg = kernel.arg_dict[variable] if arg.dim_tags is None: @@ -436,6 +456,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@for_each_kernel def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 60fef9e9852fcff2e6a3a9929d45bc59508fbcb7..4916dd4e711b385cddaf5511591bde484f0e57c5 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.translation_unit import for_each_kernel +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -37,6 +40,7 @@ __doc__ = """ # {{{ assume +@for_each_kernel def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. 
@@ -64,18 +68,8 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def fix_parameters(kernel, within=None, **value_dict): - """Fix the values of the arguments to specific constants. - - *value_dict* consists of *name*/*value* pairs, where *name* will be fixed - to be *value*. *name* may refer to :ref:`domain-parameters` or - :ref:`arguments`. - """ - - if not value_dict: - return kernel - - def process_set_one_param(s, name, value): +def _fix_parameter(kernel, name, value, within=None): + def process_set(s): var_dict = s.get_var_dict() try: @@ -95,15 +89,10 @@ def fix_parameters(kernel, within=None, **value_dict): return s - def process_set(s): - for name, value in value_dict.items(): - s = process_set_one_param(s, name, value) - return s - new_domains = [process_set(dom) for dom in kernel.domains] from pymbolic.mapper.substitutor import make_subst_func - subst_func = make_subst_func(value_dict) + subst_func = make_subst_func({name: value}) from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper subst_map = SubstitutionMapper(subst_func) @@ -115,7 +104,7 @@ def fix_parameters(kernel, within=None, **value_dict): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name in value_dict.keys(): + if arg.name == name: # remove from argument list continue @@ -146,6 +135,23 @@ def fix_parameters(kernel, within=None, **value_dict): )) +@for_each_kernel +def fix_parameters(kernel, **value_dict): + """Fix the values of the arguments to specific constants. + + *value_dict* consists of *name*/*value* pairs, where *name* will be fixed + to be *value*. *name* may refer to :ref:`domain-parameters` or + :ref:`arguments`. 
+ """ + assert isinstance(kernel, LoopKernel) + + within = value_dict.pop("within", None) + + for name, value in value_dict.items(): + kernel = _fix_parameter(kernel, name, value, within) + + return kernel + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index cefed807d73bd0a9064c170190a3ba19b2d5abf6..9ba572efe03296e161cad82433d125feeae1358a 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -27,6 +27,8 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func +from loopy.translation_unit import TranslationUnit +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np from pymbolic import var @@ -255,9 +257,9 @@ class _not_provided: # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, callables_table, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. @@ -352,6 +354,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
""" + if isinstance(kernel, TranslationUnit): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(precompute(kernel[kernel_names[0]], + subst_use, sweep_inames, within, storage_axes, temporary_name, + precompute_inames, precompute_outer_inames, storage_axis_to_tag, + default_tag, dtype, fetch_bounding_box, temporary_address_space, + compute_insn_id, kernel.callables_table, **kwargs)) # {{{ unify temporary_address_space / temporary_scope @@ -1030,15 +1044,34 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, TranslationUnit) + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = precompute_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0a9cfb7bce21a64cc2858e4f3b9472e2992984b8..7c7f00932e59cd8893737eaa653768c34aa5067b 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -61,7 +61,7 @@ class LivenessAnalysis: def __init__(self, kernel): self.kernel = kernel - self.schedule = 
self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -232,8 +232,9 @@ class TemporarySaver: def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, callables_table): self.kernel = kernel + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -436,7 +437,8 @@ class TemporarySaver: return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -623,7 +625,7 @@ class TemporarySaver: kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -717,7 +719,7 @@ class TemporarySaver: # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(kernel): +def save_and_reload_temporaries(program, entrypoint=None): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
@@ -740,13 +742,28 @@ def save_and_reload_temporaries(kernel): :returns: The resulting kernel """ - liveness = LivenessAnalysis(kernel) - saver = TemporarySaver(kernel) + if entrypoint is None: + if len(program.entrypoints) != 1: + raise LoopyError("Missing argument 'entrypoint'.") + entrypoint = list(program.entrypoints)[0] + + knl = program[entrypoint] + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_linearized_kernel + knl = get_one_linearized_kernel(program[entrypoint], + program.callables_table) + + assert knl.schedule is not None + + liveness = LivenessAnalysis(knl) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(kernel.schedule): + for sched_idx, sched_item in enumerate(knl.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -757,8 +774,9 @@ def save_and_reload_temporaries(kernel): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, + subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {} at entry of {}" @@ -766,20 +784,20 @@ def save_and_reload_temporaries(kernel): saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(kernel.schedule) - 1: + if sched_idx == len(knl.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & 
interesting_temporaries: logger.info("saving {} before return of {}" .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index a9e153db845c57f6a6819456a185a4baf4e56a01..331c8ff0c9efb21586aa7379b1d1b3a2c1d09f30 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,6 +28,9 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var +from loopy.translation_unit import (for_each_kernel, + TranslationUnit) +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -51,6 +54,16 @@ def extract_subst(kernel, subst_name, template, parameters=()): unifications. """ + if isinstance(kernel, TranslationUnit): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(extract_subst(kernel[kernel_names[0]], + subst_name, template, parameters)) + if isinstance(template, str): from pymbolic import parse template = parse(template) @@ -190,6 +203,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): instructions=new_insns, substitutions=new_substs) + # }}} @@ -277,6 +291,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@for_each_kernel @remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): @@ -460,6 +475,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@for_each_kernel def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions @@ -468,6 +484,7 @@ def expand_subst(kernel, 
within=None): :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + if not kernel.substitutions: return kernel @@ -500,8 +517,17 @@ def find_rules_matching(kernel, pattern): return [r for r in kernel.substitutions if pattern.match(r)] -def find_one_rule_matching(kernel, pattern): - rules = find_rules_matching(kernel, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." % ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py new file mode 100644 index 0000000000000000000000000000000000000000..83ceeef68cfd8f0f6b199b872b567bfca0f1aee9 --- /dev/null +++ b/loopy/translation_unit.py @@ -0,0 +1,779 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import collections + +from pytools import ImmutableRecord +from pymbolic.primitives import Variable +from functools import wraps + +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + SubstitutionRuleMappingContext) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction + +from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash +from pymbolic.primitives import Call +from pyrsistent import pmap, PMap + +__doc__ = """ +.. currentmodule:: loopy.translation_unit + +.. autoclass:: CallablesInferenceContext + +.. autofunction:: make_program + +.. autofunction:: for_each_kernel + +""" + + +# {{{ CallableResolver + +def _is_a_reduction_op(expr): + if isinstance(expr, ResolvedFunction): + return _is_a_reduction_op(expr.function) + + return isinstance(expr, ReductionOpFunction) + + +class CallableResolver(RuleAwareIdentityMapper): + """ + Resolves callables in expressions and records the names of the calls + resolved. + + .. attribute:: known_callables + + An instance of :class:`frozenset` of the call names to be resolved. + + .. attribute:: rule_mapping_context + + An instance of :class:`loopy.symbolic.RuleMappingContext`. + + .. attribute:: calls_resolved + + A :class:`set` of calls that were resolved. Updated during an + expression traversal. 
+ """ + def __init__(self, rule_mapping_context, known_callables): + assert isinstance(known_callables, frozenset) + + super().__init__(rule_mapping_context) + + self.known_callables = known_callables + + # a record of the call names that were resolved + self.calls_resolved = set() + + def map_call(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + + if not _is_a_reduction_op(expr.function): + name, tag = parse_tagged_name(expr.function) + else: + if isinstance(expr.function, ResolvedFunction): + name = expr.function.function + else: + name = expr.function + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + + # record that we resolved a call + self.calls_resolved.add(name) + + function = expr.function + + if not isinstance(expr.function, ResolvedFunction): + function = ResolvedFunction(expr.function) + + return Call(function, params) + + return super().map_call(expr, expn_state) + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + +# }}} + + +# {{{ translation unit + +class TranslationUnit(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + An instance of :class:`TranslationUnit` is the object that gets lowered + for a :class:`loopy.target.TargetBase`. + + + .. attribute:: entrypoints + + A :class:`frozenset` of the names of the kernels which + could be called from the host. + + .. attribute:: default_entrypoint + + The :class:`~loopy.LoopKernel` representing the main entrypoint + of the program, if defined. Currently, this attribute may only be + accessed if there is exactly one entrypoint in the translation unit. + + .. attribute:: callables_table + + An instance of :class:`pyrsistent.PMap` mapping the function + identifiers in a kernel to their associated instances of + :class:`~loopy.kernel.function_interface.InKernelCallable`. + + .. 
attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + .. attribute:: func_id_to_in_knl_callables_mappers + + A :class:`frozenset` of functions of the signature ``(target: + TargetBase, function_indentifier: str)`` that returns an instance + of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. automethod:: __call__ + .. automethod:: copy + .. automethod:: __getitem__ + .. automethod:: with_kernel + + .. note:: + + - To create an instance of :class:`loopy.TranslationUnit`, it is + recommended to go through :func:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable, any modifications should be done through + :meth:`~TranslationUnit.copy`. + + """ + def __init__(self, + entrypoints=frozenset(), + callables_table=pmap(), + target=None, + func_id_to_in_knl_callable_mappers=[]): + + # {{{ sanity checks + + assert isinstance(callables_table, collections.abc.Mapping) + assert isinstance(entrypoints, frozenset) + + if not isinstance(callables_table, PMap): + callables_table = pmap(callables_table) + + # }}} + + super().__init__( + entrypoints=entrypoints, + callables_table=pmap(callables_table), + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "entrypoints", + "callables_table", + "target",) + + update_persistent_hash = update_persistent_hash + + def copy(self, **kwargs): + target = kwargs.pop("target", None) + program = super().copy(**kwargs) + if target: + from loopy.kernel import KernelState + if max(callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)) > ( + KernelState.INITIAL): + if not isinstance(target, type(self.target)): + raise LoopyError("One of the kernels in the program has been " + "preprocessed, cannot modify target now.") + + new_callables = {} + for func_id, clbl in 
program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + knl = knl.copy(target=target) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + new_callables[func_id] = clbl + + program = super().copy( + callables_table=new_callables, target=target) + + return program + + def with_entrypoints(self, entrypoints): + """ + :param entrypoints: Either a comma-separated :class:`str` or + :class:`frozenset`. + """ + if isinstance(entrypoints, str): + entrypoints = frozenset([e.strip() for e in + entrypoints.split(",")]) + + assert isinstance(entrypoints, frozenset) + + return self.copy(entrypoints=entrypoints) + + @property + def state(self): + """ Returns an instance of :class:`loopy.kernel.KernelState`. """ + from loopy.kernel import KernelState + return min((callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)), + default=KernelState.INITIAL) + + def with_kernel(self, kernel): + """ + If *self* contains a callable kernel with *kernel*'s name, replaces its + subkernel and returns a copy of *self*. Else records a new callable + kernel with *kernel* as its subkernel. + + :arg kernel: An instance of :class:`loopy.LoopKernel`. + :returns: Copy of *self* with updated callable kernels. 
+ """ + if kernel.name in self.callables_table: + # update the callable kernel + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) + new_callables = self.callables_table.remove(kernel.name).set( + kernel.name, new_in_knl_callable) + return self.copy(callables_table=new_callables) + else: + # add a new callable kernel + clbl = CallableKernel(kernel) + new_callables = self.callables_table.set(kernel.name, clbl) + return self.copy(callables_table=new_callables) + + def __getitem__(self, name): + """ + For the callable named *name*, return a :class:`loopy.LoopKernel` if + it's a :class:`~loopy.kernel.function_interface.CallableKernel` + otherwise return the callable itself. + """ + result = self.callables_table[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result + + @property + def default_entrypoint(self): + if len(self.entrypoints) == 1: + entrypoint, = self.entrypoints + return self[entrypoint] + else: + raise ValueError("TranslationUnit has multiple possible entrypoints." + " The default entry point kernel is not uniquely" + " determined.") + + def __call__(self, *args, **kwargs): + """ + Builds and calls the *entrypoint* kernel, if + :attr:`TranslationUnit.target` is an executable target. + + :arg entrypoint: The name of the entrypoint callable to be called. + Defaults to :attr:`default_entrypoint`. + """ + entrypoint = kwargs.get("entrypoint", None) + if entrypoint is None: + entrypoint = self.default_entrypoint.name + + if entrypoint not in self.entrypoints: + raise LoopyError(f"'{entrypoint}' not in list of possible entrypoints " + "for the program. 
" + "Maybe you want to invoke 'with_entrypoints' before " + "calling the program?") + + kwargs["entrypoint"] = entrypoint + + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + # FIXME: do a topological sort by the call graph + + def strify_callable(clbl): + return str(clbl.subkernel) + + return "\n".join( + strify_callable(clbl) + for name, clbl in self.callables_table.items() + if isinstance(clbl, CallableKernel)) + + def __setstate__(self, state_obj): + super().__setstate__(state_obj) + + self._program_executor_cache = {} + + def __hash__(self): + from loopy.tools import LoopyKeyBuilder + from pytools.persistent_dict import new_hash + key_hash = new_hash() + self.update_persistent_hash(key_hash, LoopyKeyBuilder()) + return hash(key_hash.digest()) + + +class Program(TranslationUnit): + def __init__(self, *args, **kwargs): + from warnings import warn + warn("Program is deprecated, use TranslationUnit instead, " + "will be removed in 2022", DeprecationWarning, stacklevel=2) + super().__init__(*args, **kwargs) + +# }}} + + +# {{{ rename resolved functions + +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. 
+ """ + def __init__(self, rule_mapping_context, renaming_dict): + super().__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_function(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super().map_resolved_function( + expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renames according to *renaming_dict*. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + +# }}} + + +def get_reachable_resolved_callable_ids(callables, entrypoints): + """ + Returns a :class:`frozenset` of callables ids that are resolved and + reachable from *entrypoints*. + """ + return frozenset().union(*(callables[e].get_called_callables(callables) + for e in entrypoints)) + + +# {{{ CallablesInferenceContext + +def get_all_subst_names(callables): + """ + Returns a :class:`set` of all substitution rule names in the callable + kernels of *callables*. + + :arg callables: A mapping from function identifiers to + :class:`~loopy.kernel.function_interface.InKernelCallable`. 
+ """ + return set().union(*(set(clbl.subkernel.substitutions.keys()) + for clbl in callables.values() + if isinstance(clbl, CallableKernel))) + + +def make_callable_name_generator(callables): + from pytools import UniqueNameGenerator + all_substs = get_all_subst_names(callables) + return UniqueNameGenerator(set(callables.keys()) | all_substs) + + +def make_clbl_inf_ctx(callables, entrypoints): + name_gen = make_callable_name_generator(callables) + return CallablesInferenceContext(callables, name_gen) + + +class CallablesInferenceContext(ImmutableRecord): + """ + Helper class for housekeeping a :attr:`loopy.TranslationUnit.callables_table` + while traversing through callables of :class:`loopy.TranslationUnit`. + + .. attribute:: callables + + A mapping from the callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: renames + + A mapping from old function identifiers to a :class:`frozenset` of new + function identifiers. + + .. attribute:: new_entrypoints + + A :class:`frozenset` of renamed entrypoint names. + + .. automethod:: with_callable + + .. automethod:: finish_program + + .. automethod:: __getitem__ + """ + def __init__(self, callables, + clbl_name_gen, + renames=collections.defaultdict(frozenset), + new_entrypoints=frozenset()): + assert isinstance(callables, collections.abc.Mapping) + + super().__init__(callables=dict(callables), + clbl_name_gen=clbl_name_gen, + renames=renames, + new_entrypoints=new_entrypoints) + + def with_callable(self, old_function_id, new_clbl, + is_entrypoint=False): + """ + Updates the callable referred by *function_id*'s in *self*'s namespace + to *new_clbl*. + + :arg old_function_id: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + + :arg new_clbl: An instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. 
+ + :returns: ``(new_self, new_function_id)`` is a copy of *self* with + *new_clbl* in its namespace. *new_clbl* would be referred by + *new_function_id* in *new_self*'s namespace. + """ + + assert isinstance(old_function_id, (str, Variable, ReductionOpFunction)) + + if isinstance(old_function_id, Variable): + old_function_id = old_function_id.name + + renames = self.renames.copy() + + # if the callable already exists => return the function + # identifier corresponding to that callable. + for func_id, clbl in self.callables.items(): + if clbl == new_clbl: + renames[old_function_id] |= frozenset([func_id]) + if isinstance(func_id, str): + new_entrypoints = self.new_entrypoints + if is_entrypoint: + new_entrypoints |= frozenset([func_id]) + return (self.copy(renames=renames, + new_entrypoints=new_entrypoints), + Variable(func_id),) + else: + assert not is_entrypoint + assert isinstance(func_id, ReductionOpFunction) + return (self.copy(renames=renames), + func_id) + + # {{{ handle ReductionOpFunction + + if isinstance(old_function_id, ReductionOpFunction): + # FIXME: Check if we have 2 ArgMax functions + # with different types in the same kernel the generated code + # does not mess up the types. 
+ assert not is_entrypoint + unique_function_id = old_function_id.copy() + updated_callables = self.callables.copy() + updated_callables[unique_function_id] = new_clbl + renames[old_function_id] |= frozenset([unique_function_id]) + + return (self.copy(callables=updated_callables, + renames=renames), + unique_function_id) + + # }}} + + # must allocate a new clbl in the namespace => find a unique id for it + unique_function_id = self.clbl_name_gen(old_function_id) + + updated_callables = self.callables.copy() + updated_callables[unique_function_id] = new_clbl + renames[old_function_id] |= frozenset([unique_function_id]) + + new_entrypoints = self.new_entrypoints + if is_entrypoint: + new_entrypoints |= frozenset([unique_function_id]) + + return (self.copy(renames=renames, + callables=updated_callables, + new_entrypoints=new_entrypoints), + Variable(unique_function_id)) + + def finish_program(self, program): + """ + Returns a copy of *program* with rollback renaming of the callables + done whenever possible. + + For example: If all the ``sin`` function ids diverged as + ``sin_0``, ``sin_1``, then all the renaming is done such that one of + the flavors of the callable is renamed back to ``sin``. + """ + # FIXME: Generalize this if an inference happens over a proper subgraph + # of the callgraph (the following assert should be removed) + assert len(self.new_entrypoints) == len(program.entrypoints) + + # {{{ get all the callables reachable from the new entrypoints. 
+ + # get the names of all callables reachable from the new entrypoints + new_callable_ids = get_reachable_resolved_callable_ids(self.callables, + self.new_entrypoints) + + # get the history of function ids from the performed renames: + history = {} + for old_func_id, new_func_ids in self.renames.items(): + for new_func_id in new_func_ids: + if new_func_id in (new_callable_ids | self.new_entrypoints): + history[new_func_id] = old_func_id + + # }}} + + # AIM: Preserve the entrypoints of *program* + + # If there are any callees having old entrypoint names => mark them for + # renaming + callees_with_old_entrypoint_names = ((program.entrypoints & new_callable_ids) + - self.new_entrypoints) + + todo_renames = {} + new_callables = dict(program.callables_table) + + for c in callees_with_old_entrypoint_names: + todo_renames[c] = self.clbl_name_gen(c) + + for e in self.new_entrypoints: + # note renames to "rollback" the renaming of entrypoints + todo_renames[e] = history[e] + assert todo_renames[e] in program.entrypoints + + # try to rollback the names as much as possible + for new_id in new_callable_ids: + old_func_id = history[new_id] + if (isinstance(old_func_id, str) + and old_func_id not in set(todo_renames.values())): + todo_renames[new_id] = old_func_id + + # {{{ perform the renames form todo_renames + + for func_id in (new_callable_ids | self.new_entrypoints): + clbl = self.callables[func_id] + if func_id in todo_renames: + assert history[func_id] == todo_renames[func_id] + func_id = todo_renames[func_id] + if isinstance(clbl, CallableKernel): + subknl = clbl.subkernel.copy(name=func_id) + subknl = rename_resolved_functions_in_a_single_kernel(subknl, + todo_renames) + + clbl = clbl.copy(subkernel=subknl) + + new_callables[func_id] = clbl + + # }}} + + return program.copy(callables_table=new_callables) + + def __getitem__(self, name): + result = self.callables[name] + return result + +# }}} + + +# {{{ helper functions + +def make_program(kernel): + """ + Returns an 
instance of :class:`loopy.TranslationUnit` with *kernel* as the only + callable kernel. + """ + + program = TranslationUnit( + callables_table={ + kernel.name: CallableKernel(kernel)}, + target=kernel.target) + + return program + + +def for_each_kernel(transform): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs) -> LoopKernel``. Returns a function that would + apply *transform* to all callable kernels in a :class:`loopy.TranslationUnit`. + """ + def _collective_transform(*args, **kwargs): + if "translation_unit" in kwargs: + t_unit_or_kernel = kwargs.pop("translation_unit") + elif "kernel" in kwargs: + t_unit_or_kernel = kwargs.pop("kernel") + else: + t_unit_or_kernel = args[0] + args = args[1:] + + if isinstance(t_unit_or_kernel, TranslationUnit): + t_unit = t_unit_or_kernel + new_callables = {} + for func_id, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + new_subkernel = transform(clbl.subkernel, *args, **kwargs) + clbl = clbl.copy(subkernel=new_subkernel) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + new_callables[func_id] = clbl + + return t_unit.copy(callables_table=new_callables) + else: + assert isinstance(t_unit_or_kernel, LoopKernel) + kernel = t_unit_or_kernel + return transform(kernel, *args, **kwargs) + + return wraps(transform)(_collective_transform) + + +def add_callable_to_table(callables_table, clbl_id, clbl): + """ + Returns a tuple ``new_clbl_id, new_callables_table`` where + *new_callables_table* is a copy of *callables_table* with *clbl* in its + namespace. *clbl* is referred to in *new_callables_table*'s namespace by + *new_clbl_id*. + + :arg clbl_id: An instance of :class:`str` or + :class:`~loopy.library.reduction.ReductionOpFunction` based on which + the unique identifier, *new_clbl_id* , is to be chosen. 
+ """ + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(clbl, InKernelCallable) + + for i, c in callables_table.items(): + if c == clbl: + return i, callables_table + + if isinstance(clbl_id, ReductionOpFunction): + new_clbl_id = clbl_id + else: + assert isinstance(clbl_id, str) + ung = make_callable_name_generator(callables_table) + new_clbl_id = ung(clbl_id) + + new_callables_table = callables_table.copy() + new_callables_table[new_clbl_id] = clbl.with_name(new_clbl_id) + + return new_clbl_id, new_callables_table + +# }}} + + +# {{{ resolve_callables + +def resolve_callables(program): + """ + Returns a :class:`TranslationUnit` with known :class:`pymbolic.primitives.Call` + expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. + """ + from loopy.library.function import get_loopy_callables + from loopy.check import validate_kernel_call_sites + from loopy.kernel import KernelState + + if program.state >= KernelState.CALLS_RESOLVED: + # program's callables have been resolved + return program + + # get registered callables + known_callables = dict(program.callables_table) + # get target specific callables + known_callables.update(program.target.get_device_ast_builder().known_callables) + # get loopy specific callables + known_callables.update(get_loopy_callables()) + + callables_table = {} + + # callables: name of the calls seen in the program + callables = {name for name, clbl in program.callables_table.items() + if isinstance(clbl, CallableKernel)} + + while callables: + clbl_name = callables.pop() + clbl = known_callables[clbl_name] + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + clbl_resolver = CallableResolver(rule_mapping_context, + frozenset(known_callables)) + knl = rule_mapping_context.finish_kernel(clbl_resolver.map_kernel(knl)) + knl = knl.copy(state=KernelState.CALLS_RESOLVED) + + 
# add the updated callable kernel to the table + callables_table[clbl_name] = clbl.copy(subkernel=knl) + + # note the resolved callable for traversal + callables.update(clbl_resolver.calls_resolved - set(callables_table)) + elif isinstance(clbl, ScalarCallable): + # nothing to resolve within a scalar callable + callables_table[clbl_name] = clbl + else: + raise NotImplementedError(f"{type(clbl)}") + + program = program.copy(callables_table=callables_table) + + validate_kernel_call_sites(program) + + return program + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 787966efc7fd00ad282e60990846ce07004e7906..dd9135483eb7d8677d0a6dfb991359b0d0ced944 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -29,6 +29,14 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext, SubArrayRef) +from pymbolic.primitives import Variable, Subscript, Lookup +from loopy.translation_unit import CallablesInferenceContext, make_clbl_inf_ctx import logging logger = logging.getLogger(__name__) @@ -40,10 +48,141 @@ def _debug(kernel, s, *args): logger.debug(f"{kernel.name}: {logstr}") +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super().__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super().map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. 
+ + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# 
}}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, clbl_inf_ctx, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -52,10 +191,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(clbl_inf_ctx, CallablesInferenceContext) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.clbl_inf_ctx = clbl_inf_ctx + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -88,13 +230,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). - def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, clbl_inf_ctx=None): + if clbl_inf_ctx is None: + clbl_inf_ctx = self.clbl_inf_ctx + return type(self)(self.kernel, clbl_inf_ctx, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.clbl_inf_ctx, new_ass) @staticmethod def combine(dtype_sets): @@ -250,14 +395,17 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): + from pymbolic.primitives import Variable identifier = expr.function - if isinstance(identifier, Variable): - identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] + if not isinstance(identifier, ResolvedFunction): + # function not resolved => exit + return [] + + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name def 
none_if_empty(d): if d: @@ -266,25 +414,44 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) + for (i, par) in enumerate(expr.parameters)} + + # specializing the known function wrt type + in_knl_callable = self.clbl_inf_ctx[expr.function.name] + + in_knl_callable, self.clbl_inf_ctx = (in_knl_callable + .with_types(arg_id_to_dtype, + self.clbl_inf_ctx)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) + + # storing the type specialized function so that it can be used for + # later use + self.clbl_inf_ctx, new_function_id = ( + self.clbl_inf_ctx.with_callable( + expr.function.function, + in_knl_callable)) + + self.old_calls_to_new_calls[expr] = new_function_id + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: return [] - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: - if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] - return [mangle_result.result_dtypes[0]] + return [] - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -352,11 +519,20 @@ class 
TypeInferenceMapper(CombineMapper): def map_comparison(self, expr): # "bool" is unusable because OpenCL's bool has indeterminate memory # format. + self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + def map_logical_and(self, expr): + for child in expr.children: + self.rec(child) + + return [NumpyType(np.dtype(np.int32))] + + map_logical_or = map_logical_and def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] @@ -393,20 +569,116 @@ class TypeInferenceMapper(CombineMapper): rec_results = self.rec(expr.expr) if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) + return [expr.operation.result_dtypes(*rec_result) for rec_result in rec_results] else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + return [expr.operation.result_dtypes(rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.subscript) + + map_fortran_division = map_quotient + +# }}} + + +# {{{ TypeReader + +class TypeReader(TypeInferenceMapper): + def __init__(self, kernel, callables, new_assignments={}): + self.kernel = kernel + self.callables = callables + self.new_assignments = new_assignments + + # {{{ disabled interface + + def copy(self, *args, **kwargs): + raise ValueError("Not allowed in TypeReader") + + # }}} + + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, self.callables, new_ass) + + def map_call(self, expr, return_tuple=False): + identifier = expr.function + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name + + # specializing the known function 
wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.callables[expr.function.name] + + arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in arg_id_to_dtype and arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(arg_id_to_dtype)] + else: + return [arg_id_to_dtype[-1]] + + return [] + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto + result = [obj.dtype] + if result[0] is None: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(expr.name))) + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.clbl_inf_ctx) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +723,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): 
dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.clbl_inf_ctx) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.clbl_inf_ctx) # }}} @@ -482,7 +758,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +820,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, clbl_inf_ctx, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +830,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -575,10 +854,15 @@ def infer_unknown_types(kernel, expect_completion=False): item = item_lookup[name] debug("inferring type for %s %s", type(item).__name__, item.name) - - result, symbols_with_unavailable_types = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) + try: + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, clbl_inf_ctx) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + except DependencyTypeInferenceFailure: + result = tuple() + type_inf_mapper = type_inf_mapper.copy( + clbl_inf_ctx=clbl_inf_ctx) failed = not result if not failed: @@ -597,6 
+881,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -609,14 +894,10 @@ def infer_unknown_types(kernel, expect_completion=False): " (need type of '%s'--check for missing arguments)" % ", ".join(symbols_with_unavailable_types)) - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break + debug("could not determine type of '%s'%s" + % (item.name, advice)) + # We're done here + break # remember that this item failed failed_names.add(item.name) @@ -624,7 +905,6 @@ def infer_unknown_types(kernel, expect_completion=False): if set(queue) == failed_names: # We did what we could... print(queue, failed_names, item.name) - assert not expect_completion break # can't infer type yet, put back into queue @@ -635,23 +915,138 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if 
assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + if _instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, + return_tuple=len(insn.assignees) != 1, + return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + clbl_inf_ctx = type_inf_mapper.clbl_inf_ctx + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + return type_specialized_kernel, clbl_inf_ctx + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel.data import auto + from loopy.translation_unit import resolve_callables + + program = resolve_callables(program) + + # {{{ early-exit criterion + + if all(clbl.is_type_specialized() + for clbl in program.callables_table.values()): + # all the callables including the kernels have inferred their types + # => no need for type inference + return program + + # }}} + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + for e 
in program.entrypoints: + logger.debug(f"Entering entrypoint: {e}") + arg_id_to_dtype = {arg.name: arg.dtype for arg in + program[e].args if arg.dtype not in (None, auto)} + new_callable, clbl_inf_ctx = program.callables_table[e].with_types( + arg_id_to_dtype, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, + is_entrypoint=True) + if expect_completion: + from loopy.types import LoopyType + new_knl = new_callable.subkernel + + args_not_inferred = {arg.name + for arg in new_knl.args + if not isinstance(arg.dtype, LoopyType)} + + tvs_not_inferred = {tv.name + for tv in new_knl.temporary_variables.values() + if not isinstance(tv.dtype, LoopyType)} + + vars_not_inferred = tvs_not_inferred | args_not_inferred + + if vars_not_inferred: + if expect_completion: + raise LoopyError("could not determine type of" + f" '{vars_not_inferred.pop()}' of kernel '{e}'.") + + return clbl_inf_ctx.finish_program(program) + # }}} # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeReader(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -676,7 +1071,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( raise LoopyError("failed to determine type of accumulator for " "reduction '%s'" % expr) - reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = expr.operation.result_dtypes(*arg_dtypes) reduction_dtypes = tuple( dt.with_target(kernel.target) if dt is not lp.auto else dt diff --git a/loopy/types.py b/loopy/types.py index 97483ec0c2c0dd313d4071fd3367fd8844004298..99530a326c3a633854d605dea42cbe0b62246cec 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -200,6 +200,45 @@ class AtomicNumpyType(NumpyType, AtomicType): # }}} +# {{{ + +class OpaqueType(LoopyType): + """An opaque data type is 
truly opaque - it has no allocations, no + temporaries of that type, etc. The only thing allowed is to be pass in + through one ValueArg and go out to another. It is introduced to accomodate + functional calls to external libraries. + """ + def __init__(self, name): + assert isinstance(name, str) + self.name = name + self.target = None + + def is_integral(self): + return False + + def is_complex(self): + return False + + def involves_complex(self): + return False + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.name) + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self.name == other.name) + + def __ne__(self, other): + return not self.__eq__(other) + +# }}} + + def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, target=None): from loopy.kernel.data import auto diff --git a/loopy/version.py b/loopy/version.py index 78eb9beb735ad870fa6ce314587e0cba6d7e1f7e..aa94283d05bc9ee46760da54862fc8ea75ade8a0 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2021, 1) +VERSION = (2021, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/setup.py b/setup.py index 89927f28c6be492d91c6679a09cb0fdfc5b51a67..0417026a36a0ac579f0c38916bf3692e504c1d78 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,7 @@ setup(name="loopy", "codepy>=2017.1", "colorama", "Mako", + "pyrsistent", ], extras_require={ diff --git a/test/library_for_test.py b/test/library_for_test.py index 2cb4067e0acd6f4a88ff166e0fd460ec925585f2..5f83a22aa027402293025877353021573648f578 100644 --- a/test/library_for_test.py +++ b/test/library_for_test.py @@ -1,23 +1,71 @@ -# This exists because function handles can't be pickled. 
+import loopy as lp +import numpy as np -def no_ret_f_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class NoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + if len(arg_id_to_dtype) != 0: + raise RuntimeError("'f' cannot take any inputs.") - if (name == "f" and len(arg_dtypes) == 0): - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="f", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") -def no_ret_f_preamble_gen(preamble_info): - yield ("10_define_f", - r""" - void f() - { - printf("Hi!\n"); - } - """) + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + yield ("10_define_f", + r""" + void f() + { + printf("Hi!\n"); + } + """) + + +class SingleArgNoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + input_dtype = arg_id_to_dtype.get(0) + if input_dtype is None: + return self, callables + + if input_dtype.numpy_dtype != np.float32: + raise RuntimeError("'f' only supports f32.") + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") + + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + + yield ("10_define_f", + r""" + void f(float x) + { + printf("Hi!\n"); + } + """) + + +def symbol_x(knl, name): + if name == "X": + from loopy.types import to_loopy_type + return to_loopy_type(np.float32), "X" + + +def 
preamble_for_x(preamble_info): + yield("preamble_ten", r"#define X 10.0") diff --git a/test/test_apps.py b/test/test_apps.py index 56f4127ac6be827afda8bd41b6e87ee6d5e774dc..6e49e73fafae569411ad68fb8fefd24b5315087f 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -217,7 +217,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -225,13 +226,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -297,7 +297,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -311,14 +312,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -661,9 +662,10 @@ def test_domain_tree_nesting(): TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), - lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)], + name="nested_domain") - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl["nested_domain"].parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index a204859fff57e4806ac9ebd8204acded021512ac..1c79241cfe4f78f574655c230fa1c393d2c4b51e 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -111,11 +111,12 @@ def test_c_target_strides_nonsquare(): lp.GlobalArg("a", np.float32, shape=sizes, order=order), "..." 
], - target=ExecutableCTarget()) + target=ExecutableCTarget(), + name="nonsquare_strides") # test with C-order knl = __get_kernel("C") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="C") @@ -125,7 +126,7 @@ def test_c_target_strides_nonsquare(): # test with F-order knl = __get_kernel("F") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="F") diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..c19c7f1d058b55dba927fed33d44e6b54320fc03 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,916 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import Log2Callable + + x = np.random.rand(10) + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_callable(prog, "log2", Log2Callable("log2")) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_function( + "{[i, j]:0<= i, j< 4}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name="linear_combo1") + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name="linear_combo2") + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] 
+ ) + + knl = lp.merge([grandchild_knl, child_knl, parent_knl]) + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo2") + knl = lp.inline_callable_kernel(knl, "linear_combo1") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<4}", + """ + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y, z", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] + ) + + knl = lp.merge([parent_knl, child_knl]) + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, name="caller") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + 
+ knl = lp.merge([caller_knl, callee_knl]) + + knl = lp.set_options(knl, "return_dict") + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, out = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out["z"].get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 2*abs(a[i]) + """, name="callee_fn1") + + callee2 = lp.make_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + b[i, j] = 3*a[i, j] + """, name="callee_fn2") + + callee3 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 5*a[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.merge([knl, callee1]) + knl = lp.merge([knl, callee2]) + knl = lp.merge([knl, callee3]) + + if inline: + knl = lp.inline_callable_kernel(knl, "callee_fn1") + knl = lp.inline_callable_kernel(knl, "callee_fn2") + knl = lp.inline_callable_kernel(knl, "callee_fn3") + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict["y1"].get() + y2 = out_dict["y2"].get() + y3 = out_dict["y3"].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + 
ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + [ + lp.GlobalArg("a"), + lp.GlobalArg("acc_i, index", is_input=False, is_output=True, + shape=lp.auto), + ...], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_diff.py b/test/test_diff.py index 8af2a2b057a52ef6e122ffa65caf85d777ccbbb1..c1bfd9093a09cd9c1f265eb5895b3c677bdb37bf 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -58,12 +58,15 @@ def test_diff(ctx_factory): """ <> a = 1/(1+sinh(x[i] + y[j])**2) z[i] = sum(j, exp(a * x[j])) - """) + """, name="diff") knl = lp.fix_parameters(knl, n=50) from loopy.transform.diff import diff_kernel - dknl, diff_map = diff_kernel(knl, "z", "x") + #FIXME Is this the correct interface. Does it make sense to take the entire + #translation unit? 
+ dknl, diff_map = diff_kernel(knl["diff"], "z", "x") + dknl = knl.with_kernel(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_domain.py b/test/test_domain.py index 6a0d9f255faefc1e1e3e8fbd8c8f745b058ff1b9..03f1bbc2f538b03af8e7beb6b69d4132c99448e9 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -56,20 +56,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -85,16 +80,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -113,16 +106,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) + 
print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -145,12 +134,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -174,14 +161,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -206,25 +192,22 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") - assert knl.parents_per_domain()[1] == 0 + assert knl["loopy_kernel"].parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = 
lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -280,17 +263,17 @@ def test_independent_multi_domain(ctx_factory): lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.GlobalArg("b", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + name="loopy_kernel") knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl["loopy_kernel"].parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -394,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j 1e-15 + assert abs_err < 1e-6 + + def test_fill(ctx_factory): fortran_src = """ subroutine fill(out, a, n) @@ -58,18 +155,18 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = [fill] + ! RESULT = fill ! 
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.all_inames() + assert "i_inner" in knl["fill"].all_inames() ctx = ctx_factory() @@ -90,7 +187,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -113,7 +210,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -137,7 +234,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -164,7 +261,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -192,15 +289,15 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) ref_knl = knl - assert "a" in knl.temporary_variables + assert "a" in knl["fill"].temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.temporary_variables + assert "a" not in knl["fill"].temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -229,7 +326,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -263,7 +360,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -297,34 +394,34 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + prog = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert 
len(prog["dgemm"].domains) == 1 - ref_knl = knl + ref_prog = prog - knl = lp.split_iname(knl, "i", 16, + prog = lp.split_iname(prog, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 8, + prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 32) - knl = lp.assume(knl, "n mod 32 = 0") - knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "ell mod 16 = 0") - - knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") - knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + prog = lp.split_iname(prog, "k", 32) + prog = lp.assume(prog, "n mod 32 = 0") + prog = lp.assume(prog, "m mod 32 = 0") + prog = lp.assume(prog, "ell mod 16 = 0") + + prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2") + prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2") + prog = lp.precompute(prog, "a_acc", "k_inner,i_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + prog = lp.precompute(prog, "b_acc", "j_inner,k_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, + prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) + lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -362,7 +459,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -406,18 +503,19 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = 
lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) - knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) - knl = lp.prioritize_loops(knl, "e,i,j,k") + knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]), + data_flow=[("result", 0, 1)]) + knl = knl.with_kernel(lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k")) - assert len(knl.temporary_variables) == 2 + assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) @@ -449,15 +547,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! t_unit = lp.parse_fortran(SOURCE) + ! fill = t_unit["fill"] + ! twice = t_unit["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! 
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -477,9 +577,9 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl["dgemm"].domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") @@ -507,6 +607,53 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) +def test_fortran_subroutines(): + fortran_src = """ + subroutine twice(n, a) + implicit none + real*8 a(n) + integer i,n + + do i=1,n + a(i) = a(i) * 2 + end do + end subroutine + + subroutine twice_cross(n, a, i) + implicit none + integer i, n + real*8 a(n,n) + + call twice(n, a(1:n, i)) + call twice(n, a(i, 1:n)) + end subroutine + """ + t_unit = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") + print(lp.generate_code_v2(t_unit).device_code()) + + +def test_domain_fusion_imperfectly_nested(): + fortran_src = """ + subroutine imperfect(n, m, a, b) + implicit none + integer i, j, n, m + real a(n), b(n,n) + + do i=1, n + a(i) = i + do j=1, m + b(i,j) = i*j + end do + end do + end subroutine + """ + + t_unit = lp.parse_fortran(fortran_src) + # If n > 0 and m == 0, a single domain would be empty, + # leading (incorrectly) to no assignments to 'a'. 
+ assert len(t_unit["imperfect"].domains) > 1 + + def test_division_in_shapes(ctx_factory): fortran_src = """ subroutine halve(m, a) @@ -520,13 +667,13 @@ def test_division_in_shapes(ctx_factory): end do end subroutine """ - knl, = lp.parse_fortran(fortran_src) - ref_knl = knl + t_unit = lp.parse_fortran(fortran_src) + ref_t_unit = t_unit - print(knl) + print(t_unit) ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(m=128)) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit, parameters=dict(m=128)) if __name__ == "__main__": diff --git a/test/test_fusion.py b/test/test_fusion.py index 8e28fb3493e9517236d386989a51f3d9dfe440ef..a811b3b3b5434f39b7f6b14f8ca3abbd82eab412 100644 --- a/test/test_fusion.py +++ b/test/test_fusion.py @@ -134,7 +134,7 @@ def test_write_block_matrix_fusion(ctx_factory): knl = lp.rename_argument(write_into_mat_prg(), "mat", f"mat_{idx}") kwargs[f"mat_{idx}"] = mat - for iname in knl.all_inames(): + for iname in knl.default_entrypoint.all_inames(): knl = lp.rename_iname(knl, iname, f"{iname}_{idx}") knl = lp.rename_argument(knl, "ndofs", f"ndofs_{idx}") diff --git a/test/test_loopy.py b/test/test_loopy.py index 1afdeb9c38c407ec5f2737dd87734df6397c1a56..28d22c39c387b35c4440198c83877a8a64bf3f72 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -92,7 +92,7 @@ def test_complicated_subst(ctx_factory): print(knl) - sr_keys = list(knl.substitutions.keys()) + sr_keys = list(knl["loopy_kernel"].substitutions.keys()) for letter, how_many in [ ("f", 1), ("g", 1), @@ -102,8 +102,10 @@ def test_complicated_subst(ctx_factory): assert substs_with_letter == how_many -def test_type_inference_no_artificial_doubles(): - knl = lp.make_kernel( +def test_type_inference_no_artificial_doubles(ctx_factory): + ctx = ctx_factory() + + prog = lp.make_kernel( "{[i]: 0<=i bb = a[i] - b[i] @@ -115,16 +117,15 @@ def test_type_inference_no_artificial_doubles(): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - 
assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code_v2(prog).device_code() + assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -136,13 +137,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog["loopy_kernel"].temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog["loopy_kernel"].temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog["loopy_kernel"].temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog["loopy_kernel"].temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -176,16 +181,12 @@ def test_simple_side_effect(ctx_factory): """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -196,17 +197,14 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, 
shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): @@ -218,17 +216,14 @@ def test_wg_too_small(ctx_factory): " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - import pytest - for gen_knl in kernel_gen: - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, gen_knl).get_code() + print(knl) + with pytest.raises(RuntimeError): + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): @@ -240,17 +235,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_bare_data_dependency(ctx_factory): @@ -280,7 +272,9 @@ def test_bare_data_dependency(ctx_factory): # {{{ test race detection -def test_ilp_write_race_detection_global(): +def test_ilp_write_race_detection_global(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j a[i] = 5+i+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl) - for k in 
lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17) -def test_ilp_write_race_avoidance_private(): +def test_ilp_write_race_avoidance_private(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[j]: 0<=j<16 }", [ "<> a = 5+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") knl = lp.tag_inames(knl, dict(j="ilp")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + assert knl["loopy_kernel"].temporary_variables["a"].shape == (16,) # }}} @@ -354,11 +356,12 @@ def test_write_parameter(ctx_factory): lp.GlobalArg("b", dtype, shape=()), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) import pytest with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() + lp.generate_code_v2(knl).device_code() # {{{ arg guessing @@ -379,10 +382,11 @@ def test_arg_shape_guessing(ctx_factory): lp.GlobalArg("c", shape=lp.auto), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing(ctx_factory): @@ -395,10 +399,11 @@ def test_arg_guessing(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing_with_reduction(ctx_factory): @@ -413,16 +418,16 @@ def test_arg_guessing_with_reduction(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - 
print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -438,11 +443,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -459,10 +464,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -494,9 +500,9 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - #print(cknl.get_highlighted_code({"a": a.dtype})) - knl = lp.set_options(knl, write_cl=True) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + print(lp.generate_code_v2(knl)) knl(queue, a=a, b=b) import numpy.linalg as la @@ -514,18 +520,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." 
- ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -546,18 +550,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -598,10 +604,11 @@ def test_inames_deps_from_write_subscript(ctx_factory): [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), - "..."]) + "..."], + name="loopy_kernel") - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog["loopy_kernel"].insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -615,14 +622,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,11 +775,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -854,9 +855,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @@ -980,7 +979,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2274,10 +2264,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) @@ -2287,7 +2278,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = 
lp.make_kernel("{[i]: 0 <= i < 10}", """ for i a[i] = i {id=a} @@ -2302,24 +2293,26 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, "i", vecsize, inner_tag="l.0") + prog = lp.split_iname(prog, "i", vecsize, inner_tag="l.0") from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper + from loopy.type_inference import TypeReader from loopy.library.reduction import SegmentedSumReductionOperation from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog["loopy_kernel"].get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2481,7 +2477,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == {"n"} + assert knl["loopy_kernel"].all_params() == {"n"} def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2500,7 +2496,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2513,80 +2509,26 @@ def test_wildcard_dep_matching(): all_insns = {"insn%d" % i for i in range(1, 6)} - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - {"insn2"} - assert knl.id_to_insn["insn3"].depends_on == all_insns - {"insn3"} - 
assert knl.id_to_insn["insn4"].depends_on == {"insn1", "insn2"} - assert knl.id_to_insn["insn5"].depends_on == all_insns - {"insn1", "insn5"} - - -def test_preamble_with_separate_temporaries(ctx_factory): - # create a function mangler - - # and finally create a test - n = 5 - # for each entry come up with a random number of data points - num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32) - # turn into offsets - offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32) - # create lookup data - lookup = np.empty(0) - for i in num_data: - lookup = np.hstack((lookup, np.arange(i))) - lookup = np.asarray(lookup, dtype=np.int32) - # and create data array - data = np.random.rand(np.product(num_data)) - - # make kernel - kernel = lp.make_kernel("{[i]: 0 <= i < n}", - """ - for i - <>ind = indirect(offsets[i], offsets[i + 1], 1) - out[i] = data[ind] - end - """, - [lp.GlobalArg("out", shape=("n",)), - lp.TemporaryVariable( - "offsets", shape=(offsets.size,), initializer=offsets, - address_space=lp.AddressSpace.GLOBAL, - read_only=True), - lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)], - ) - - # fixt params, and add manglers / preamble - from testlib import ( - SeparateTemporariesPreambleTestMangler, - SeparateTemporariesPreambleTestPreambleGenerator, - ) - func_info = dict( - func_name="indirect", - func_arg_dtypes=(np.int32, np.int32, np.int32), - func_result_dtypes=(np.int32,), - arr=lookup - ) - - kernel = lp.fix_parameters(kernel, **{"n": n}) - kernel = lp.register_preamble_generators( - kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)]) - kernel = lp.register_function_manglers( - kernel, [SeparateTemporariesPreambleTestMangler(**func_info)]) - - print(lp.generate_code(kernel)[0]) - # and call (functionality unimportant, more that it compiles) - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - # check that it actually performs the lookup correctly - assert np.allclose(kernel( - 
queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1]) + assert prog["loopy_kernel"].id_to_insn["insn1"].depends_on == set() + assert (prog["loopy_kernel"].id_to_insn["insn2"].depends_on == all_insns - + {"insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn3"].depends_on == all_insns - + {"insn3"}) + assert (prog["loopy_kernel"].id_to_insn["insn4"].depends_on == {"insn1", + "insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn5"].depends_on == all_insns - + {"insn1", "insn5"}) def test_arg_inference_for_predicates(): - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ if incr[i] a = a + 1 end - """) + """, name="loopy_kernel") + + knl = prog["loopy_kernel"] assert "incr" in knl.arg_dict assert knl.arg_dict["incr"].shape == (10,) @@ -2611,7 +2553,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n {[i,j,k]: 0<=i,j {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ @@ -945,10 +1004,25 @@ def test_barrier_counter_barriers(): m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} - barrier_count = sync_map["barrier_local"].eval_with_dict(params) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 +def test_barrier_count_single(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + <> c[i] = 15*i {id=yoink} + c[i+1] = c[i] {dep=yoink} + """) + + knl = lp.tag_inames(knl, {"i": "l.0"}) + sync_map = lp.get_synchronization_map(knl) + print(sync_map) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() + assert barrier_count == 1 + + def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( @@ -975,21 +1049,21 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert 
sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize + assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 + assert sync_map.filter_by(kind="barrier_local").eval_and_sum(params) == 2*m/bsize op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, "mul", CG.SUBGROUP) + lp.Op(np.float32, "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, "add", CG.SUBGROUP) + lp.Op(np.float32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, "add", CG.SUBGROUP) + lp.Op(np.int32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1002,13 +1076,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, direction="load", variable="b", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1, 1: Variable("m")}, gid_strides={0: Variable("m")*bsize}, direction="load", - variable="a", count_granularity=CG.WORKITEM) + variable="a", count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1018,7 +1094,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1037,14 +1114,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable="a_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + 
kernel_name="matmul") ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), direction="load", lid_strides={0: 1}, gid_strides={}, variable="b_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1093,9 +1172,8 @@ def test_floor_div_coefficient_collector(): n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses - f32_local = lp.get_mem_access_map( - knl, count_redundant_work=True, subgroup_size=SGS - ).filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) + m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) + f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups @@ -1133,7 +1211,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction="load", variable="b", variable_tag="mmbload", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={1: Variable("m")}, @@ -1141,7 +1220,8 @@ def test_mem_access_tagged_variables(): direction="load", variable="a", variable_tag="mmaload", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1154,7 +1234,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", variable_tag="mmresult", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1319,6 +1400,85 @@ def test_strided_footprint(): assert 2*num < denom +def test_stats_on_callable_kernel(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, 
A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{:}", + """ + y[:] = matvec20x20(A[:,:], x[:]) + """, + [ + lp.GlobalArg("x,y", shape=(20,), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matvec") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 400 + + +def test_stats_on_callable_kernel_within_loop(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{[i]: 0<=i< 20}", + """ + y[i, :] = matvec20x20(A[:,:], x[i, :]) + """, + [ + lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matmat") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 8000 + + +def test_callable_kernel_with_substitution(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< n}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, + [lp.ValueArg("n"), ...], + name="matvec") + + caller = lp.make_kernel( + "{[i]: 0<=i< 20}", + """ + y[i, :] = matvec(20, A[:,:], x[i, :]) + """, + [ + lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matmat") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 8000 + + def test_no_loop_ops(): # See https://github.com/inducer/loopy/issues/211 diff --git a/test/test_target.py b/test/test_target.py index 
3bf625c3a25e9246151807f65e0ee02a8c139781..6cf0cdb99008b8a8b070f64c648aa01b275dea70 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -70,9 +70,7 @@ def test_ispc_target(occa_mode=False): knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - codegen_result = lp.generate_code_v2( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl))) + codegen_result = lp.generate_code_v2(knl) print(codegen_result.device_code()) print(codegen_result.host_code()) @@ -96,9 +94,8 @@ def test_cuda_target(): default_tag="l.auto") print( - lp.generate_code( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) + lp.generate_code_v2( + knl).device_code()) def test_generate_c_snippet(): @@ -138,10 +135,7 @@ def test_generate_c_snippet(): knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) knl = lp.prioritize_loops(knl, "I,k_outer,k_inner") - - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - print(lp.generate_body(knl)) + print(lp.generate_code_v2(knl)) @pytest.mark.parametrize("target", [CTarget, OpenCLTarget]) @@ -354,8 +348,7 @@ def test_ispc_streaming_stores(): knl = lp.set_argument_order(knl, vars + ["n"]) - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + lp.generate_code_v2(knl).all_code() assert "streaming_store(" in lp.generate_code_v2(knl).all_code() diff --git a/test/test_transform.py b/test/test_transform.py index b154a9468cc04f9595b369ef9ea90dac2074c20a..4853545db20547483bcac86f1ec181434061ee53 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -155,7 +155,7 @@ def test_to_batched_temp(ctx_factory): bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x") # checking that cnst is not being bathced - assert bknl.temporary_variables["cnst"].shape == () + assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == () a = np.random.randn(5, 5) x = np.random.randn(7, 5) @@ -260,18 +260,17 @@ def test_vectorize(ctx_factory): a[i] = temp """) 
knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.set_array_axis_names(knl, "a,b", "i") knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, split_kwargs=dict(slabs=(0, 1))) - knl = lp.tag_data_axes(knl, "a,b", "c,vec") + knl = lp.tag_array_axes(knl, "a,b", "c,vec") ref_knl = knl ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) knl = lp.tag_inames(knl, {"i_inner": "vec"}) knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) code, inf = lp.generate_code(knl) lp.auto_test_vs_ref( @@ -280,19 +279,19 @@ def test_vectorize(ctx_factory): def test_extract_subst(ctx_factory): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -495,30 +497,36 @@ def test_add_nosync(): <>tmp5[i] = 0 {id=insn5,groups=g1} tmp5[i] = 1 {id=insn6,conflicts=g1} - """) + """, name="nosync") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog["nosync"].id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # 
Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog["nosync"].id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -527,28 +535,30 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", [], name="lpy_knl") + new_root_kernel = prog["lpy_knl"].copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = {insn.id for insn in knl.instructions} + insn_ids = {insn.id for insn in prog["lpy_knl"].instructions} assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} - """) + """, name="splitter") - knl = lp.split_iname(knl, "i", 4, within="id:to_split") + prog = lp.split_iname(prog, "i", 4, within="id:to_split") - for insn in 
knl.instructions: + for insn in prog["splitter"].instructions: if insn.id == "to_split": assert insn.within_inames == frozenset({"i_outer", "i_inner"}) if insn.id == "not_to_split": @@ -559,7 +569,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -569,10 +579,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + t_unit = lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in t_unit.callables_table.values()) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, t_unit) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): @@ -665,12 +677,12 @@ def test_add_inames_for_unused_hw_axes(ctx_factory): knl = lp.add_inames_for_unused_hw_axes(knl) - assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) + assert (knl["rank_one"].id_to_insn["init_alpha"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["a_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["b_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], @@ -730,12 +742,13 @@ def test_rename_argument_with_assumptions(): knl = lp.assume(knl, "n_old=10") knl = lp.rename_argument(knl, "n_old", "n_new") + assumptions = knl["loopy_kernel"].assumptions - assert "n_old" not in knl.assumptions.get_var_dict() 
- assert "n_new" in knl.assumptions.get_var_dict() + assert "n_old" not in assumptions.get_var_dict() + assert "n_new" in assumptions.get_var_dict() assert ( - (knl.assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) - == knl.assumptions) + (assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) + == assumptions) def test_tag_iname_with_match_pattern(): @@ -747,6 +760,7 @@ def test_tag_iname_with_match_pattern(): """) knl = lp.tag_inames(knl, "i*:unr") + knl = knl["loopy_kernel"] i0_tag, = knl.inames["i0"].tags i1_tag, = knl.inames["i1"].tags @@ -772,6 +786,7 @@ def test_custom_iname_tag(): """) knl = lp.tag_inames(knl, {"ifuzz0": ElementLoopTag(), "ifuzz1": DOFLoopTag()}) + knl = knl["loopy_kernel"] ifuzz0_tag, = knl.inames["ifuzz0"].tags ifuzz1_tag, = knl.inames["ifuzz1"].tags diff --git a/test/testlib.py b/test/testlib.py index 35d51f72d2d7cf08dc5b92c8377c9c1578509e6d..847c7423ad9cd17ba4032bca5832d690c96dedaf 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,126 +9,48 @@ class GridOverride: self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + def __call__(self, insn_ids, callables_table, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, + callables_table, ignore_auto) return gsize, (self.vecsize,) # }}} -# {{{ test_preamble_with_separate_temporaries +# {{{ test_register_function_lookup -class SeparateTemporariesPreambleTestDataHolder: - def __init__(self, func_name, func_arg_dtypes, func_result_dtypes, arr): - self.func_name = func_name - self.func_arg_dtypes = func_arg_dtypes - self.func_result_dtypes = func_result_dtypes - self.arr = arr +class Log2Callable(lp.ScalarCallable): - def __eq__(self, other): - import numpy as np + def with_types(self, arg_id_to_dtype, callables_table): + + if 0 not 
in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + + if dtype.type == np.float32: + name_in_target = "log2f" + elif dtype.type == np.float64: + name_in_target = "log2" + pass + else: + raise TypeError(f"log2: unexpected type {dtype}") + + from loopy.types import NumpyType return ( - isinstance(other, type(self)) - and self.func_name == other.func_name - and self.func_arg_dtypes == other.func_arg_dtypes - and self.func_result_dtypes == other.func_result_dtypes - and np.array_equal(self.arr, other.arr)) - - def __ne__(self, other): - return not self.__eq__(other) - - -class SeparateTemporariesPreambleTestMangler( - SeparateTemporariesPreambleTestDataHolder): - def __call__(self, kernel, name, arg_dtypes): - """ - A function that will return a :class:`loopy.kernel.data.CallMangleInfo` - to interface with the calling :class:`loopy.LoopKernel` - """ - if name != self.func_name: - return None - - from loopy.types import to_loopy_type - from loopy.kernel.data import CallMangleInfo - - def __compare(d1, d2): - # compare dtypes ignoring atomic - return to_loopy_type(d1, for_atomic=True) == \ - to_loopy_type(d2, for_atomic=True) - - # check types - if len(arg_dtypes) != len(arg_dtypes): - raise Exception("Unexpected number of arguments provided to mangler " - "{}, expected {}, got {}".format( - self.func_name, len(self.func_arg_dtypes), - len(arg_dtypes))) - - for i, (d1, d2) in enumerate(zip(self.func_arg_dtypes, arg_dtypes)): - if not __compare(d1, d2): - raise Exception("Argument at index {} for mangler {} does not " - "match expected dtype. Expected {}, got {}". 
- format(i, self.func_name, str(d1), str(d2))) - - # get target for creation - target = arg_dtypes[0].target - return CallMangleInfo( - target_name=self.func_name, - result_dtypes=tuple(to_loopy_type(x, target=target) for x in - self.func_result_dtypes), - arg_dtypes=arg_dtypes) - - -class SeparateTemporariesPreambleTestPreambleGenerator( - SeparateTemporariesPreambleTestDataHolder): - def __call__(self, preamble_info): - - # find a function matching our name - func_match = next( - (x for x in preamble_info.seen_functions - if x.name == self.func_name), None) - desc = "custom_funcs_indirect" - if func_match is not None: - from loopy.types import to_loopy_type - # check types - if tuple(to_loopy_type(x) for x in self.func_arg_dtypes) == \ - func_match.arg_dtypes: - # if match, create our temporary - var = lp.TemporaryVariable( - "lookup", initializer=self.arr, dtype=self.arr.dtype, - shape=self.arr.shape, - address_space=lp.AddressSpace.GLOBAL, read_only=True) - # and code - code = """ - int {name}(int start, int end, int match) - {{ - int result = start; - for (int i = start + 1; i < end; ++i) - {{ - if (lookup[i] == match) - result = i; - }} - return result; - }} - """.format(name=self.func_name) - - # generate temporary variable code - from cgen import Initializer - from loopy.target.c import generate_array_literal - codegen_state = preamble_info.codegen_state.copy( - is_generating_device_code=True) - kernel = preamble_info.kernel - ast_builder = codegen_state.ast_builder - target = kernel.target - decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype) - decl = ast_builder.wrap_global_constant( - ast_builder.get_temporary_decl( - codegen_state, None, var, - decl_info)) - if var.initializer is not None: - decl = Initializer(decl, generate_array_literal( - codegen_state, var, var.initializer)) - # return generated code - yield (desc, "\n".join([str(decl), code])) + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + 
NumpyType(dtype)}), + callables_table) + # }}}