diff --git a/.gitignore b/.gitignore index 322e3a7caa7f229bccba899cb9bc993d95364b86..c5865116226b2cdc15ed4978548416090265e115 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ distribute*tar.gz core .coverage htmlcov +.ipynb_checkpoints diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..5e2805bd9cc8fbb64f63893093e45229d6803cb3 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,24 @@ +Python 2.7 AMD CPU: + script: + - py_version=2.7 + - cl_dev=amd_pu + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - amd-cl-cpu + except: + - tags +Python 3.4 AMD CPU: + script: + - py_version=3.4 + - cl_dev=amd_pu + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3.4 + - amd-cl-cpu + except: + - tags diff --git a/bin/loopy b/bin/loopy index 2a657174d4e390f80331689068b8d21e8eab0de7..cb223c31a1eef3244bd21310722f4b84a539d77c 100644 --- a/bin/loopy +++ b/bin/loopy @@ -29,13 +29,13 @@ def to_python_literal(value): def defines_to_python_code(defines_str): import re - DEFINE_RE = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$") + define_re = re.compile(r"^\#define\s+([a-zA-Z0-9_]+)\s+(.*)$") result = [] for l in defines_str.split("\n"): if not l.strip(): continue - match = DEFINE_RE.match(l) + match = define_re.match(l) if match is None: raise RuntimeError("#define not understood: '%s'" % l) @@ -71,23 +71,22 @@ def main(): with open(args.infile, "r") as infile_fd: infile_content = infile_fd.read() - # {{{ path wrangling + if args.lang == "loopy": + # {{{ path wrangling - from os.path import dirname, abspath - from os import getcwd + from os.path import dirname, abspath + from os import getcwd - infile_dirname = dirname(args.infile) - if infile_dirname: - infile_dirname = abspath(infile_dirname) - else: - infile_dirname = getcwd() + infile_dirname = dirname(args.infile) + if infile_dirname: + infile_dirname = abspath(infile_dirname) + else: + infile_dirname = getcwd() - import sys - sys.path.append(infile_dirname) + sys.path.append(infile_dirname) - # }}} + # }}} - if args.lang == "loopy": data_dic = {} data_dic["lp"] = lp data_dic["np"] = np @@ -131,9 +130,9 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - from loopy.frontend.fortran import f2loopy - kernels = f2loopy(infile_content, pre_transform_code=pre_transform_code, - use_c_preprocessor=(args.lang == "fpp")) + kernels = lp.parse_transformed_fortran( + infile_content, pre_transform_code=pre_transform_code, + filename=args.infile) if args.name is not None: kernels = [kernel for kernel in kernels diff --git a/build-helpers/loopy.spec b/build-helpers/loopy.spec index ec420760abd4d43f16771d4745be6fd43f3cabb3..e24a24db9bda96f149e4279a6c37c1603476c3a4 100644 --- a/build-helpers/loopy.spec +++ b/build-helpers/loopy.spec @@ -2,9 +2,10 @@ single_file = True +from os.path import expanduser a = Analysis(['bin/loopy'], - pathex=['/home/andreas/src/loopy'], + pathex=[expanduser('~/src/loopy')], hiddenimports=[], hookspath=None, runtime_hooks=None, diff --git a/build-helpers/run-pyinstaller.sh b/build-helpers/run-pyinstaller.sh index 660c20fd4be4be4513b36cbf7d51de1fcac3a388..50f9d85dccc503be2a2ccfb6c0e3d6aa28216981 100755 --- a/build-helpers/run-pyinstaller.sh +++ b/build-helpers/run-pyinstaller.sh @@ -2,7 +2,7 @@ # run this from the loopy root directory -rm -Rf dist/loopy +rm -Rf dist build pyinstaller \ --workpath=build/pyinstaller \ diff --git a/build-helpers/upload.sh b/build-helpers/upload.sh new file mode 100755 index 0000000000000000000000000000000000000000..57b8a873b9395954d76a8fd16f8ca9a261e8baa3 --- /dev/null +++ b/build-helpers/upload.sh @@ -0,0 +1,5 @@ +#! /bin/bash + +set -e + +scp "$1" tiker.net:public_html/pub/loopy-binaries/ diff --git a/doc/reference.rst b/doc/reference.rst index 9d75a924b71847d04632b4945f9228c80c2e01f4..6ed83d43084b5578c817964b13097b4aa78294ef 100644 --- a/doc/reference.rst +++ b/doc/reference.rst @@ -334,8 +334,16 @@ function, which is responsible for creating kernels: .. autofunction:: make_kernel +.. autofunction:: parse_fortran + +.. autofunction:: parse_transformed_fortran + .. autofunction:: make_copy_kernel +.. autofunction:: fuse_kernels + +.. autofunction:: c_preprocess + Transforming Kernels -------------------- @@ -429,6 +437,8 @@ Manipulating Instructions .. autofunction:: remove_instructions +.. autofunction:: tag_instructions + Library interface ^^^^^^^^^^^^^^^^^ diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 226868317b71474b9e1866a4c0ae5df82080f9dc..e566365f8d520d33f518bc95cbaa43e5af6e747c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -22,8 +22,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a >>> import pyopencl as cl >>> import pyopencl.array >>> import pyopencl.clrandom + >>> import loopy as lp - >>> from pytools import StderrToStdout as IncludeWarningsInDoctest + >>> lp.set_caching_enabled(False) + + >>> from warnings import filterwarnings, catch_warnings + >>> filterwarnings('error', category=lp.LoopyWarning) >>> ctx = cl.create_some_context(interactive=False) >>> queue = cl.CommandQueue(ctx) @@ -97,9 +101,9 @@ always see loopy's view of a kernel by printing it. KERNEL: loopy_kernel --------------------------------------------------------------------------- ARGUMENTS: - a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1) + a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) n: ValueArg, type: <runtime> - out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1) + out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) --------------------------------------------------------------------------- DOMAINS: [n] -> { [i] : i >= 0 and i <= -1 + n } @@ -168,14 +172,13 @@ by passing :attr:`loopy.Options.write_cl`. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; } @@ -246,14 +249,13 @@ call :func:`loopy.generate_code`: >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print code - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; } @@ -361,16 +363,16 @@ Let us take a look at the generated code for the above kernel: .. doctest:: >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_loop_priority(knl, "i,j") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) + for (int i = 0; i <= -1 + n; ++i) + for (int j = 0; j <= -1 + n; ++j) { out[n * j + i] = a[n * i + j]; out[n * i + j] = 2.0f * out[n * i + j]; @@ -382,10 +384,8 @@ still not right: .. doctest:: - >>> assert (out.get() == a_mat_dev.get().T*2).all() - Traceback (most recent call last): - ... - AssertionError + >>> print((out.get() == a_mat_dev.get().T*2).all()) + False For the kernel to perform the desired computation, *all instances* (loop iterations) of the first instruction need to be completed, @@ -406,6 +406,7 @@ with identical bounds, for the use of the transpose: ... out[j,i] = a[i,j] {id=transpose} ... out[ii,jj] = 2*out[ii,jj] {dep=transpose} ... """) + >>> knl = lp.set_loop_priority(knl, "i,j,ii,jj") :func:`loopy.duplicate_inames` can be used to achieve the same goal. Now the intended code is generated and our test passes. @@ -414,18 +415,17 @@ Now the intended code is generated and our test passes. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) + for (int i = 0; i <= -1 + n; ++i) + for (int j = 0; j <= -1 + n; ++j) out[n * j + i] = a[n * i + j]; - for (int ii = 0; ii <= (-1 + n); ++ii) - for (int jj = 0; jj <= (-1 + n); ++jj) + for (int ii = 0; ii <= -1 + n; ++ii) + for (int jj = 0; jj <= -1 + n; ++jj) out[n * ii + jj] = 2.0f * out[n * ii + jj]; } >>> assert (out.get() == a_mat_dev.get().T*2).all() @@ -443,6 +443,10 @@ control is the nesting of loops. For example, should the *i* loop be nested around the *j* loop, or the other way around, in the following simple zero-fill kernel? +It turns out that Loopy will typically choose a loop nesting for us, but it +does not like doing so. In this tutorial (where we've turned Loopy's warnings +into errors), we are told what is wrong in no uncertain terms:: + .. doctest:: >>> knl = lp.make_kernel( @@ -453,18 +457,9 @@ zero-fill kernel? >>> knl = lp.set_options(knl, "write_cl") - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> - ... - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) - a[n * i + j] = 0.0f; + >>> evt, (out,) = knl(queue, a=a_mat_dev) + Traceback (most recent call last): ... - -Loopy has chosen a loop nesting for us, but it did not like doing so, as it -also issued the following warning:: - LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring This is easily resolved: @@ -481,12 +476,11 @@ ambiguous. .. doctest:: - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + >>> evt, (out,) = knl(queue, a=a_mat_dev) + #define lid(N) ((int) get_local_id(N)) ... - for (int j = 0; j <= (-1 + n); ++j) - for (int i = 0; i <= (-1 + n); ++i) + for (int j = 0; j <= -1 + n; ++j) + for (int i = 0; i <= -1 + n; ++i) a[n * i + j] = 0.0f; ... @@ -536,15 +530,16 @@ Consider this example: >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", - ... "a[i] = 0", assumptions="n>=0") + ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) - if ((-1 + -1 * i_inner + -16 * i_outer + n) >= 0) + if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) a[i_inner + i_outer * 16] = 0.0f; ... @@ -567,12 +562,11 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... for (int i_inner = 0; i_inner <= 15; ++i_inner) - if ((-1 + n) >= 0) - for (int i_outer = 0; i_outer <= (-1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16)); ++i_outer) - a[i_inner + i_outer * 16] = 0.0f; + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[i_inner + i_outer * 16] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -593,10 +587,10 @@ commonly called 'loop tiling': >>> knl = lp.set_loop_priority(knl, "i_outer,j_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer) - for (int j_outer = 0; j_outer <= (-1 + ((15 + n) / 16)); ++j_outer) + for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer) + for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) for (int j_inner = 0; j_inner <= 15; ++j_inner) out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16]; @@ -632,11 +626,13 @@ loop's tag to ``"unr"``: >>> orig_knl = knl >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) ) + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; a[1 + i_outer * 4] = 0.0f; @@ -708,12 +704,12 @@ Let's try this out on our vector fill kernel by creating workgroups of size ... outer_tag="g.0", inner_tag="l.0") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) { <BLANKLINE> - if ((-1 + -128 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[lid(0) + gid(0) * 128] = 0.0f; } @@ -752,18 +748,19 @@ assumption: >>> orig_knl = knl >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; - if ((-2 + -4 * i_outer + n) >= 0) + if (-2 + -4 * i_outer + n >= 0) a[1 + i_outer * 4] = 0.0f; - if ((-3 + -4 * i_outer + n) >= 0) + if (-3 + -4 * i_outer + n >= 0) a[2 + i_outer * 4] = 0.0f; - if ((-4 + -4 * i_outer + n) >= 0) + if (-4 + -4 * i_outer + n >= 0) a[3 + i_outer * 4] = 0.0f; } ... @@ -781,11 +778,12 @@ enabling some cost savings: >>> knl = orig_knl >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr") >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... /* bulk slab for 'i_outer' */ - for (int i_outer = 0; i_outer <= (-2 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; a[1 + i_outer * 4] = 0.0f; @@ -793,15 +791,15 @@ enabling some cost savings: a[3 + i_outer * 4] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = (-1 + n + -1 * (3 * n / 4)); i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) - if ((-1 + n) >= 0) + for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) + if (-1 + n >= 0) { a[0 + i_outer * 4] = 0.0f; - if ((-2 + -4 * i_outer + n) >= 0) + if (-2 + -4 * i_outer + n >= 0) a[1 + i_outer * 4] = 0.0f; - if ((-3 + -4 * i_outer + n) >= 0) + if (-3 + -4 * i_outer + n >= 0) a[2 + i_outer * 4] = 0.0f; - if ((4 + 4 * i_outer + -1 * n) == 0) + if (4 + 4 * i_outer + -1 * n == 0) a[3 + i_outer * 4] = 0.0f; } ... @@ -871,12 +869,12 @@ memory, local to each work item. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... { float a_temp; <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) { a_temp = sin(a[i]); out2[i] = sqrt(1.0f + -1.0f * a_temp * a_temp); @@ -929,19 +927,19 @@ Consider the following example: >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... { __local float a_temp[16]; float acc_k; <BLANKLINE> - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; } barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */; - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_temp[k]; @@ -992,7 +990,7 @@ transformation exists in :func:`loopy.add_prefetch`: >>> knl = lp.set_options(knl, "write_cl") >>> knl_pf = lp.add_prefetch(knl, "a") >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... a_fetch_0 = a[16 * gid(0) + lid(0)]; for (int k = 0; k <= 15; ++k) @@ -1015,12 +1013,12 @@ earlier: >>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"]) >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)]; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn_k_update depends on a_fetch) */; - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch_0[lid(0)]; @@ -1145,18 +1143,20 @@ sign that something is amiss: .. doctest:: - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - /...: WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable) - warn(text, type) + >>> evt, (out,) = knl(queue, a=a_mat_dev) + Traceback (most recent call last): + ... + WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable) When we ask to see the code, the issue becomes apparent: .. doctest:: >>> knl = lp.set_options(knl, "write_cl") - >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + >>> from warnings import catch_warnings + >>> with catch_warnings(): + ... filterwarnings("always", category=lp.LoopyWarning) + ... evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> diff --git a/examples/fortran/IPython Integration Demo.ipynb b/examples/fortran/IPython Integration Demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c2b34f1d13f7b4971c0c2e88a0ae5013ed12ee23 --- /dev/null +++ b/examples/fortran/IPython Integration Demo.ipynb @@ -0,0 +1,190 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:c9f8334aa7aa4a5ad1437fa5871aafa52bbc9131271d9e90e7be47d22725cc94" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loopy IPython Integration Demo" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%load_ext loopy.ipython_ext" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Without transform code" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%%fortran_kernel\n", + "\n", + "subroutine fill(out, a, n)\n", + " implicit none\n", + "\n", + " real*8 a, out(n)\n", + " integer n, i\n", + "\n", + " do i = 1, n\n", + " out(i) = a\n", + " end do\n", + "end" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(fill)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: fill\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "a: ValueArg, type: float64\n", + "n: ValueArg, type: int32\n", + "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[n] -> { [i] : i >= 0 and i <= -1 + n }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i: None\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "[i] out[i] <- a # insn0\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## With transform code" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "split_amount = 128" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%%transformed_fortran_kernel\n", + "\n", + "subroutine tr_fill(out, a, n)\n", + " implicit none\n", + "\n", + " real*8 a, out(n)\n", + " integer n, i\n", + "\n", + " do i = 1, n\n", + " out(i) = a\n", + " end do\n", + "end\n", + "\n", + "!$loopy begin\n", + "!\n", + "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", + "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", + "! RESULT = [tr_fill]\n", + "!\n", + "!$loopy end" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print(tr_fill)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: tr_fill\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "a: ValueArg, type: float64\n", + "n: ValueArg, type: int32\n", + "out: GlobalArg, type: float64, shape: (n), dim_tags: (N0:stride:1)\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[n] -> { [i_outer, i_inner] : i_inner >= -128i_outer and i_inner <= -1 + n - 128i_outer and i_inner >= 0 and i_inner <= 127 }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i_inner: l.0\n", + "i_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "[i_inner,i_outer] out[i_inner + i_outer*128] <- a # insn0\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/examples/fortran/foo.floopy b/examples/fortran/foo.floopy index 7d2d3eef0570e448ad8e9e7113e2470e1d4ef64d..6b8741e1150a05226f3aa9de849db8abb1915002 100644 --- a/examples/fortran/foo.floopy +++ b/examples/fortran/foo.floopy @@ -1,24 +1,30 @@ subroutine fill(out, a, n) implicit none - real*8 a, out(n) - integer n + real_type a, out(n) + integer n, i do i = 1, n out(i) = a end do do i = 1, n - out(i) = out(i) * 2 + out(i) = out(i) * factor end do end -!$loopy begin transform +!$loopy begin ! +! SOURCE = lp.c_preprocess(SOURCE, [ +! "factor 4.0", +! "real_type real*8", +! ]) +! fill, = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") +! RESULT = [fill] ! -!$loopy end transform +!$loopy end ! vim:filetype=floopy diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index ea85b0c6b198c8bac6270098696504fc9006fd14..3352449d7bfcc406e5349f36c5cb7ebb3e3613b6 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -12,7 +12,8 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end do end subroutine -!$loopy begin transform +!$loopy begin +! dgemm, = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -23,4 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner") ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner") -!$loopy end transform +! RESULT = [dgemm] +!$loopy end diff --git a/examples/fortran/outerprod.py b/examples/fortran/outerprod.py deleted file mode 100644 index 4122c84376fcc676023b3a4a01b0728c70b18b61..0000000000000000000000000000000000000000 --- a/examples/fortran/outerprod.py +++ /dev/null @@ -1,8 +0,0 @@ -lp_knl = lp.make_kernel( - "{[i,j]: 0<=i,j<n}", - "c[i,j] = a[i]*b[j]") - -lp_knl = lp.add_dtypes(lp_knl, {"a": np.float64, "b": np.float64}) -lp_knl = lp.split_iname(lp_knl, "i", 16, outer_tag="g.0", inner_tag="l.0") -lp_knl = lp.split_iname(lp_knl, "j", 16, outer_tag="g.1", inner_tag="l.1") - diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 924e0aa4abe51c4dd84b01cad0cb83b56122c97d..18542e6b0403a7ab475b3e357f18489847367c3d 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -6,6 +6,7 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) real*8 x(n), y(n), rowsum integer m, n, rowstart, rowend, length, nvals + integer i, j do i = 1, m rowstart = rowstarts(i) @@ -21,10 +22,12 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end do end -!$loopy begin transform +!$loopy begin +! sparse, = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -!$loopy end transform +! RESULT = [sparse] +!$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index f4b4e28eab3ddd544c791a088279718ef5221bb9..40b487528cdd788f26fb4b600869e4bf81f090a8 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -2,7 +2,7 @@ subroutine fill(out, a, n) implicit none real*8 a, out(n) - integer n + integer n, i !$loopy begin tagged: init do i = 1, n @@ -14,11 +14,12 @@ subroutine fill(out, a, n) end do end -!$loopy begin transform -! +!$loopy begin +! fill, = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -!$loopy end transform +! RESULT = [fill] +!$loopy end ! vim:filetype=floopy diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index 953432d2253e90e396385cd53f4c0f2d81d70e5a..c5784b63492063bfd2a9604c42dbf65b2ecb86bf 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -65,8 +65,9 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ ) end subroutine volumeKernel -!$loopy begin transform +!$loopy begin ! +! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -75,5 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) +! RESULT = [volumeKernel] ! -!$loopy end transform +!$loopy end diff --git a/examples/fortran/volumeKernelSimple.floopy b/examples/fortran/volumeKernelSimple.floopy index 67948020d6fce06e858718fa9cca366d78b33295..afc3321b8fa18ff532b3c2898dbaea8f99e3700c 100644 --- a/examples/fortran/volumeKernelSimple.floopy +++ b/examples/fortran/volumeKernelSimple.floopy @@ -27,10 +27,12 @@ subroutine volumeKernel(elements, Nfields, Ngeo, Ndim, Dop, geo, Q, rhsQ ) end subroutine volumeKernel -!$loopy begin transform +!$loopy begin ! +! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.fix_parameters(volumeKernel, ! Nq=5, Ndim=3) ! volumeKernel = lp.tag_inames(volumeKernel, dict(i="l.0")) +! RESULT = [volumeKernel] ! -!$loopy end transform +!$loopy end diff --git a/examples/python/hello-loopy-lp.py b/examples/python/hello-loopy-lp.py new file mode 100644 index 0000000000000000000000000000000000000000..382aadc097352ab7328b9ef956e0c0379d06a2cb --- /dev/null +++ b/examples/python/hello-loopy-lp.py @@ -0,0 +1,11 @@ +# This is a version of hello-loopy.py that can be run through +# a loopy binary using +# +# ./loopy --lang=loopy hello-loopy-lp.py - + +knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + "out[i] = 2*a[i]") + +knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) +lp_knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") diff --git a/loopy/__init__.py b/loopy/__init__.py index 12329bbf6472867c873958f1a18f20ba1016b5f7..dcd8e27c1eaf02a0f7aee133598b0e2fd34f70b9 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,8 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -27,13 +23,16 @@ THE SOFTWARE. """ +import six +from six.moves import range, zip + import islpy as isl from islpy import dim_type from loopy.symbolic import (RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, TaggedVariable, Reduction, LinearSubscript, ) -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, LoopyWarning # {{{ imported user interface @@ -58,6 +57,7 @@ from loopy.library.reduction import register_reduction_parser from loopy.subst import extract_subst, expand_subst, temporary_to_subst from loopy.precompute import precompute from loopy.buffer import buffer_array +from loopy.fusion import fuse_kernels from loopy.padding import (split_arg_axis, find_padding_multiple, add_padding) from loopy.preprocess import (preprocess_kernel, realize_reduction, @@ -67,6 +67,8 @@ from loopy.codegen import generate_code, generate_body from loopy.compiled import CompiledKernel from loopy.options import Options from loopy.auto_test import auto_test_vs_ref +from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, + parse_fortran) __all__ = [ "TaggedVariable", "Reduction", "LinearSubscript", @@ -88,6 +90,7 @@ __all__ = [ "extract_subst", "expand_subst", "temporary_to_subst", "precompute", "buffer_array", + "fuse_kernels", "split_arg_axis", "find_padding_multiple", "add_padding", "get_dot_dependency_graph", @@ -106,6 +109,9 @@ __all__ = [ "Options", "make_kernel", + "c_preprocess", "parse_transformed_fortran", "parse_fortran", + + "LoopyError", "LoopyWarning", # {{{ from this file @@ -539,7 +545,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return var(new_name) def map_instruction(self, insn): - if not self.within(((insn.id, None),)): + if not self.within(((insn.id, insn.tags),)): return insn new_fid = frozenset( @@ -1458,7 +1464,8 @@ def register_function_manglers(kernel, manglers): # {{{ cache control -CACHING_ENABLED = True +import os +CACHING_ENABLED = "LOOPY_NO_CACHE" not in os.environ def set_caching_enabled(flag): @@ -1781,4 +1788,23 @@ def fold_constants(kernel): # }}} + +# {{{ tag_instructions + +def tag_instructions(kernel, new_tag, within=None): + from loopy.context_matching import parse_stack_match + within = parse_stack_match(within) + + new_insns = [] + for insn in kernel.instructions: + if within(((insn.id, insn.tags),)): + new_insns.append( + insn.copy(tags=insn.tags + (new_tag,))) + else: + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index dcea94690c2cca630b40a870f777d884715f1dd4..33cdc2118e389ae7b04e43e28a4c6d0e62011751 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -277,7 +277,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from loopy.codegen.bounds import get_usable_inames_for_conditional - # Note: this does note include loop_iname itself! + # Note: this does not include loop_iname itself! usable_inames = get_usable_inames_for_conditional(kernel, sched_index) domain = kernel.get_inames_domain(loop_iname) @@ -299,7 +299,6 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): dom_and_slab, assumptions_non_param = isl.align_two( dom_and_slab, assumptions_non_param) dom_and_slab = dom_and_slab & assumptions_non_param - del assumptions_non_param # move inames that are usable into parameters moved_inames = [] @@ -317,14 +316,23 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): static_min_of_pw_aff, static_max_of_pw_aff) - static_lbound = static_min_of_pw_aff( + lbound = ( kernel.cache_manager.dim_min( - dom_and_slab, loop_iname_idx).coalesce(), - constants_only=False, prefer_constants=False) + dom_and_slab, loop_iname_idx) + .gist(kernel.assumptions) + .coalesce()) + ubound = ( + kernel.cache_manager.dim_max( + dom_and_slab, loop_iname_idx) + .gist(kernel.assumptions) + .coalesce()) + + static_lbound = static_min_of_pw_aff( + lbound, + constants_only=False) static_ubound = static_max_of_pw_aff( - kernel.cache_manager.dim_max( - dom_and_slab, loop_iname_idx).coalesce(), - constants_only=False, prefer_constants=False) + ubound, + constants_only=False) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 56d7f6706c8932a9a97202768bdd0053c5c36d10..93cbd125de96e0481a3f466c0a7e96dd04c89cd7 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -37,11 +37,11 @@ class LoopyAdvisory(LoopyWarningBase): pass -class ParameterFinderWarning(LoopyWarningBase): +class ParameterFinderWarning(LoopyWarning): pass -class WriteRaceConditionWarning(LoopyWarningBase): +class WriteRaceConditionWarning(LoopyWarning): pass # }}} diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 1a610b63cb35f4931e41446f00c16a182de8430a..1c3d1283541c33ce3944e358f4abd451e2c881ee 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -25,46 +25,209 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError -def f2loopy(source, free_form=True, strict=True, - pre_transform_code=None, pre_transform_code_context=None, - use_c_preprocessor=False, - file_name="<floopy code>"): - if use_c_preprocessor: - try: - import ply.lex as lex - import ply.cpp as cpp - except ImportError: - raise LoopyError("Using the C preprocessor requires PLY to be installed") +def c_preprocess(source, defines=None, filename="<floopy source>"): + """ + :arg source: a string, possibly containing C preprocessor constructs + :arg defines: a list of strings as they might occur after a + C-style ``#define`` directive, for example ``deg2rad(x) (x/180d0 * 3.14d0)``. + :return: a string + """ + try: + import ply.lex as lex + import ply.cpp as cpp + except ImportError: + raise LoopyError("Using the C preprocessor requires PLY to be installed") - lexer = lex.lex(cpp) + lexer = lex.lex(cpp) - from ply.cpp import Preprocessor - p = Preprocessor(lexer) - p.parse(source, file_name) + from ply.cpp import Preprocessor + p = Preprocessor(lexer) - tokens = [] - while True: - tok = p.token() + if defines: + for d in defines: + p.define(d) - if not tok: - break + p.parse(source, filename) - if tok.type == "CPP_COMMENT": - continue + tokens = [] + while True: + tok = p.token() - tokens.append(tok.value) + if not tok: + break - source = "".join(tokens) + if tok.type == "CPP_COMMENT": + continue + tokens.append(tok.value) + + return "".join(tokens) + + +def _extract_loopy_lines(source): + lines = source.split("\n") + + import re + comment_re = re.compile(r"^\s*\!(.*)$") + + remaining_lines = [] + loopy_lines = [] + + in_loopy_code = False + for l in lines: + comment_match = comment_re.match(l) + + if comment_match is None: + if in_loopy_code: + raise LoopyError("non-comment source line in loopy block") + + remaining_lines.append(l) + continue + + cmt = comment_match.group(1) + cmt_stripped = cmt.strip() + + if cmt_stripped == "$loopy begin": + if in_loopy_code: + raise LoopyError("can't enter loopy block twice") + in_loopy_code = True + + elif cmt_stripped == "$loopy end": + if not in_loopy_code: + raise LoopyError("can't leave loopy block twice") + in_loopy_code = False + + elif in_loopy_code: + loopy_lines.append(cmt) + + else: + remaining_lines.append(l) + + return "\n".join(remaining_lines), "\n".join(loopy_lines) + + +def parse_transformed_fortran(source, free_form=True, strict=True, + pre_transform_code=None, transform_code_context=None, + filename="<floopy code>"): + """ + :arg source: a string of Fortran source code which must include + a snippet of transform code as described below. + :arg pre_transform_code: code that is run in the same context + as the transform + + *source* may contain snippets of loopy transform code between markers:: + + !$loopy begin + ! ... + !$loopy end + + Within the transform code, the following symbols are predefined: + + * ``lp``: a reference to the :mod:`loopy` package + * ``np``: a reference to the :mod:`numpy` package + * ``SOURCE``: the source code surrounding the transform block. + This may be processed using :func:`c_preprocess` and + :func:`parse_fortran`. + * ``FILENAME``: the file name of the code being processed + + The transform code must define ``RESULT``, conventionally a list of + kernels, which is returned from this function unmodified. + + An example of *source* may look as follows:: + + subroutine fill(out, a, n) + implicit none + + real*8 a, out(n) + integer n, i + + do i = 1, n + out(i) = a + end do + end + + !$loopy begin + ! + ! fill, = lp.parse_fortran(SOURCE, FILENAME) + ! fill = lp.split_iname(fill, "i", split_amount, + ! outer_tag="g.0", inner_tag="l.0") + ! RESULT = [fill] + ! + !$loopy end + """ + + source, transform_code = _extract_loopy_lines(source) + if not transform_code: + raise LoopyError("no transform code found") + + from loopy.tools import remove_common_indentation + transform_code = remove_common_indentation( + transform_code, + require_leading_newline=False) + + if transform_code_context is None: + proc_dict = {} + else: + proc_dict = transform_code_context.copy() + + import loopy as lp + import numpy as np + + proc_dict["lp"] = lp + proc_dict["np"] = np + + proc_dict["SOURCE"] = source + proc_dict["FILENAME"] = filename + + from os.path import dirname, abspath + from os import getcwd + + infile_dirname = dirname(filename) + if infile_dirname: + infile_dirname = abspath(infile_dirname) + else: + infile_dirname = getcwd() + + import sys + prev_sys_path = sys.path + try: + if infile_dirname: + sys.path = prev_sys_path + [infile_dirname] + + if pre_transform_code is not None: + proc_dict["_MODULE_SOURCE_CODE"] = pre_transform_code + exec(compile(pre_transform_code, + "<loopy pre-transform code>", "exec"), proc_dict) + + proc_dict["_MODULE_SOURCE_CODE"] = transform_code + exec(compile(transform_code, filename, "exec"), proc_dict) + + finally: + sys.path = prev_sys_path + + if "RESULT" not in proc_dict: + raise LoopyError("transform code did not set RESULT") + + return proc_dict["RESULT"] + + +def parse_fortran(source, filename="<floopy code>", free_form=True, strict=True): + """ + :returns: a list of :class:`loopy.LoopKernel` objects + """ from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, analyze=False, ignore_comments=False) + if tree is None: + raise LoopyError("Fortran parser was unhappy with source code " + "and returned invalid data (Sorry!)") + from loopy.frontend.fortran.translator import F2LoopyTranslator - f2loopy = F2LoopyTranslator(file_name) + f2loopy = F2LoopyTranslator(filename) f2loopy(tree) - return f2loopy.make_kernels(pre_transform_code=pre_transform_code, - pre_transform_code_context=pre_transform_code_context) + return f2loopy.make_kernels() + # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index abd925c14fe9ebf14342ac5ac25e2062291feefc..895606aa38df3bac78e05d18822a5dbf494d0948 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -35,6 +35,7 @@ from loopy.frontend.fortran.diagnostic import ( import islpy as isl from islpy import dim_type from loopy.symbolic import IdentityMapper +from loopy.diagnostic import LoopyError from pymbolic.primitives import Wildcard @@ -193,24 +194,6 @@ class Scope(object): # }}} -def remove_common_indentation(lines): - while lines and lines[0].strip() == "": - lines.pop(0) - while lines and lines[-1].strip() == "": - lines.pop(-1) - - if lines: - base_indent = 0 - while lines[0][base_indent] in " \t": - base_indent += 1 - - for line in lines[1:]: - if line[:base_indent].strip(): - raise ValueError("inconsistent indentation") - - return "\n".join(line[base_indent:] for line in lines) - - # {{{ translator class F2LoopyTranslator(FTreeWalkerBase): @@ -225,17 +208,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.kernels = [] - # Flag to record whether 'loopy begin transform' comment - # has been seen. - self.in_transform_code = False - self.instruction_tags = [] self.conditions = [] - self.transform_code_lines = [] - self.filename = filename + self.index_dtype = None + + self.block_nest = [] + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] @@ -276,6 +257,7 @@ class F2LoopyTranslator(FTreeWalkerBase): scope = Scope(node.name, list(node.args)) self.scope_stack.append(scope) + self.block_nest.append("sub") for c in node.content: self.rec(c) @@ -284,6 +266,11 @@ class F2LoopyTranslator(FTreeWalkerBase): self.kernels.append(scope) def map_EndSubroutine(self, node): + if not self.block_nest: + raise TranslationError("no subroutine started at this point") + if self.block_nest.pop() != "sub": + raise TranslationError("mismatched end subroutine") + return [] def map_Implicit(self, node): @@ -471,6 +458,7 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_name) + self.block_nest.append("if") for c in node.content: self.rec(c) @@ -479,100 +467,119 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append("!" + cond_name) def map_EndIfThen(self, node): + if not self.block_nest: + raise TranslationError("no if block started at end do") + if self.block_nest.pop() != "if": + raise TranslationError("mismatched end if") + self.conditions.pop() def map_Do(self, node): scope = self.scope_stack[-1] - if node.loopcontrol: - loop_var, loop_bounds = node.loopcontrol.split("=") - loop_var = loop_var.strip() - scope.use_name(loop_var) - loop_bounds = self.parse_expr( - node, - loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS) - - if len(loop_bounds) == 2: - start, stop = loop_bounds - step = 1 - elif len(loop_bounds) == 3: - start, stop, step = loop_bounds - else: - raise RuntimeError("loop bounds not understood: %s" - % node.loopcontrol) + if not node.loopcontrol: + raise NotImplementedError("unbounded do loop") - if step != 1: - raise NotImplementedError( - "do loops with non-unit stride") + loop_var, loop_bounds = node.loopcontrol.split("=") + loop_var = loop_var.strip() - if not isinstance(step, int): - raise TranslationError( - "non-constant steps not supported: %s" % step) - - from loopy.symbolic import get_dependencies - loop_bound_deps = ( - get_dependencies(start) - | get_dependencies(stop) - | get_dependencies(step)) - - # {{{ find a usable loopy-side loop name - - loopy_loop_var = loop_var - loop_var_suffix = None - while True: - already_used = False - for iset in scope.index_sets: - if loopy_loop_var in iset.get_var_dict(dim_type.set): - already_used = True - break - - if not already_used: + iname_dtype = scope.get_type(loop_var) + if self.index_dtype is None: + self.index_dtype = iname_dtype + else: + if self.index_dtype != iname_dtype: + raise LoopyError("type of '%s' (%s) does not agree with prior " + "index type (%s)" + % (loop_var, iname_dtype, self.index_dtype)) + + scope.use_name(loop_var) + loop_bounds = self.parse_expr( + node, + loop_bounds, min_precedence=self.expr_parser._PREC_FUNC_ARGS) + + if len(loop_bounds) == 2: + start, stop = loop_bounds + step = 1 + elif len(loop_bounds) == 3: + start, stop, step = loop_bounds + else: + raise RuntimeError("loop bounds not understood: %s" + % node.loopcontrol) + + if step != 1: + raise NotImplementedError( + "do loops with non-unit stride") + + if not isinstance(step, int): + raise TranslationError( + "non-constant steps not supported: %s" % step) + + from loopy.symbolic import get_dependencies + loop_bound_deps = ( + get_dependencies(start) + | get_dependencies(stop) + | get_dependencies(step)) + + # {{{ find a usable loopy-side loop name + + loopy_loop_var = loop_var + loop_var_suffix = None + while True: + already_used = False + for iset in scope.index_sets: + if loopy_loop_var in iset.get_var_dict(dim_type.set): + already_used = True break - if loop_var_suffix is None: - loop_var_suffix = 0 + if not already_used: + break - loop_var_suffix += 1 - loopy_loop_var = loop_var + "_%d" % loop_var_suffix + if loop_var_suffix is None: + loop_var_suffix = 0 - # }}} + loop_var_suffix += 1 + loopy_loop_var = loop_var + "_%d" % loop_var_suffix - space = isl.Space.create_from_names(self.isl_context, - set=[loopy_loop_var], params=list(loop_bound_deps)) - - from loopy.isl_helpers import iname_rel_aff - from loopy.symbolic import aff_from_expr - index_set = ( - isl.BasicSet.universe(space) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(space, - loopy_loop_var, ">=", - aff_from_expr(space, 0)))) - .add_constraint( - isl.Constraint.inequality_from_aff( - iname_rel_aff(space, - loopy_loop_var, "<=", - aff_from_expr(space, stop-start))))) - - from pymbolic import var - scope.active_iname_aliases[loop_var] = \ - var(loopy_loop_var) + start - scope.active_loopy_inames.add(loopy_loop_var) - - scope.index_sets.append(index_set) - - for c in node.content: - self.rec(c) - - del scope.active_iname_aliases[loop_var] - scope.active_loopy_inames.remove(loopy_loop_var) + # }}} - else: - raise NotImplementedError("unbounded do loop") + space = isl.Space.create_from_names(self.isl_context, + set=[loopy_loop_var], params=list(loop_bound_deps)) + + from loopy.isl_helpers import iname_rel_aff + from loopy.symbolic import aff_from_expr + index_set = ( + isl.BasicSet.universe(space) + .add_constraint( + isl.Constraint.inequality_from_aff( + iname_rel_aff(space, + loopy_loop_var, ">=", + aff_from_expr(space, 0)))) + .add_constraint( + isl.Constraint.inequality_from_aff( + iname_rel_aff(space, + loopy_loop_var, "<=", + aff_from_expr(space, stop-start))))) + + from pymbolic import var + scope.active_iname_aliases[loop_var] = \ + var(loopy_loop_var) + start + scope.active_loopy_inames.add(loopy_loop_var) + + scope.index_sets.append(index_set) + + self.block_nest.append("do") + + for c in node.content: + self.rec(c) + + del scope.active_iname_aliases[loop_var] + scope.active_loopy_inames.remove(loopy_loop_var) def map_EndDo(self, node): - pass + if not self.block_nest: + raise TranslationError("no do loop started at end do") + if self.block_nest.pop() != "do": + raise TranslationError("mismatched end do") def map_Continue(self, node): raise NotImplementedError("continue") @@ -593,17 +600,7 @@ class F2LoopyTranslator(FTreeWalkerBase): faulty_loopy_pragma_match = self.faulty_loopy_pragma.match( stripped_comment_line) - if stripped_comment_line == "$loopy begin transform": - if self.in_transform_code: - raise TranslationError("can't enter transform code twice") - self.in_transform_code = True - - elif stripped_comment_line == "$loopy end transform": - if not self.in_transform_code: - raise TranslationError("can't leave transform code twice") - self.in_transform_code = False - - elif begin_tag_match: + if begin_tag_match: tag = begin_tag_match.group(1) if tag in self.instruction_tags: raise TranslationError("nested begin tag for tag '%s'" % tag) @@ -616,9 +613,6 @@ class F2LoopyTranslator(FTreeWalkerBase): "end tag without begin tag for tag '%s'" % tag) self.instruction_tags.remove(tag) - elif self.in_transform_code: - self.transform_code_lines.append(node.content) - elif faulty_loopy_pragma_match is not None: from warnings import warn warn("The comment line '%s' was not recognized as a loopy directive" @@ -628,18 +622,8 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} - def make_kernels(self, pre_transform_code=None, pre_transform_code_context=None): - kernel_names = [ - sub.subprogram_name - for sub in self.kernels] - - if pre_transform_code_context is None: - proc_dict = {} - else: - proc_dict = pre_transform_code_context.copy() - - proc_dict["lp"] = lp - proc_dict["np"] = np + def make_kernels(self): + result = [] for sub in self.kernels: # {{{ figure out arguments @@ -685,28 +669,17 @@ class F2LoopyTranslator(FTreeWalkerBase): sub.instructions, kernel_data, name=sub.subprogram_name, - default_order="F" + default_order="F", + index_dtype=self.index_dtype, ) from loopy.loop import fuse_loop_domains knl = fuse_loop_domains(knl) + knl = lp.fold_constants(knl) - proc_dict[sub.subprogram_name] = lp.fold_constants(knl) - - transform_code = remove_common_indentation( - self.transform_code_lines) - - if pre_transform_code is not None: - proc_dict["_MODULE_SOURCE_CODE"] = pre_transform_code - exec(compile(pre_transform_code, - "<loopy transforms>", "exec"), proc_dict) - - proc_dict["_MODULE_SOURCE_CODE"] = transform_code - exec(compile(transform_code, - "<loopy transforms>", "exec"), proc_dict) + result.append(knl) - return [proc_dict[knl_name] - for knl_name in kernel_names] + return result # }}} diff --git a/loopy/fusion.py b/loopy/fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..4431c2c7f56009b8e205cb310060b96cfa210eae --- /dev/null +++ b/loopy/fusion.py @@ -0,0 +1,291 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six + +import islpy as isl +from islpy import dim_type + +from loopy.diagnostic import LoopyError +from pymbolic import var + + +def _find_fusable_loop_domain_index(domain, other_domains): + my_inames = set(domain.get_var_dict(dim_type.set)) + + overlap_domains = [] + for i, o_dom in enumerate(other_domains): + o_inames = set(o_dom.get_var_dict(dim_type.set)) + if my_inames & o_inames: + overlap_domains.append(i) + + if len(overlap_domains) >= 2: + raise LoopyError("more than one domain in one kernel has " + "overlapping inames with a " + "domain of the other kernel, cannot fuse: '%s'" + % domain) + + if len(overlap_domains) == 1: + return overlap_domains[0] + else: + return None + + +# {{{ generic merge helpers + +def _ordered_merge_lists(list_a, list_b): + result = list_a[:] + for item in list_b: + if item not in list_b: + result.append(item) + + return result + + +def _merge_dicts(item_name, dict_a, dict_b): + result = dict_a.copy() + + for k, v in six.iteritems(dict_b): + if k in result: + if v != result[k]: + raise LoopyError("inconsistent %ss for key '%s' in merge: %s and %s" + % (item_name, k, v, result[k])) + + else: + result[k] = v + + return result + + +def _merge_values(item_name, val_a, val_b): + if val_a != val_b: + raise LoopyError("inconsistent %ss in merge: %s and %s" + % (item_name, val_a, val_b)) + + return val_a + +# }}} + + +# {{{ two-kernel fusion + +def _fuse_two_kernels(knla, knlb): + from loopy.kernel import kernel_state + if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL: + raise LoopyError("can only fuse kernels in INITIAL state") + + # {{{ fuse domains + + new_domains = knla.domains[:] + + for dom_b in knlb.domains: + i_fuse = _find_fusable_loop_domain_index(dom_b, new_domains) + if i_fuse is None: + new_domains.append(dom_b) + else: + dom_a = new_domains[i_fuse] + dom_a, dom_b = isl.align_two(dom_a, dom_b) + + shared_inames = list( + set(dom_a.get_var_dict(dim_type.set)) + & + set(dom_b.get_var_dict(dim_type.set))) + + dom_a_s = dom_a.project_out_except(shared_inames, [dim_type.set]) + dom_b_s = dom_a.project_out_except(shared_inames, [dim_type.set]) + + if not (dom_a_s <= dom_b_s and dom_b_s <= dom_a_s): + raise LoopyError("kernels do not agree on domain of " + "inames '%s'" % (",".join(shared_inames))) + + new_domain = dom_a & dom_b + + new_domains[i_fuse] = new_domain + + # }}} + + vng = knla.get_var_name_generator() + b_var_renames = {} + + # {{{ fuse args + + new_args = knla.args[:] + for b_arg in knlb.args: + if b_arg.name not in knla.arg_dict: + new_arg_name = vng(b_arg.name) + + if new_arg_name != b_arg.name: + b_var_renames[b_arg.name] = var(new_arg_name) + + new_args.append(b_arg.copy(name=new_arg_name)) + else: + if b_arg != knla.arg_dict[b_arg.name]: + raise LoopyError( + "argument '%s' has inconsistent definition between " + "the two kernels being merged" % b_arg.name) + + # }}} + + # {{{ fuse temporaries + + new_temporaries = knla.temporary_variables.copy() + for b_name, b_tv in six.iteritems(knlb.temporary_variables): + assert b_name == b_tv.name + + new_tv_name = vng(b_name) + + if new_tv_name != b_name: + b_var_renames[b_name] = var(new_tv_name) + + assert new_tv_name not in new_temporaries + new_temporaries[new_tv_name] = b_tv.copy(name=new_tv_name) + + # }}} + + # {{{ apply renames in kernel b + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, + RuleAwareSubstitutionMapper) + from pymbolic.mapper.substitutor import make_subst_func + + srmc = SubstitutionRuleMappingContext( + knlb.substitutions, knlb.get_var_name_generator()) + subst_map = RuleAwareSubstitutionMapper( + srmc, make_subst_func(b_var_renames), within=lambda stack: True) + knlb = subst_map.map_kernel(knlb) + + # }}} + + # {{{ fuse instructions + + new_instructions = knla.instructions[:] + from pytools import UniqueNameGenerator + insn_id_gen = UniqueNameGenerator( + set([insna.id for insna in new_instructions])) + + knl_b_instructions = [] + old_b_id_to_new_b_id = {} + for insnb in knlb.instructions: + old_id = insnb.id + new_id = insn_id_gen(old_id) + old_b_id_to_new_b_id[old_id] = new_id + + knl_b_instructions.append( + insnb.copy(id=new_id)) + + for insnb in knl_b_instructions: + new_instructions.append( + insnb.copy( + insn_deps=frozenset( + old_b_id_to_new_b_id[dep_id] + for dep_id in insnb.insn_deps))) + + # }}} + + # {{{ fuse assumptions + + assump_a = knla.assumptions + assump_b = knlb.assumptions + assump_a, assump_b = isl.align_two(assump_a, assump_b) + + shared_param_names = list( + set(dom_a.get_var_dict(dim_type.set)) + & + set(dom_b.get_var_dict(dim_type.set))) + + assump_a_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) + assump_b_s = assump_a.project_out_except(shared_param_names, [dim_type.param]) + + if not (assump_a_s <= assump_b_s and assump_b_s <= assump_a_s): + raise LoopyError("assumptions do not agree on kernels to be merged") + + new_assumptions = (assump_a & assump_b).params() + + # }}} + + from loopy.kernel import LoopKernel + return LoopKernel( + domains=new_domains, + instructions=new_instructions, + args=new_args, + name="%s_and_%s" % (knla.name, knlb.name), + preambles=_ordered_merge_lists(knla.preambles, knlb.preambles), + preamble_generators=_ordered_merge_lists( + knla.preamble_generators, knlb.preamble_generators), + assumptions=new_assumptions, + local_sizes=_merge_dicts( + "local size", knla.local_sizes, knlb.local_sizes), + temporary_variables=new_temporaries, + iname_to_tag=_merge_dicts( + "iname-to-tag mapping", + knla.iname_to_tag, + knlb.iname_to_tag), + substitutions=_merge_dicts( + "substitution", + knla.substitutions, + knlb.substitutions), + function_manglers=_ordered_merge_lists( + knla.function_manglers, + knlb.function_manglers), + symbol_manglers=_ordered_merge_lists( + knla.symbol_manglers, + knlb.symbol_manglers), + + iname_slab_increments=_merge_dicts( + "iname slab increment", + knla.iname_slab_increments, + knlb.iname_slab_increments), + loop_priority=_ordered_merge_lists( + knla.loop_priority, + knlb.loop_priority), + silenced_warnings=_ordered_merge_lists( + knla.silenced_warnings, + knlb.silenced_warnings), + applied_iname_rewrites=_ordered_merge_lists( + knla.applied_iname_rewrites, + knlb.applied_iname_rewrites), + index_dtype=_merge_values( + "index dtype", + knla.index_dtype, + knlb.index_dtype), + target=_merge_values( + "target", + knla.target, + knlb.target), + options=knla.options) + +# }}} + + +def fuse_kernels(kernels): + kernels = list(kernels) + result = kernels.pop(0) + while kernels: + result = _fuse_two_kernels(result, kernels.pop(0)) + + return result + +# vim: foldmethod=marker diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..12fd0354267e86707cc91de0027319e3584a08f0 --- /dev/null +++ b/loopy/ipython_ext.py @@ -0,0 +1,28 @@ +from __future__ import division + +from IPython.core.magic import (magics_class, Magics, cell_magic) + +import loopy as lp + + +@magics_class +class LoopyMagics(Magics): + @cell_magic + def fortran_kernel(self, line, cell): + result = lp.parse_fortran(cell.encode()) + + for knl in result: + self.shell.user_ns[knl.name] = knl + + @cell_magic + def transformed_fortran_kernel(self, line, cell): + result = lp.parse_transformed_fortran( + cell.encode(), + transform_code_context=self.shell.user_ns) + + for knl in result: + self.shell.user_ns[knl.name] = knl + + +def load_ipython_extension(ip): + ip.register_magics(LoopyMagics) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index b876513f754209f7ff498f4b9243a9ac54cce7f8..5c52988926e6714a83a620cfdad1e760fb25c4f2 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -137,8 +137,7 @@ def iname_rel_aff(space, iname, rel, aff): raise ValueError("unknown value of 'rel': %s" % rel) -def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context, - prefer_constants): +def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context): if context is not None: context = isl.align_spaces(context, pw_aff.get_domain_space(), obj_bigger_ok=True).params() @@ -163,21 +162,12 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context, .is_bounded()) # put constant bounds with unbounded validity first - # FIXME: Heuristi-hack. - if prefer_constants: - order = [ - (True, False), # constant, unbounded validity - (False, False), # nonconstant, unbounded validity - (True, True), # constant, bounded validity - (False, True), # nonconstant, bounded validity - ] - else: - order = [ - (False, False), # nonconstant, unbounded validity - (True, False), # constant, unbounded validity - (False, True), # nonconstant, bounded validity - (True, True), # constant, bounded validity - ] + order = [ + (True, False), # constant, unbounded validity + (False, False), # nonconstant, unbounded validity + (True, True), # constant, bounded validity + (False, True), # nonconstant, bounded validity + ] pieces = flatten([ [(set, aff) for set, aff in pieces @@ -209,22 +199,19 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context, % (what, pw_aff)) -def static_min_of_pw_aff(pw_aff, constants_only, context=None, - prefer_constants=True): +def static_min_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.ge_set, - "minimum", context, prefer_constants) + "minimum", context) -def static_max_of_pw_aff(pw_aff, constants_only, context=None, - prefer_constants=True): +def static_max_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.le_set, - "maximum", context, prefer_constants) + "maximum", context) -def static_value_of_pw_aff(pw_aff, constants_only, context=None, - prefer_constants=True): +def static_value_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set, - "value", context, prefer_constants) + "value", context) def duplicate_axes(isl_obj, duplicate_inames, new_inames): @@ -337,7 +324,7 @@ def boxify(cache_manager, domain, box_inames, context): domain = domain.move_dims( dim_type.param, n_old_parameters, dim_type.set, 0, n_nonbox_inames) - result = domain.universe_like() + result = domain.universe(domain.space) zero = isl.Aff.zero_on_domain(result.space) for i in range(len(box_iname_indices)): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 14562b384a89f1455a3a29ac161682889f1d3e01..1f64e59e6c9e8ab6d599ea0cc61937df5a60ffcb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1,10 +1,6 @@ """Kernel object.""" -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -28,6 +24,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six +from six.moves import range, zip import numpy as np from pytools import RecordWithoutPickling, Record, memoize_method diff --git a/loopy/statistics.py b/loopy/statistics.py index 9ba6bd3972985270f674e09e55ee2b439888d361..24ce751ff5bd79308ac54f7c00875293eed0aa4e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2015 James Stevens" @@ -24,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six + import loopy as lp import warnings from islpy import dim_type @@ -35,17 +35,20 @@ class TypeToOpCountMap: def __init__(self, init_dict=None): if init_dict is None: - self.dict = {} - else: - self.dict = init_dict + init_dict = {} + + self.dict = init_dict def __add__(self, other): - return TypeToOpCountMap(dict(self.dict.items() + other.dict.items() - + [(k, self.dict[k] + other.dict[k]) - for k in set(self.dict) & set(other.dict)])) + result = self.dict.copy() + + for k, v in six.iteritems(other.dict): + result[k] = self.dict.get(k, 0) + v + + return TypeToOpCountMap(result) def __radd__(self, other): - if (other != 0): + if other != 0: raise ValueError("TypeToOpCountMap: Attempted to add TypeToOpCountMap " "to {} {}. TypeToOpCountMap may only be added to " "0 and other TypeToOpCountMap objects." @@ -81,7 +84,7 @@ class ExpressionOpCounter(CombineMapper): def __init__(self, knl): self.knl = knl - from loopy.codegen.expression import TypeInferenceMapper + from loopy.expression import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) def combine(self, values): @@ -263,6 +266,36 @@ localid 0 is threadidx.x ''' +def count(kernel, bset): + try: + return bset.card() + except AttributeError: + pass + + if not bset.is_box(): + from loopy.diagnostic import warn + warn(kernel, "count_overestimate", + "Barvinok wrappers are not installed. " + "Counting routines may overestimate the " + "number of integer points in your loop " + "domain.") + + result = None + + for i in range(bset.dim(isl.dim_type.set)): + dmax = bset.dim_max(i) + dmin = bset.dim_min(i) + + length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + 1) + + if result is None: + result = length + else: + result = result * length + + return result + + # to evaluate poly: poly.eval_with_dict(dictionary) def get_op_poly(knl): from loopy.preprocess import preprocess_kernel, infer_unknown_types @@ -278,7 +311,7 @@ def get_op_poly(knl): inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) ops = op_counter(insn.expression) - op_poly = op_poly + ops*domain.card() + op_poly = op_poly + ops*count(knl, domain) return op_poly @@ -290,6 +323,5 @@ def get_DRAM_access_poly(knl): # for now just counting subscripts insn_inames = knl.insn_inames(insn) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except(insn_inames, [dim_type.set])) - poly += subscript_counter(insn.expression) * domain.card() + poly += subscript_counter(insn.expression) * count(knl, domain) return poly - diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py index e68040c0983d5b6f144ae2b42eaa41899ef56b76..e4533b86dd24a8dca973ac9c8ffd022a4bed204b 100644 --- a/loopy/target/opencl/__init__.py +++ b/loopy/target/opencl/__init__.py @@ -214,10 +214,9 @@ class OpenCLTarget(CTarget): @memoize_method def get_dtype_registry(self): - from loopy.target.c.compyte import ( - DTypeRegistry, fill_with_registry_with_c_types) + from loopy.target.c.compyte.dtypes import DTypeRegistry, fill_with_registry_with_c_types result = DTypeRegistry() - fill_with_registry_with_c_types(result) + fill_with_registry_with_c_types(result, respect_windows=False) # complex number support left out diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl/__init__.py index a0a119dce42e1095ff327f38f6b836f80091b159..174506cd65a81b405053c212c9683f9dd2df2cc1 100644 --- a/loopy/target/pyopencl/__init__.py +++ b/loopy/target/pyopencl/__init__.py @@ -125,16 +125,16 @@ def adjust_local_temp_var_storage(kernel, device): def check_sizes(kernel, device): import loopy as lp + from loopy.diagnostic import LoopyAdvisory, LoopyError + if device is None: from loopy.diagnostic import warn warn(kernel, "no_device_in_pre_codegen_checks", "No device parameter was passed to the PyOpenCLTarget. " "Perhaps you want to pass a device to benefit from " - "additional checking.") + "additional checking.", LoopyAdvisory) return - from loopy.diagnostic import LoopyAdvisory, LoopyError - parameters = {} for arg in kernel.args: if isinstance(arg, lp.ValueArg) and arg.approximately is not None: diff --git a/loopy/tools.py b/loopy/tools.py index 1f47160677c5255539a3a6c098c9ba6d8b89a18b..b3c610dcbcf5e23a1de5746db78d5fedc9d171ed 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -164,14 +164,14 @@ class PicklableDtype(object): # {{{ remove common indentation -def remove_common_indentation(code): +def remove_common_indentation(code, require_leading_newline=True): if "\n" not in code: return code # accommodate pyopencl-ish syntax highlighting code = code.lstrip("//CL//") - if not code.startswith("\n"): + if require_leading_newline and not code.startswith("\n"): return code lines = code.split("\n") diff --git a/loopy/version.py b/loopy/version.py index 8f68a4958da821981a6922bd14335a2bc487473a..e7d877aa136ef4ad15774388ba9c9cb37b828ed1 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -25,4 +25,4 @@ VERSION = (2014, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -DATA_MODEL_VERSION = "v5" +DATA_MODEL_VERSION = "v6" diff --git a/test/test_fortran.py b/test/test_fortran.py index d51411a6e65ba750ddf8cfc2ee844c4bc167ab82..d361b15dc3d5f280bd9a4dadd4dbd603d4470284 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -51,23 +51,27 @@ def test_fill(ctx_factory): implicit none real*8 a, out(n) - integer n + integer n, i do i = 1, n out(i) = a end do end - !$loopy begin transform + !$loopy begin ! - ! fill = lp.split_iname(fill, "i", 128, + ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") + ! RESULT = [fill] ! - !$loopy end transform + !$loopy end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_transformed_fortran(fortran_src, + pre_transform_code="split_amount = 128") + + assert "i_inner" in knl.all_inames() ctx = ctx_factory() @@ -80,7 +84,7 @@ def test_fill_const(ctx_factory): implicit none real*8 a, out(n) - integer n + integer n, i do i = 1, n out(i) = 3.45 @@ -88,8 +92,7 @@ def test_fill_const(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -102,7 +105,7 @@ def test_asterisk_in_shape(ctx_factory): implicit none real*8 a, out(n), out2(n), inp(*) - integer n + integer n, i do i = 1, n a = inp(n) @@ -112,8 +115,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -127,7 +129,7 @@ def test_temporary_to_subst(ctx_factory): implicit none real*8 a, out(n), out2(n), inp(n) - integer n + integer n, i do i = 1, n a = inp(i) @@ -137,8 +139,7 @@ def test_temporary_to_subst(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -154,7 +155,7 @@ def test_temporary_to_subst_two_defs(ctx_factory): implicit none real*8 a, out(n), out2(n), inp(n) - integer n + integer n, i do i = 1, n a = inp(i) @@ -165,8 +166,7 @@ def test_temporary_to_subst_two_defs(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -182,7 +182,7 @@ def test_temporary_to_subst_indices(ctx_factory): implicit none real*8 a(n), out(n), out2(n), inp(n) - integer n + integer n, i do i = 1, n a(i) = 6*inp(i) @@ -194,8 +194,7 @@ def test_temporary_to_subst_indices(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) @@ -215,7 +214,7 @@ def test_if(ctx_factory): implicit none real*8 a, b, out(n), out2(n), inp(n) - integer n + integer n, i, j do i = 1, n a = inp(i) @@ -232,8 +231,7 @@ def test_if(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -249,7 +247,7 @@ def test_tagged(ctx_factory): implicit none real*8 a, b, r, out(n), out2(n), inp(n), inp2(n) real*8 alpha - integer n + integer n, i do i = 1, n !$loopy begin tagged: input @@ -267,8 +265,7 @@ def test_tagged(ctx_factory): end """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "*$input")) == 2 @@ -294,8 +291,7 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) assert len(knl.domains) == 1 @@ -321,12 +317,6 @@ def test_matmul(ctx_factory, buffer_inames): ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) - # # FIXME: Make r/w tests possible, reactivate the above - # knl = lp.preprocess_kernel(knl) - # for k in lp.generate_loop_schedules(knl): - # code, _ = lp.generate_code(k) - # print(code) - @pytest.mark.xfail def test_batched_sparse(): @@ -339,6 +329,7 @@ def test_batched_sparse(): real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs) integer m, n, rowstart, rowend, length, nvals, nvecs + integer i, j, k do i = 1, m rowstart = rowstarts(i) @@ -362,8 +353,7 @@ def test_batched_sparse(): """ - from loopy.frontend.fortran import f2loopy - knl, = f2loopy(fortran_src) + knl, = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -373,6 +363,92 @@ def test_batched_sparse(): knl = lp.fix_parameters(knl, nvecs=4) +def test_fuse_kernels(ctx_factory): + fortran_template = """ + subroutine {name}(nelements, ndofs, result, d, q) + implicit none + integer e, i, j, k + integer nelements, ndofs + real*8 result(nelements, ndofs, ndofs) + real*8 q(nelements, ndofs, ndofs) + real*8 d(ndofs, ndofs) + real*8 prev + + do e = 1,nelements + do i = 1,ndofs + do j = 1,ndofs + do k = 1,ndofs + {inner} + end do + end do + end do + end do + end subroutine + """ + + xd_line = """ + prev = result(e,i,j) + result(e,i,j) = prev + d(i,k)*q(e,i,k) + """ + yd_line = """ + prev = result(e,i,j) + result(e,i,j) = prev + d(i,k)*q(e,k,j) + """ + + xderiv, = lp.parse_fortran( + fortran_template.format(inner=xd_line, name="xderiv")) + yderiv, = lp.parse_fortran( + fortran_template.format(inner=yd_line, name="yderiv")) + xyderiv, = lp.parse_fortran( + fortran_template.format( + inner=(xd_line + "\n" + yd_line), name="xyderiv")) + + knl = lp.fuse_kernels((xderiv, yderiv)) + knl = lp.set_loop_priority(knl, "e,i,j,k") + + assert len(knl.temporary_variables) == 2 + + ctx = ctx_factory() + lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) + + +def test_parse_and_fuse_two_kernels(): + fortran_src = """ + subroutine fill(out, a, n) + implicit none + + real*8 a, out(n) + integer n, i + + do i = 1, n + out(i) = a + end do + end + + subroutine twice(out, n) + implicit none + + real*8 out(n) + integer n, i + + do i = 1, n + out(i) = 2*out(i) + end do + end + + !$loopy begin + ! + ! fill, twice = lp.parse_fortran(SOURCE) + ! knl = lp.fuse_kernels((fill, twice)) + ! print(knl) + ! RESULT = [knl] + ! + !$loopy end + """ + + knl, = lp.parse_transformed_fortran(fortran_src) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index e68700833b4a4b41bef5051b030aed92206622c6..bfcd199f17dd4fa9c176145ebc4f84d1a848b987 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, print_function __copyright__ = "Copyright (C) 2015 James Stevens" @@ -23,14 +23,14 @@ THE SOFTWARE. """ import sys -from pyopencl.tools import ( +from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) from loopy.statistics import * # noqa import numpy as np -def test_op_counter_basic(ctx_factory): +def test_op_counter_basic(): knl = lp.make_kernel( "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -56,7 +56,7 @@ def test_op_counter_basic(ctx_factory): assert i32 == n*m -def test_op_counter_reduction(ctx_factory): +def test_op_counter_reduction(): knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", @@ -74,10 +74,10 @@ def test_op_counter_reduction(ctx_factory): assert f32 == 2*n*m*l -def test_op_counter_logic(ctx_factory): +def test_op_counter_logic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) @@ -98,10 +98,10 @@ def test_op_counter_logic(ctx_factory): assert i32 == n*m -def test_op_counter_specialops(ctx_factory): +def test_op_counter_specialops(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0) @@ -124,10 +124,10 @@ def test_op_counter_specialops(ctx_factory): assert i32 == n*m -def test_op_counter_bitwise(ctx_factory): +def test_op_counter_bitwise(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", + "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [ """ c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1) @@ -143,10 +143,43 @@ def test_op_counter_bitwise(ctx_factory): m = 256 l = 128 i32 = poly.dict[np.dtype(np.int32)].eval_with_dict({'n': n, 'm': m, 'l': l}) + print(poly.dict[np.dtype(np.int32)]) not_there = poly[np.dtype(np.float64)].eval_with_dict({'n': n, 'm': m, 'l': l}) assert i32 == 3*n*m+n*m*l assert not_there == 0 + +def test_op_counter_triangular_domain(): + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<m and i<j}", + """ + a[i, j] = b[i,j] * 2 + """, + name="bitwise", assumptions="n,m >= 1") + + knl = lp.add_and_infer_dtypes(knl, + dict(b=np.float64)) + + expect_fallback = False + import islpy as isl + try: + isl.BasicSet.card + except AttributeError: + expect_fallback = True + else: + expect_fallback = False + + poly = get_op_poly(knl)[np.dtype(np.float64)] + value_dict = dict(m=13, n=200) + flops = poly.eval_with_dict(value_dict) + + if expect_fallback: + assert flops == 144 + else: + assert flops == 78 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])