diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 226868317b71474b9e1866a4c0ae5df82080f9dc..e566365f8d520d33f518bc95cbaa43e5af6e747c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -22,8 +22,12 @@ import a few modules and set up a :class:`pyopencl.Context` and a >>> import pyopencl as cl >>> import pyopencl.array >>> import pyopencl.clrandom + >>> import loopy as lp - >>> from pytools import StderrToStdout as IncludeWarningsInDoctest + >>> lp.set_caching_enabled(False) + + >>> from warnings import filterwarnings, catch_warnings + >>> filterwarnings('error', category=lp.LoopyWarning) >>> ctx = cl.create_some_context(interactive=False) >>> queue = cl.CommandQueue(ctx) @@ -97,9 +101,9 @@ always see loopy's view of a kernel by printing it. KERNEL: loopy_kernel --------------------------------------------------------------------------- ARGUMENTS: - a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1) + a: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) n: ValueArg, type: <runtime> - out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (stride:1) + out: GlobalArg, type: <runtime>, shape: (n), dim_tags: (N0:stride:1) --------------------------------------------------------------------------- DOMAINS: [n] -> { [i] : i >= 0 and i <= -1 + n } @@ -168,14 +172,13 @@ by passing :attr:`loopy.Options.write_cl`. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; } @@ -246,14 +249,13 @@ call :func:`loopy.generate_code`: >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl) >>> code, _ = lp.generate_code(typed_knl) >>> print code - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; } @@ -361,16 +363,16 @@ Let us take a look at the generated code for the above kernel: .. doctest:: >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_loop_priority(knl, "i,j") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) + for (int i = 0; i <= -1 + n; ++i) + for (int j = 0; j <= -1 + n; ++j) { out[n * j + i] = a[n * i + j]; out[n * i + j] = 2.0f * out[n * i + j]; @@ -382,10 +384,8 @@ still not right: .. doctest:: - >>> assert (out.get() == a_mat_dev.get().T*2).all() - Traceback (most recent call last): - ... - AssertionError + >>> print((out.get() == a_mat_dev.get().T*2).all()) + False For the kernel to perform the desired computation, *all instances* (loop iterations) of the first instruction need to be completed, @@ -406,6 +406,7 @@ with identical bounds, for the use of the transpose: ... out[j,i] = a[i,j] {id=transpose} ... out[ii,jj] = 2*out[ii,jj] {dep=transpose} ... """) + >>> knl = lp.set_loop_priority(knl, "i,j,ii,jj") :func:`loopy.duplicate_inames` can be used to achieve the same goal. Now the intended code is generated and our test passes. @@ -414,18 +415,17 @@ Now the intended code is generated and our test passes. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *restrict a, int const n, __global float *restrict out) { <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) + for (int i = 0; i <= -1 + n; ++i) + for (int j = 0; j <= -1 + n; ++j) out[n * j + i] = a[n * i + j]; - for (int ii = 0; ii <= (-1 + n); ++ii) - for (int jj = 0; jj <= (-1 + n); ++jj) + for (int ii = 0; ii <= -1 + n; ++ii) + for (int jj = 0; jj <= -1 + n; ++jj) out[n * ii + jj] = 2.0f * out[n * ii + jj]; } >>> assert (out.get() == a_mat_dev.get().T*2).all() @@ -443,6 +443,10 @@ control is the nesting of loops. For example, should the *i* loop be nested around the *j* loop, or the other way around, in the following simple zero-fill kernel? +It turns out that Loopy will typically choose a loop nesting for us, but it +does not like doing so. In this tutorial (where we've turned Loopy's warnings +into errors), we are told what is wrong in no uncertain terms:: + .. doctest:: >>> knl = lp.make_kernel( @@ -453,18 +457,9 @@ zero-fill kernel? >>> knl = lp.set_options(knl, "write_cl") - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> - ... - for (int i = 0; i <= (-1 + n); ++i) - for (int j = 0; j <= (-1 + n); ++j) - a[n * i + j] = 0.0f; + >>> evt, (out,) = knl(queue, a=a_mat_dev) + Traceback (most recent call last): ... - -Loopy has chosen a loop nesting for us, but it did not like doing so, as it -also issued the following warning:: - LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring This is easily resolved: @@ -481,12 +476,11 @@ ambiguous. .. doctest:: - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + >>> evt, (out,) = knl(queue, a=a_mat_dev) + #define lid(N) ((int) get_local_id(N)) ... - for (int j = 0; j <= (-1 + n); ++j) - for (int i = 0; i <= (-1 + n); ++i) + for (int j = 0; j <= -1 + n; ++j) + for (int i = 0; i <= -1 + n; ++i) a[n * i + j] = 0.0f; ... @@ -536,15 +530,16 @@ Consider this example: >>> knl = lp.make_kernel( ... "{ [i]: 0<=i<n }", - ... "a[i] = 0", assumptions="n>=0") + ... "a[i] = 0", assumptions="n>=1") >>> knl = lp.split_iname(knl, "i", 16) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) - if ((-1 + -1 * i_inner + -16 * i_outer + n) >= 0) + if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) a[i_inner + i_outer * 16] = 0.0f; ... @@ -567,12 +562,11 @@ relation to loop nesting. For example, it's perfectly possible to request >>> knl = lp.set_loop_priority(knl, "i_inner,i_outer") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... for (int i_inner = 0; i_inner <= 15; ++i_inner) - if ((-1 + n) >= 0) - for (int i_outer = 0; i_outer <= (-1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16)); ++i_outer) - a[i_inner + i_outer * 16] = 0.0f; + for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) + a[i_inner + i_outer * 16] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -593,10 +587,10 @@ commonly called 'loop tiling': >>> knl = lp.set_loop_priority(knl, "i_outer,j_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((15 + n) / 16)); ++i_outer) - for (int j_outer = 0; j_outer <= (-1 + ((15 + n) / 16)); ++j_outer) + for (int i_outer = 0; i_outer <= ((-16 + n) / 16); ++i_outer) + for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) for (int j_inner = 0; j_inner <= 15; ++j_inner) out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16]; @@ -632,11 +626,13 @@ loop's tag to ``"unr"``: >>> orig_knl = knl >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define int_floor_div_pos_b(a,b) ( ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) ) + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; a[1 + i_outer * 4] = 0.0f; @@ -708,12 +704,12 @@ Let's try this out on our vector fill kernel by creating workgroups of size ... outer_tag="g.0", inner_tag="l.0") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) { <BLANKLINE> - if ((-1 + -128 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[lid(0) + gid(0) * 128] = 0.0f; } @@ -752,18 +748,19 @@ assumption: >>> orig_knl = knl >>> knl = lp.split_iname(knl, "i", 4) >>> knl = lp.tag_inames(knl, dict(i_inner="unr")) + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - for (int i_outer = 0; i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; - if ((-2 + -4 * i_outer + n) >= 0) + if (-2 + -4 * i_outer + n >= 0) a[1 + i_outer * 4] = 0.0f; - if ((-3 + -4 * i_outer + n) >= 0) + if (-3 + -4 * i_outer + n >= 0) a[2 + i_outer * 4] = 0.0f; - if ((-4 + -4 * i_outer + n) >= 0) + if (-4 + -4 * i_outer + n >= 0) a[3 + i_outer * 4] = 0.0f; } ... @@ -781,11 +778,12 @@ enabling some cost savings: >>> knl = orig_knl >>> knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr") >>> knl = lp.set_options(knl, "write_cl") + >>> knl = lp.set_loop_priority(knl, "i_outer,i_inner") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... /* bulk slab for 'i_outer' */ - for (int i_outer = 0; i_outer <= (-2 + ((3 + n) / 4)); ++i_outer) + for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer) { a[0 + i_outer * 4] = 0.0f; a[1 + i_outer * 4] = 0.0f; @@ -793,15 +791,15 @@ enabling some cost savings: a[3 + i_outer * 4] = 0.0f; } /* final slab for 'i_outer' */ - for (int i_outer = (-1 + n + -1 * (3 * n / 4)); i_outer <= (-1 + ((3 + n) / 4)); ++i_outer) - if ((-1 + n) >= 0) + for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) + if (-1 + n >= 0) { a[0 + i_outer * 4] = 0.0f; - if ((-2 + -4 * i_outer + n) >= 0) + if (-2 + -4 * i_outer + n >= 0) a[1 + i_outer * 4] = 0.0f; - if ((-3 + -4 * i_outer + n) >= 0) + if (-3 + -4 * i_outer + n >= 0) a[2 + i_outer * 4] = 0.0f; - if ((4 + 4 * i_outer + -1 * n) == 0) + if (4 + 4 * i_outer + -1 * n == 0) a[3 + i_outer * 4] = 0.0f; } ... @@ -871,12 +869,12 @@ memory, local to each work item. >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... { float a_temp; <BLANKLINE> - for (int i = 0; i <= (-1 + n); ++i) + for (int i = 0; i <= -1 + n; ++i) { a_temp = sin(a[i]); out2[i] = sqrt(1.0f + -1.0f * a_temp * a_temp); @@ -929,19 +927,19 @@ Consider the following example: >>> knl = lp.tag_inames(knl, dict(i_outer="g.0", i_inner="l.0")) >>> knl = lp.set_options(knl, "write_cl") >>> evt, (out,) = knl(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... { __local float a_temp[16]; float acc_k; <BLANKLINE> - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { a_temp[lid(0)] = a[16 * gid(0) + lid(0)]; acc_k = 0.0f; } barrier(CLK_LOCAL_MEM_FENCE) /* for a_temp (insn_0_k_update depends on insn) */; - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_temp[k]; @@ -992,7 +990,7 @@ transformation exists in :func:`loopy.add_prefetch`: >>> knl = lp.set_options(knl, "write_cl") >>> knl_pf = lp.add_prefetch(knl, "a") >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... a_fetch_0 = a[16 * gid(0) + lid(0)]; for (int k = 0; k <= 15; ++k) @@ -1015,12 +1013,12 @@ earlier: >>> knl_pf = lp.add_prefetch(knl, "a", ["i_inner"]) >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) - <BLANKLINE> + #define lid(N) ((int) get_local_id(N)) ... - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)]; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn_k_update depends on a_fetch) */; - if ((-1 + -16 * gid(0) + -1 * lid(0) + n) >= 0) + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch_0[lid(0)]; @@ -1145,18 +1143,20 @@ sign that something is amiss: .. doctest:: - >>> with IncludeWarningsInDoctest(): - ... evt, (out,) = knl(queue, a=a_mat_dev) - /...: WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable) - warn(text, type) + >>> evt, (out,) = knl(queue, a=a_mat_dev) + Traceback (most recent call last): + ... + WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable) When we ask to see the code, the issue becomes apparent: .. doctest:: >>> knl = lp.set_options(knl, "write_cl") - >>> evt, (out,) = knl(queue, a=a_mat_dev) - <BLANKLINE> + >>> from warnings import catch_warnings + >>> with catch_warnings(): + ... filterwarnings("always", category=lp.LoopyWarning) + ... evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) <BLANKLINE>