diff --git a/doc/tutorial.rst b/doc/tutorial.rst index b7be43f7e283bc464c88a6cdb49141964f9a8f1c..361cd3acb6de1754c2c841432dd23f44a6c7fa00 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -534,7 +534,7 @@ Consider this example: for (int i_outer = 0; i_outer <= -1 + ((15 + n) / 16); ++i_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) if (-1 + -1 * i_inner + -16 * i_outer + n >= 0) - a[i_inner + i_outer * 16] = 0.0f; + a[16 * i_outer + i_inner] = 0.0f; ... By default, the new, split inames are named *OLD_outer* and *OLD_inner*, @@ -561,7 +561,7 @@ relation to loop nesting. For example, it's perfectly possible to request for (int i_inner = 0; i_inner <= 15; ++i_inner) if (-1 + -1 * i_inner + n >= 0) for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer) - a[i_inner + i_outer * 16] = 0.0f; + a[16 * i_outer + i_inner] = 0.0f; ... Notice how loopy has automatically generated guard conditionals to make @@ -588,7 +588,7 @@ commonly called 'loop tiling': for (int j_outer = 0; j_outer <= ((-16 + n) / 16); ++j_outer) for (int i_inner = 0; i_inner <= 15; ++i_inner) for (int j_inner = 0; j_inner <= 15; ++j_inner) - out[n * (i_inner + i_outer * 16) + j_inner + j_outer * 16] = a[n * (j_inner + j_outer * 16) + i_inner + i_outer * 16]; + out[n * (16 * i_outer + i_inner) + 16 * j_outer + j_inner] = a[n * (16 * j_outer + j_inner) + 16 * i_outer + i_inner]; ... .. }}} @@ -630,10 +630,10 @@ loop's tag to ``"unr"``: ... for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer) { - a[0 + i_outer * 4] = 0.0f; - a[1 + i_outer * 4] = 0.0f; - a[2 + i_outer * 4] = 0.0f; - a[3 + i_outer * 4] = 0.0f; + a[4 * i_outer + 0] = 0.0f; + a[4 * i_outer + 1] = 0.0f; + a[4 * i_outer + 2] = 0.0f; + a[4 * i_outer + 3] = 0.0f; } ... @@ -705,7 +705,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *restrict a, int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) - a[lid(0) + gid(0) * 128] = 0.0f; + a[128 * gid(0) + lid(0)] = 0.0f; } Loopy requires that workgroup sizes are fixed and constant at compile time. @@ -750,13 +750,13 @@ assumption: ... for (int i_outer = 0; i_outer <= -1 + ((3 + n) / 4); ++i_outer) { - a[0 + i_outer * 4] = 0.0f; + a[4 * i_outer + 0] = 0.0f; if (-2 + -4 * i_outer + n >= 0) - a[1 + i_outer * 4] = 0.0f; + a[4 * i_outer + 1] = 0.0f; if (-3 + -4 * i_outer + n >= 0) - a[2 + i_outer * 4] = 0.0f; + a[4 * i_outer + 2] = 0.0f; if (-4 + -4 * i_outer + n >= 0) - a[3 + i_outer * 4] = 0.0f; + a[4 * i_outer + 3] = 0.0f; } ... @@ -780,22 +780,22 @@ enabling some cost savings: /* bulk slab for 'i_outer' */ for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer) { - a[0 + i_outer * 4] = 0.0f; - a[1 + i_outer * 4] = 0.0f; - a[2 + i_outer * 4] = 0.0f; - a[3 + i_outer * 4] = 0.0f; + a[4 * i_outer + 0] = 0.0f; + a[4 * i_outer + 1] = 0.0f; + a[4 * i_outer + 2] = 0.0f; + a[4 * i_outer + 3] = 0.0f; } /* final slab for 'i_outer' */ for (int i_outer = -1 + n + -1 * (3 * n / 4); i_outer <= -1 + ((3 + n) / 4); ++i_outer) if (-1 + n >= 0) { - a[0 + i_outer * 4] = 0.0f; + a[4 * i_outer + 0] = 0.0f; if (-2 + -4 * i_outer + n >= 0) - a[1 + i_outer * 4] = 0.0f; + a[4 * i_outer + 1] = 0.0f; if (-3 + -4 * i_outer + n >= 0) - a[2 + i_outer * 4] = 0.0f; + a[4 * i_outer + 2] = 0.0f; if (4 + 4 * i_outer + -1 * n == 0) - a[3 + i_outer * 4] = 0.0f; + a[4 * i_outer + 3] = 0.0f; } ... @@ -1020,7 +1020,7 @@ earlier: if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)]; + a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { @@ -1187,9 +1187,9 @@ When we ask to see the code, the issue becomes apparent: float a_fetch[16]; <BLANKLINE> ... - a_fetch[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)]; + a_fetch[lid(0)] = a[n * (16 * gid(1) + lid(0)) + 16 * gid(0) + lid(1)]; ... - out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch[lid(0)]; + out[n * (16 * gid(0) + lid(1)) + 16 * gid(1) + lid(0)] = a_fetch[lid(0)]; ... } @@ -1493,9 +1493,9 @@ Now to make things more interesting, we'll create a kernel with barriers: for (int i = 0; i <= 49; ++i) { barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1 + gid(0) * 128] = 2 * a[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128]; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1 + gid(0) * 128] = c[990 * i + 99 * j + 1 + lid(0) + 1 + gid(0) * 128] + c[990 * i + 99 * j + -1 + lid(0) + 1 + gid(0) * 128]; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; } }