From 645daafbbb6b510aa8b8431ea630b71339064a45 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 22 Sep 2015 00:17:43 -0500
Subject: [PATCH] Avoid naming prefetch buffers from add_prefetch() *_fetch_0

---
 doc/tutorial.rst  | 23 +++++++++++++----------
 loopy/__init__.py | 10 +++++++---
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 80b862452..bf8fdfec1 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -997,9 +997,10 @@ transformation exists in :func:`loopy.add_prefetch`:
     >>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-        a_fetch_0 = a[16 * gid(0) + lid(0)];
+        a_fetch = a[16 * gid(0) + lid(0)];
+        acc_k = 0.0f;
         for (int k = 0; k <= 15; ++k)
-          acc_k = acc_k + a_fetch_0;
+          acc_k = acc_k + a_fetch;
         out[16 * gid(0) + lid(0)] = acc_k;
     ...
 
@@ -1021,12 +1022,14 @@ earlier:
     #define lid(N) ((int) get_local_id(N))
     ...
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
-        a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)];
-      barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn_k_update depends on a_fetch) */;
+        a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)];
+      if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
+        acc_k = 0.0f;
+      barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
         for (int k = 0; k <= 15; ++k)
-          acc_k = acc_k + a_fetch_0[lid(0)];
+          acc_k = acc_k + a_fetch[lid(0)];
         out[16 * gid(0) + lid(0)] = acc_k;
       }
     ...
@@ -1151,7 +1154,7 @@ sign that something is amiss:
     >>> evt, (out,) = knl(queue, a=a_mat_dev)
     Traceback (most recent call last):
     ...
-    WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable)
+    WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable)
 
 When we ask to see the code, the issue becomes apparent:
 
@@ -1167,17 +1170,17 @@ When we ask to see the code, the issue becomes apparent:
     <BLANKLINE>
     __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out)
     {
-      float a_fetch_0[16];
+      float a_fetch[16];
     <BLANKLINE>
       ...
-          a_fetch_0[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)];
+          a_fetch[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)];
       ...
-          out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch_0[lid(0)];
+          out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch[lid(0)];
       ...
     }
 
 Loopy has a 2D workgroup to use for prefetching of a 1D array. When it
-considers making *a_fetch_0* ``local`` (in the OpenCL memory sense of the word)
+considers making *a_fetch* ``local`` (in the OpenCL memory sense of the word)
 to make use of parallelism in prefetching, it discovers that a write race
 across the remaining axis of the workgroup would emerge.
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index d417648b7..2809157eb 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1023,7 +1023,8 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
 
 
 def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
-        default_tag="l.auto", rule_name=None, footprint_subscripts=None,
+        default_tag="l.auto", rule_name=None, temporary_name=None,
+        footprint_subscripts=None,
         fetch_bounding_box=False):
     """Prefetch all accesses to the variable *var_name*, with all accesses
     being swept through *sweep_inames*.
@@ -1082,7 +1083,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     var_name_gen = kernel.get_var_name_generator()
 
     if rule_name is None:
-        rule_name = var_name_gen("%s_fetch" % c_name)
+        rule_name = var_name_gen("%s_fetch_rule" % c_name)
+    if temporary_name is None:
+        temporary_name = var_name_gen("%s_fetch" % c_name)
 
     arg = kernel.arg_dict[var_name]
 
@@ -1119,7 +1122,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     new_kernel = precompute(kernel, subst_use, sweep_inames,
             precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=arg.dtype,
-            fetch_bounding_box=fetch_bounding_box)
+            fetch_bounding_box=fetch_bounding_box,
+            temporary_name=temporary_name)
 
     # {{{ remove inames that were temporarily added by slice sweeps
 
-- 
GitLab