From 29f26f72c96b69784d896145a6406064cda0dac7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 8 Aug 2011 17:44:19 +0200
Subject: [PATCH] Avoid lmem bank conflicts only on hardware sporting real
 lmem.

---
 loopy/codegen/prefetch.py | 70 +++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py
index bec065c9a..53bb1766f 100644
--- a/loopy/codegen/prefetch.py
+++ b/loopy/codegen/prefetch.py
@@ -1,6 +1,7 @@
 from __future__ import division
 
 from pytools import Record
+import pyopencl as cl
 import pyopencl.characterize as cl_char
 from loopy.codegen import wrap_in, gen_code_block
 
@@ -30,36 +31,47 @@ def preprocess_prefetch(kernel):
         other_dim_sizes = (pf.itemsize
                 * product(dim_storage_lengths[:-1]))
 
-        min_mult = cl_char.local_memory_bank_count(kernel.device)
-        good_incr = None
-        new_dsl = dim_storage_lengths
-        min_why_not = None
-
-        for increment in range(dim_storage_lengths[-1]//2):
-
-            test_dsl = dim_storage_lengths[:]
-            test_dsl[-1] = test_dsl[-1] + increment
-            new_mult, why_not = cl_char.why_not_local_access_conflict_free(
-                    kernel.device, pf.itemsize,
-                    shape, test_dsl)
-
-            # will choose smallest increment 'automatically'
-            if new_mult < min_mult:
-                new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl)
-                if new_lmem_use < lmem_size:
-                    new_dsl = test_dsl
-                    min_mult = new_mult
-                    min_why_not = why_not
-                    good_incr = increment
-
-        if min_mult != 1:
+        if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL:
+            # FIXME: could try to avoid cache associativity disasters
+            new_dsl = dim_storage_lengths
+
+        elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL:
+            min_mult = cl_char.local_memory_bank_count(kernel.device)
+            good_incr = None
+            new_dsl = dim_storage_lengths
+            min_why_not = None
+
+            for increment in range(dim_storage_lengths[-1]//2):
+
+                test_dsl = dim_storage_lengths[:]
+                test_dsl[-1] = test_dsl[-1] + increment
+                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
+                        kernel.device, pf.itemsize,
+                        shape, test_dsl)
+
+                # will choose smallest increment 'automatically'
+                if new_mult < min_mult:
+                    new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl)
+                    if new_lmem_use < lmem_size:
+                        new_dsl = test_dsl
+                        min_mult = new_mult
+                        min_why_not = why_not
+                        good_incr = increment
+
+            if min_mult != 1:
+                from warnings import warn
+                from loopy import LoopyAdvisory
+                warn("could not find a conflict-free mem layout "
+                        "for prefetch of '%s' "
+                        "(currently: %dx conflict, increment: %d, reason: %s)"
+                        % (pf.input_vector, min_mult, good_incr, min_why_not),
+                        LoopyAdvisory)
+        else:
             from warnings import warn
-            from loopy import LoopyAdvisory
-            warn("could not find a conflict-free mem layout "
-                    "for prefetch of '%s' "
-                    "(currently: %dx conflict, increment: %d, reason: %s)"
-                    % (pf.input_vector, min_mult, good_incr, min_why_not),
-                    LoopyAdvisory)
+            warn("unknown type of local memory")
+
+            new_dsl = dim_storage_lengths
+
 
         new_pf = pf.copy(dim_storage_lengths=new_dsl,
                 name="prefetch_%s_%d" % (pf.input_vector, i_pf))
-- 
GitLab