From 29f26f72c96b69784d896145a6406064cda0dac7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Mon, 8 Aug 2011 17:44:19 +0200 Subject: [PATCH] Avoid lmem bank conflicts only on hardware sporting real lmem. --- loopy/codegen/prefetch.py | 70 +++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index bec065c9a..53bb1766f 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -1,6 +1,7 @@ from __future__ import division from pytools import Record +import pyopencl as cl import pyopencl.characterize as cl_char from loopy.codegen import wrap_in, gen_code_block @@ -30,36 +31,47 @@ def preprocess_prefetch(kernel): other_dim_sizes = (pf.itemsize * product(dim_storage_lengths[:-1])) - min_mult = cl_char.local_memory_bank_count(kernel.device) - good_incr = None - new_dsl = dim_storage_lengths - min_why_not = None - - for increment in range(dim_storage_lengths[-1]//2): - - test_dsl = dim_storage_lengths[:] - test_dsl[-1] = test_dsl[-1] + increment - new_mult, why_not = cl_char.why_not_local_access_conflict_free( - kernel.device, pf.itemsize, - shape, test_dsl) - - # will choose smallest increment 'automatically' - if new_mult < min_mult: - new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) - if new_lmem_use < lmem_size: - new_dsl = test_dsl - min_mult = new_mult - min_why_not = why_not - good_incr = increment - - if min_mult != 1: + if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL: + # FIXME: could try to avoid cache associativity disasters + new_dsl = dim_storage_lengths + + elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: + min_mult = cl_char.local_memory_bank_count(kernel.device) + good_incr = None + new_dsl = dim_storage_lengths + min_why_not = None + + for increment in range(dim_storage_lengths[-1]//2): + + test_dsl = dim_storage_lengths[:] + test_dsl[-1] = test_dsl[-1] + increment + new_mult, why_not = cl_char.why_not_local_access_conflict_free( + kernel.device, pf.itemsize, + shape, test_dsl) + + # will choose smallest increment 'automatically' + if new_mult < min_mult: + new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) + if new_lmem_use < lmem_size: + new_dsl = test_dsl + min_mult = new_mult + min_why_not = why_not + good_incr = increment + + if min_mult != 1: + from warnings import warn + from loopy import LoopyAdvisory + warn("could not find a conflict-free mem layout " + "for prefetch of '%s' " + "(currently: %dx conflict, increment: %d, reason: %s)" + % (pf.input_vector, min_mult, good_incr, min_why_not), + LoopyAdvisory) + else: from warnings import warn - from loopy import LoopyAdvisory - warn("could not find a conflict-free mem layout " - "for prefetch of '%s' " - "(currently: %dx conflict, increment: %d, reason: %s)" - % (pf.input_vector, min_mult, good_incr, min_why_not), - LoopyAdvisory) + warn("unknown type of local memory") + + new_dsl = dim_storage_lengths + new_pf = pf.copy(dim_storage_lengths=new_dsl, name="prefetch_%s_%d" % (pf.input_vector, i_pf)) -- GitLab