From 7f06cf219ce05de1b463e53fbdc6c14db4b97d61 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 8 Nov 2012 23:57:28 -0600 Subject: [PATCH] Introduce work group size floor in scan. --- pyopencl/scan.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pyopencl/scan.py b/pyopencl/scan.py index 3a82172a..af428afe 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -975,7 +975,6 @@ class GenericScanKernel(_GenericScanKernelBase): trip_count = 0 - if self.devices[0].type == cl.device_type.CPU: # (about the widest vector a CPU can support, also taking # into account that CPUs don't hide latency by large work groups @@ -1002,6 +1001,19 @@ class GenericScanKernel(_GenericScanKernelBase): wg_size, k_group_size) + 256 <= avail_local_mem): solutions.append((wg_size*k_group_size, k_group_size, wg_size)) + if self.devices[0].type == cl.device_type.GPU: + from pytools import any + for wg_size_floor in [256, 192, 128]: + have_sol_above_floor = any(wg_size >= wg_size_floor + for _, _, wg_size in solutions) + + if have_sol_above_floor: + # delete all the others + solutions = [(total, k_group_size, wg_size) + for total, k_group_size, wg_size in solutions + if wg_size >= wg_size_floor] + break + _, k_group_size, max_scan_wg_size = max(solutions) while True: @@ -1140,7 +1152,6 @@ class GenericScanKernel(_GenericScanKernelBase): wg_size = _round_down_to_power_of_2( min(max_wg_size, 128)) - scan_tpl = _make_template(SCAN_INTERVALS_SOURCE) scan_src = str(scan_tpl.render( wg_size=wg_size, -- GitLab