diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py
index b0a98ad0305062a42f5f4886e5ce556c25e558c3..0d9a1a462e5d19d7ecc69a80044e1dd8a8577ee6 100644
--- a/pyopencl/characterize.py
+++ b/pyopencl/characterize.py
@@ -242,3 +242,37 @@ def why_not_local_access_conflict_free(dev, itemsize,
 def get_fast_inaccurate_build_options(dev):
     return ["-cl-mad-enable", "-cl-fast-relaxed-math",
         "-cl-no-signed-zeros", "-cl-strict-aliasing"]
+
+
+def get_simd_group_size(dev):
+    """Return an estimate of the SIMD group size of *dev*, or *None*.
+
+    This is the number of work items assumed to execute in lock-step
+    (Nvidia calls this a warp, AMD a wavefront). *None* is returned if
+    no estimate is available for the device.
+    """
+    try:
+        return dev.warp_size_nv
+    except Exception:
+        # warp_size_nv could not be queried; fall back to the
+        # vendor-based guesses below.
+        pass
+
+    lc_vendor = dev.vendor.lower()
+    if "nvidia" in lc_vendor:
+        return 32
+
+    # Match "amd"/"ati" as whole words only: a plain substring test for
+    # "ati" also matches e.g. "intel(r) corporation".
+    vendor_words = lc_vendor.replace(",", " ").split()
+    is_amd = ("advanced micro devices" in lc_vendor
+            or "amd" in vendor_words or "ati" in vendor_words)
+
+    # The device type is a bit field, so test the GPU bit instead of
+    # comparing for equality.
+    if is_amd and dev.type & cl.device_type.GPU:
+        # NOTE(review): 32 is a guess; confirm it does not exceed the
+        # wavefront size on low-end AMD GPUs.
+        return 32
+
+    return None
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index ea5adcc2642c9b27b78e1b624b5808d396a342a4..6949320b4cf1aeb9d7c300b3b9fe0d8fdeebf949 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -171,14 +171,16 @@ def get_reduction_source(
     # {{{ compute synchronization-less group size
 
     def get_dev_no_sync_size(device):
-        try:
-            return device.warp_size_nv
-        except:
-            if "nvidia" in device.vendor.lower():
-                from warnings import warn
-                warn("Reduction might be unnecessarily slow: "
-                        "can't query warp size on Nvidia device")
+        from pyopencl.characterize import get_simd_group_size
+        result = get_simd_group_size(device)
+
+        if result is None:
+            from warnings import warn
+            warn("Reduction might be unnecessarily slow: "
+                    "can't query SIMD group size")
             return 1
+
+        return result
 
     no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)
 