diff --git a/doc/source/misc.rst b/doc/source/misc.rst index 79614aba908ee5feabab58514a18216e7be066aa..e1ae14a4e6f4eaa259c30e9c62c36a5ab120dd52 100644 --- a/doc/source/misc.rst +++ b/doc/source/misc.rst @@ -17,6 +17,8 @@ Acknowledgments PyOpenCL. * Paolo Simone Gasparello, Keith Brafford, and Ian Johnson provided much help in getting OpenCL-OpenGL interoperability to work. +* Sean True allowed access to a test machine to ensure compatibility + with OS X Lion. Guidelines ========== @@ -86,6 +88,8 @@ Version 2011.2 * Base :mod:`pyopencl.clrandom` on `RANLUXCL `_, add functionality. * Add :class:`pyopencl.NannyEvent` objects. +* Add :mod:`pyopencl.characterize`. +* Ensure compatibility with OS X Lion. Version 2011.1.2 ---------------- diff --git a/pyopencl/array.py b/pyopencl/array.py index 4af6d812a4284b1682161916b3fac98c1f3d730d..3a7c8d1b5a97fb6ad6a01b60764c0ace597b49e9 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -101,9 +101,14 @@ _create_vector_types() # {{{ helper functionality -def splay(queue, n): +def splay(queue, n, kernel_specific_max_wg_size=None): dev = queue.device max_work_items = _builtin_min(128, dev.max_work_group_size) + + if kernel_specific_max_wg_size is not None: + from __builtin__ import min + max_work_items = min(max_work_items, kernel_specific_max_wg_size) + min_work_items = _builtin_min(32, max_work_items) max_groups = dev.max_compute_units * 4 * 8 # 4 to overfill the device @@ -143,8 +148,11 @@ def elwise_kernel_runner(kernel_getter): repr_ary = args[0] queue = kwargs.pop("queue", None) or repr_ary.queue - gs, ls = repr_ary.get_sizes(queue) knl = kernel_getter(*args) + gs, ls = repr_ary.get_sizes(queue, + knl.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device)) assert isinstance(repr_ary, Array) @@ -301,8 +309,9 @@ class Array(object): return _ArrayFlags(self) #@memoize_method FIXME: reenable - def get_sizes(self, queue): - return splay(queue, self.mem_size) + def get_sizes(self, queue, kernel_specific_max_wg_size=None): + return splay(queue, self.mem_size, + kernel_specific_max_wg_size=kernel_specific_max_wg_size) def set(self, ary, queue=None, async=False): assert ary.size == self.size @@ -881,7 +890,11 @@ def multi_take(arrays, indices, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = indices.get_sizes(queue) + gs, ls = indices.get_sizes(queue, + knl.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device)) + knl(queue, gs, ls, indices.data, *([o.data for o in out[chunk_slice]] @@ -949,7 +962,11 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = src_indices.get_sizes(queue) + gs, ls = src_indices.get_sizes(queue, + knl.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device)) + knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] + [dest_indices.data, src_indices.data] @@ -1002,7 +1019,11 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None): if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) - gs, ls = dest_indices.get_sizes(queue) + gs, ls = dest_indices.get_sizes(queue, + knl.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device)) + knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] + [dest_indices.data] diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py index d3f0344c89db60451ea3b5b3e18c5ee715285254..1ba4a9487ddc99ed89102faf069daca9a744575f 100644 --- a/pyopencl/characterize.py +++ b/pyopencl/characterize.py @@ -250,14 +250,24 @@ def why_not_local_access_conflict_free(dev, itemsize, def get_fast_inaccurate_build_options(dev): + """Return a list of flags valid on device *dev* that enable fast, but + potentially inaccurate floating point math. + """ return ["-cl-mad-enable", "-cl-fast-relaxed-math", "-cl-no-signed-zeros", "-cl-strict-aliasing"] -def get_simd_group_size(dev): - """Only refers to implicit SIMD.""" +def get_simd_group_size(dev, type_size): + """Return an estimate of how many work items will be executed across SIMD + lanes. This returns the size of what Nvidia calls a warp and what AMD calls + a wavefront. + + Only refers to implicit SIMD. + + :arg type_size: number of bytes in vector entry type. + """ try: return dev.warp_size_nv except: @@ -278,13 +288,13 @@ def get_simd_group_size(dev): if dev.type == cl.device_type.CPU: # implicit assumption: Impl. will vectorize - if dtype.itemsize == 1: + if type_size == 1: return dev.preferred_vector_width_char - elif dtype.itemsize == 2: + elif type_size == 2: return dev.preferred_vector_width_short - elif dtype.itemsize == 4: + elif type_size == 4: return dev.preferred_vector_width_float - elif dtype.itemsize == 8: + elif type_size == 8: return dev.preferred_vector_width_double else: raise ValueError("unexpected dtype size in get_simd_group_size") diff --git a/pyopencl/compyte b/pyopencl/compyte index 52aecae2c0019caa81342ab79b47f60601a6a1b1..1cd20d001c203703aecf218846ae0b28ea5f4211 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit 52aecae2c0019caa81342ab79b47f60601a6a1b1 +Subproject commit 1cd20d001c203703aecf218846ae0b28ea5f4211 diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index eeb6dde0e8283b0304884ff48d97b78b65f1e6a5..6452535bfbf8ad11f521687e0e3bb4c634bb30eb 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -156,7 +156,10 @@ class ElementwiseKernel: queue = repr_vec.queue invocation_args.append(repr_vec.mem_size) - gs, ls = repr_vec.get_sizes(queue) + gs, ls = repr_vec.get_sizes(queue, + self.kernel.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + queue.device)) self.kernel.set_args(*invocation_args) return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls) diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 6949320b4cf1aeb9d7c300b3b9fe0d8fdeebf949..aecc202d34cd06e43adcd5fb23bc7f54414307a9 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -153,9 +153,9 @@ def get_reduction_source( def get_dev_group_size(device): # dirty fix for the RV770 boards - max_work_group_size=device.max_work_group_size + max_work_group_size = device.max_work_group_size if "RV770" in device.name: - max_work_group_size=64 + max_work_group_size = 64 return min( max_work_group_size, (device.local_mem_size + out_type_size - 1) @@ -171,16 +171,16 @@ def get_reduction_source( # {{{ compute synchronization-less group size def get_dev_no_sync_size(device): - from pyopencl.characterize import get_simd_group_size - result = get_simd_group_size(device) + from pyopencl.characterize import get_simd_group_size + result = get_simd_group_size(device, out_type_size) - if result is None: - from warnings import warn - warn("Reduction might be unnecessarily slow: " - "can't query SIMD group size") + if result is None: + from warnings import warn + warn("Reduction might be unnecessarily slow: " + "can't query SIMD group size") return 1 - - return result + + return result no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices) @@ -262,16 +262,34 @@ class ReductionKernel: dtype_out = self.dtype_out = np.dtype(dtype_out) - self.stage_1_inf = get_reduction_kernel(ctx, - dtype_to_ctype(dtype_out), dtype_out.itemsize, - neutral, reduce_expr, map_expr, arguments, - name=name+"_stage1", options=options, preamble=preamble) + max_group_size = None + trip_count = 0 + + while True: + self.stage_1_inf = get_reduction_kernel(ctx, + dtype_to_ctype(dtype_out), dtype_out.itemsize, + neutral, reduce_expr, map_expr, arguments, + name=name+"_stage1", options=options, preamble=preamble, + max_group_size=max_group_size) + + kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info( + cl.kernel_work_group_info.WORK_GROUP_SIZE, + ctx.devices[0]) + + if self.stage_1_inf.group_size <= kernel_max_wg_size: + break + else: + max_group_size = kernel_max_wg_size + + trip_count += 1 + assert trip_count <= 2 # stage 2 has only one input and no map expression self.stage_2_inf = get_reduction_kernel(ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize, neutral, reduce_expr, - name=name+"_stage2", options=options, preamble=preamble) + name=name+"_stage2", options=options, preamble=preamble, + max_group_size=max_group_size) from pytools import any from pyopencl.tools import VectorArg diff --git a/test/test_array.py b/test/test_array.py index 7722fde5cb441194acd6c5d4ece6dd0495dbd836..120f3132ee19e88f56bd03824e854f7042819057 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -322,7 +322,7 @@ def test_elwise_kernel_with_options(ctx_factory): in_gpu = clrand(queue, (50,), np.float32) - options = ['-DADD_ONE'] + options = ['-D', 'ADD_ONE'] add_one = ElementwiseKernel( context, "float* out, const float *in",