diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index d76c7b433a3bdde9fa69123ca0cdc1d496b7de08..204139a196b9433df7dd1a3a51fbe5141f23dd62 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -276,17 +276,6 @@ RADIX_SORT_PREAMBLE_TPL = Template(r"""//CL//
 #endif
 
 <%
-    def get_count_branch(known_bits):
-        if len(known_bits) == bits:
-            return "s.c%s" % known_bits
-
-        b = len(known_bits)
-        boundary_mnr = known_bits + "1" + (bits-b-1)*"0"
-
-        return ("((mnr < %s) ? %s : %s)" % (
-            int(boundary_mnr, 2),
-            get_count_branch(known_bits+"0"),
-            get_count_branch(known_bits+"1")))
 %>
 
 index_t get_count(scan_t s, int mnr)
@@ -375,7 +364,7 @@ class RadixSort(object):
     on the compute device.
     """
     def __init__(self, context, arguments, key_expr, sort_arg_names,
-            bits_at_a_time=4, index_dtype=np.int32, key_dtype=np.uint32,
+            bits_at_a_time=3, index_dtype=np.int32, key_dtype=np.uint32,
             options=[]):
         """
         :arg arguments: A string of comma-separated C argument declarations.
@@ -417,6 +406,17 @@ class RadixSort(object):
                     if arg.name in sort_arg_names]
                 + [ScalarArg(np.int32, "base_bit")])
 
+        def get_count_branch(known_bits):
+            if len(known_bits) == self.bits:
+                return "s.c%s" % known_bits
+
+            boundary_mnr = known_bits + "1" + (self.bits-len(known_bits)-1)*"0"
+
+            return ("((mnr < %s) ? %s : %s)" % (
+                int(boundary_mnr, 2),
+                get_count_branch(known_bits+"0"),
+                get_count_branch(known_bits+"1")))
+
         codegen_args = dict(
                 bits=self.bits,
                 key_ctype=dtype_to_ctype(self.key_dtype),
@@ -426,6 +426,7 @@ class RadixSort(object):
                 padded_bin=_padded_bin,
                 scan_ctype=scan_ctype,
                 sort_arg_names=sort_arg_names,
+                get_count_branch=get_count_branch,
                 )
 
         preamble = scan_t_cdecl+RADIX_SORT_PREAMBLE_TPL.render(**codegen_args)
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 526f679c8f46fc0b7a782b33f5f32a21933a375e..03bfef17e1bc9b51f442aac9b8863f439efedb72 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -988,8 +988,10 @@ class GenericScanKernel(_GenericScanKernelBase):
             # (about the widest vector a CPU can support, also taking
             # into account that CPUs don't hide latency by large work groups
             max_scan_wg_size = 16
+            wg_size_multiples = 16
         else:
             max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices)
+            wg_size_multiples = 64
 
         avail_local_mem = min(
                 dev.local_mem_size
@@ -998,14 +1000,17 @@
 
         # k_group_size should be a power of two because of in-kernel
        # division by that number.
-        k_group_size = 128
+        solutions = []
+        for k_exp in range(0, 7):
+            for wg_size in range(wg_size_multiples, max_scan_wg_size+1,
+                    wg_size_multiples):
 
-        while (
-                self.get_local_mem_use(
-                    max_scan_wg_size, k_group_size) + 256 > avail_local_mem):
-            k_group_size //= 2
+                k_group_size = 2**k_exp
+                if (self.get_local_mem_use(
+                        wg_size, k_group_size) + 256 <= avail_local_mem):
+                    solutions.append((wg_size*k_group_size, k_group_size, wg_size))
 
-        assert k_group_size > 1
+        _, k_group_size, max_scan_wg_size = max(solutions)
 
         while True:
             candidate_scan_info = self.build_scan_kernel(
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 8ebf0cd007fad808398e85d0661c7a8ed2860f19..e63ca2f63a19eefb9cdf8eca436a6fcdd555111c 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -401,7 +401,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
 
     src = r"""
     #define pycl_offsetof(st, m) \
-            ((size_t) ( (char *)&((st *)0)->m - (char *)0 ))
+            ((size_t) ((__local char *) &(dummy.m) - (__local char *)&dummy ))
 
     %(pre_decls)s
 
@@ -410,6 +410,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
     __kernel void get_size_and_offsets(__global size_t *result)
     {
         result[0] = sizeof(%(my_type)s);
+        __local %(my_type)s dummy;
         %(offset_code)s
     }
     """ % dict(
@@ -428,9 +429,24 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
 
     import pyopencl.array
     result_buf = cl.array.empty(queue, 1+len(fields), np.uintp)
-    knl(queue, (1,), None, result_buf.data)
+    knl(queue, (1,), (1,), result_buf.data)
+    queue.finish()
     size_and_offsets = result_buf.get()
 
+    size = int(size_and_offsets[0])
+
+    from pytools import any
+    offsets = size_and_offsets[1:]
+    if any(ofs >= size for ofs in offsets):
+        # offsets not plausible
+
+        if dtype.itemsize == size:
+            # If sizes match, use numpy's idea of the offsets.
+            offsets = [offset
+                    for field_name, (field_dtype, offset) in fields]
+        else:
+            raise RuntimeError("cannot discover struct layout on '%s'" % device)
+
     result_buf.data.release()
     del knl
     del prg
@@ -440,7 +456,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
     dtype_arg_dict = dict(
            names=[field_name for field_name, (field_dtype, offset) in fields],
            formats=[field_dtype for field_name, (field_dtype, offset) in fields],
-           offsets=[int(x) for x in size_and_offsets[1:]],
+           offsets=[int(x) for x in offsets],
            itemsize=int(size_and_offsets[0]),
            )
     dtype = np.dtype(dtype_arg_dict)
diff --git a/test/test_array.py b/test/test_array.py
index 9e680727c5be30be7c33831ba7d3a207f58b999c..cdc1cfefd5b3ea3c6fb79fbafab4c839d9dace01 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -816,7 +816,9 @@ scan_test_counts = [
        2 ** 20 - 2 ** 18,
        2 ** 20 - 2 ** 18 + 5,
        2 ** 20 + 1,
-        2 ** 20, 2 ** 24
+        2 ** 20,
+        2 ** 23 + 3,
+        2 ** 24 + 5
        ]
 
 @pytools.test.mark_test.opencl
@@ -1070,8 +1072,8 @@ def test_sort(ctx_factory):
 
         numpy_elapsed = numpy_end-dev_end
         dev_elapsed = dev_end-dev_start
 
-        print "  dev: %.2f s numpy: %.2f ratio: %.1fx" % (
-                dev_elapsed, numpy_elapsed, dev_elapsed/numpy_elapsed)
+        print "  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
+                1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)
 
         assert (a_dev_sorted.get() == a_sorted).all()
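Not part of the patch, for illustration only: a standalone sketch of what the relocated
get_count_branch() helper emits once RADIX_SORT_PREAMBLE_TPL is rendered. It generates a
binary-search tree of C ternary expressions over the per-bucket scan counters (s.c000 ...
s.c111 when bits_at_a_time=3), presumably substituted into the get_count() routine shown
as context in the first hunk. The bits value below mirrors the new bits_at_a_time default
of 3; the helper body is copied from the hunk, only pulled out of RadixSort.__init__ so it
can be run on its own.

# Illustrative sketch -- mirrors the get_count_branch() closure added above.
# "bits" stands in for self.bits (i.e. bits_at_a_time, default 3 after this patch).
bits = 3

def get_count_branch(known_bits):
    # Once all bits of the bucket number are fixed, emit that bucket's counter.
    if len(known_bits) == bits:
        return "s.c%s" % known_bits

    # Midpoint of the remaining range: the known bits, then a 1, padded with zeros.
    boundary_mnr = known_bits + "1" + (bits-len(known_bits)-1)*"0"

    # Ternary expression that binary-searches toward the matching counter.
    return ("((mnr < %s) ? %s : %s)" % (
        int(boundary_mnr, 2),
        get_count_branch(known_bits+"0"),
        get_count_branch(known_bits+"1")))

print(get_count_branch(""))
# prints (as a single line):
# ((mnr < 4) ? ((mnr < 2) ? ((mnr < 1) ? s.c000 : s.c001)
#                         : ((mnr < 3) ? s.c010 : s.c011))
#            : ((mnr < 6) ? ((mnr < 5) ? s.c100 : s.c101)
#                         : ((mnr < 7) ? s.c110 : s.c111)))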