diff --git a/doc/algorithm.rst b/doc/algorithm.rst index fa3c3fad81ce929479ddafa5c80958055f799fdb..3ad9c53edc6c81bc7b6f0b804930f8cf60a73721 100644 --- a/doc/algorithm.rst +++ b/doc/algorithm.rst @@ -271,4 +271,9 @@ Building many variable-size lists .. autoclass:: ListOfListsBuilder - .. automethod:: __call__ +Bitonic Sort +------------ + +.. module:: pyopencl.bitonic_sort + +.. autoclass:: BitonicSort diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index e0533a2d2c3ed9f43bcf75a1d77d31d0a0815844..9f300c1dbf13a8f4af5f17735aaae52a92f54c25 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -395,6 +395,7 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL// # {{{ driver + # import hoisted here to be used as a default argument in the constructor from pyopencl.scan import GenericScanKernel @@ -403,6 +404,8 @@ class RadixSort(object): """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_ on the compute device. + .. seealso:: :class:`pyopencl.algorithm.BitonicSort` + .. versionadded:: 2013.1 """ def __init__(self, context, arguments, key_expr, sort_arg_names, diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..d4beaba2c7aee15c4cf82ba5fc2f07daed8bcebe --- /dev/null +++ b/pyopencl/bitonic_sort.py @@ -0,0 +1,236 @@ +from __future__ import division, with_statement, absolute_import, print_function + +__copyright__ = """ +Copyright (c) 2011, Eric Bainville +Copyright (c) 2015, Ilya Efimoff +All rights reserved. +""" + +# based on code at http://www.bealto.com/gpu-sorting_intro.html + +__license__ = """ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import pyopencl as cl +from pyopencl.tools import dtype_to_ctype +from operator import mul +from functools import reduce +from pytools import memoize_method +from mako.template import Template + +import pyopencl.bitonic_sort_templates as _tmpl + + +def _is_power_of_2(n): + from pyopencl.tools import bitlog2 + return n == 0 or 2**bitlog2(n) == n + + +class BitonicSort(object): + """Sort an array (or one axis of one) using a sorting network. + + Will only work if the axis of the array to be sorted has a length + that is a power of 2. + + .. versionadded:: 2015.2 + + .. seealso:: :class:`pyopencl.algorithm.RadixSort` + + .. autofunction:: __call__ + """ + + kernels_srcs = { + 'B2': _tmpl.ParallelBitonic_B2, + 'B4': _tmpl.ParallelBitonic_B4, + 'B8': _tmpl.ParallelBitonic_B8, + 'B16': _tmpl.ParallelBitonic_B16, + 'C4': _tmpl.ParallelBitonic_C4, + 'BL': _tmpl.ParallelBitonic_Local, + 'BLO': _tmpl.ParallelBitonic_Local_Optim, + 'PML': _tmpl.ParallelMerge_Local + } + + def __init__(self, context): + self.context = context + + def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0): + """ + :arg arr: the array to be sorted. Will be overwritten with the sorted array. + :arg idx: an array of indices to be tracked along with the sorting of *arr* + :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue + if None + :arg wait_for: a list of :class:`pyopencl.Event` instances or None + :arg axis: the axis of the array by which to sort + + :returns: a tuple (sorted_array, event) + """ + + if queue is None: + queue = arr.queue + + if wait_for is None: + wait_for = [] + wait_for = wait_for + arr.events + + last_evt = cl.enqueue_marker(queue, wait_for=wait_for) + + if arr.shape[axis] == 0: + return arr, last_evt + + if not _is_power_of_2(arr.shape[axis]): + raise ValueError("sorted array axis length must be a power of 2") + + if idx is None: + argsort = 0 + else: + argsort = 1 + + run_queue = self.sort_b_prepare_wl( + argsort, + arr.dtype, + idx.dtype if idx is not None else None, arr.shape, + axis) + + knl, nt, wg, aux = run_queue[0] + + if idx is not None: + if aux: + last_evt = knl( + queue, (nt,), wg, arr.data, idx.data, + cl.LocalMemory(wg[0]*arr.dtype.itemsize), + cl.LocalMemory(wg[0]*idx.dtype.itemsize), + wait_for=[last_evt]) + for knl, nt, wg, _ in run_queue[1:]: + last_evt = knl( + queue, (nt,), wg, arr.data, idx.data, + wait_for=[last_evt]) + + else: + if aux: + last_evt = knl( + queue, (nt,), wg, arr.data, + cl.LocalMemory(wg[0]*4*arr.dtype.itemsize), + wait_for=[last_evt]) + for knl, nt, wg, _ in run_queue[1:]: + last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt]) + + return arr, last_evt + + @memoize_method + def get_program(self, letter, argsort, params): + defstpl = Template(_tmpl.defines) + + defs = defstpl.render( + NS="\\", argsort=argsort, inc=params[0], dir=params[1], + dtype=params[2], idxtype=params[3], + dsize=params[4], nsize=params[5]) + + kid = Template(self.kernels_srcs[letter]).render(argsort=argsort) + + prg = cl.Program(self.context, defs + kid).build() + return prg + + @memoize_method + def sort_b_prepare_wl(self, argsort, key_dtype, idx_dtype, shape, axis): + key_ctype = dtype_to_ctype(key_dtype) + + if idx_dtype is None: + idx_ctype = 'uint' # Dummy + + else: + idx_ctype = dtype_to_ctype(idx_dtype) + + run_queue = [] + ds = int(shape[axis]) + size = reduce(mul, shape) + ndim = len(shape) + + ns = reduce(mul, shape[(axis+1):]) if axis < ndim-1 else 1 + + ds = int(shape[axis]) + allowb4 = True + allowb8 = True + allowb16 = True + + dev = self.context.devices[0] + + # {{{ find workgroup size + + wg = min(ds, dev.max_work_group_size) + + available_lmem = dev.local_mem_size + while True: + lmem_size = wg*key_dtype.itemsize + if argsort: + lmem_size += wg*idx_dtype.itemsize + + if lmem_size + 512 > available_lmem: + wg //= 2 + + if not wg: + raise RuntimeError( + "too little local memory available on '%s'" + % dev) + + else: + break + + # }}} + + length = wg >> 1 + prg = self.get_program( + 'BLO', argsort, (1, 1, key_ctype, idx_ctype, ds, ns)) + run_queue.append((prg.run, size, (wg,), True)) + + while length < ds: + inc = length + while inc > 0: + ninc = 0 + direction = length << 1 + if allowb16 and inc >= 8 and ninc == 0: + letter = 'B16' + ninc = 4 + elif allowb8 and inc >= 4 and ninc == 0: + letter = 'B8' + ninc = 3 + elif allowb4 and inc >= 2 and ninc == 0: + letter = 'B4' + ninc = 2 + elif inc >= 0: + letter = 'B2' + ninc = 1 + + nthreads = size >> ninc + + prg = self.get_program(letter, argsort, + (inc, direction, key_ctype, idx_ctype, ds, ns)) + run_queue.append((prg.run, nthreads, None, False,)) + inc >>= ninc + + length <<= 1 + + return run_queue diff --git a/pyopencl/bitonic_sort_templates.py b/pyopencl/bitonic_sort_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4f14e82a5943698cdbf5ff75e1b6522c09a8ed --- /dev/null +++ b/pyopencl/bitonic_sort_templates.py @@ -0,0 +1,584 @@ +__copyright__ = """ +Copyright (c) 2011, Eric Bainville +Copyright (c) 2015, Ilya Efimoff +All rights reserved. +""" + +__license__ = """ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + + +# {{{ defines + +defines = """//CL// +typedef ${dtype} data_t; +typedef ${idxtype} idx_t; +typedef ${idxtype}2 idx_t2; +#if CONFIG_USE_VALUE +#define getKey(a) ((a).x) +#define getValue(a) ((a).y) +#define makeData(k,v) ((${dtype}2)((k),(v))) +#else +#define getKey(a) (a) +#define getValue(a) (0) +#define makeData(k,v) (k) +#endif + +#ifndef BLOCK_FACTOR +#define BLOCK_FACTOR 1 +#endif + +#define inc ${inc} +#define hinc ${inc>>1} //Half inc +#define qinc ${inc>>2} //Quarter inc +#define einc ${inc>>3} //Eighth of inc +#define dir ${dir} + +% if argsort: +#define ORDER(a,b,ay,by) { bool swap = reverse ^ (getKey(a)<getKey(b));${NS} + data_t auxa = a; data_t auxb = b;${NS} + idx_t auya = ay; idx_t auyb = by;${NS} + a = (swap)?auxb:auxa; b = (swap)?auxa:auxb;${NS} + ay = (swap)?auyb:auya; by = (swap)?auya:auyb;} +#define ORDERV(x,y,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS} + data_t auxa = x[a]; data_t auxb = x[b];${NS} + idx_t auya = y[a]; idx_t auyb = y[b];${NS} + x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb;${NS} + y[a] = (swap)?auyb:auya; y[b] = (swap)?auya:auyb;} +#define B2V(x,y,a) { ORDERV(x,y,a,a+1) } +#define B4V(x,y,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) } +#define B8V(x,y,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,y,a+i8,a+i8+4) } B4V(x,y,a) B4V(x,y,a+4) } +#define B16V(x,y,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,y,a+i16,a+i16+8) } B8V(x,y,a) B8V(x,y,a+8) } +% else: +#define ORDER(a,b) { bool swap = reverse ^ (getKey(a)<getKey(b)); data_t auxa = a; data_t auxb = b; a = (swap)?auxb:auxa; b = (swap)?auxa:auxb; } +#define ORDERV(x,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS} + data_t auxa = x[a]; data_t auxb = x[b];${NS} + x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb; } +#define B2V(x,a) { ORDERV(x,a,a+1) } +#define B4V(x,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,a+i4,a+i4+2) } B2V(x,a) B2V(x,a+2) } +#define B8V(x,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,a+i8,a+i8+4) } B4V(x,a) B4V(x,a+4) } +#define B16V(x,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,a+i16,a+i16+8) } B8V(x,a) B8V(x,a+8) } +% endif +#define nsize ${nsize} //Total next dimensions sizes sum. (Block size) +#define dsize ${dsize} //Dimension size +""" + +# }}} + + +# {{{ B2 + +ParallelBitonic_B2 = """//CL// +// N/2 threads +//ParallelBitonic_B2 +__kernel void run(__global data_t * data\\ +% if argsort: +, __global idx_t * index) +% else: +) +% endif +{ + int t = get_global_id(0) % (dsize>>1); // thread index + int gt = get_global_id(0) / (dsize>>1); + int low = t & (inc - 1); // low order bits (below INC) + int i = (t<<1) - low; // insert 0 at position INC + int gi = i/dsize; // block index + bool reverse = ((dir & i) == 0);// ^ (gi%2); // asc/desc order + + int offset = (gt/nsize)*nsize*dsize+(gt%nsize); + data += i*nsize + offset; // translate to first value +% if argsort: + index += i*nsize + offset; // translate to first value +% endif + + // Load data + data_t x0 = data[ 0]; + data_t x1 = data[inc*nsize]; +% if argsort: + // Load index + idx_t i0 = index[ 0]; + idx_t i1 = index[inc*nsize]; +% endif + + // Sort +% if argsort: + ORDER(x0,x1,i0,i1) +% else: + ORDER(x0,x1) +% endif + + // Store data + data[0 ] = x0; + data[inc*nsize] = x1; +% if argsort: + // Store index + index[ 0] = i0; + index[inc*nsize] = i1; +% endif +} +""" + +# }}} + + +# {{{ B4 + +ParallelBitonic_B4 = """//CL// +// N/4 threads +//ParallelBitonic_B4 +__kernel void run(__global data_t * data\\ +% if argsort: +, __global idx_t * index) +% else: +) +% endif +{ + int t = get_global_id(0) % (dsize>>2); // thread index + int gt = get_global_id(0) / (dsize>>2); + int low = t & (hinc - 1); // low order bits (below INC) + int i = ((t - low) << 2) + low; // insert 00 at position INC + bool reverse = ((dir & i) == 0); // asc/desc order + int offset = (gt/nsize)*nsize*dsize+(gt%nsize); + data += i*nsize + offset; // translate to first value +% if argsort: + index += i*nsize + offset; // translate to first value +% endif + + // Load data + data_t x0 = data[ 0]; + data_t x1 = data[ hinc*nsize]; + data_t x2 = data[2*hinc*nsize]; + data_t x3 = data[3*hinc*nsize]; +% if argsort: + // Load index + idx_t i0 = index[ 0]; + idx_t i1 = index[ hinc*nsize]; + idx_t i2 = index[2*hinc*nsize]; + idx_t i3 = index[3*hinc*nsize]; +% endif + + // Sort +% if argsort: + ORDER(x0,x2,i0,i2) + ORDER(x1,x3,i1,i3) + ORDER(x0,x1,i0,i1) + ORDER(x2,x3,i2,i3) +% else: + ORDER(x0,x2) + ORDER(x1,x3) + ORDER(x0,x1) + ORDER(x2,x3) +% endif + + // Store data + data[ 0] = x0; + data[ hinc*nsize] = x1; + data[2*hinc*nsize] = x2; + data[3*hinc*nsize] = x3; +% if argsort: + // Store index + index[ 0] = i0; + index[ hinc*nsize] = i1; + index[2*hinc*nsize] = i2; + index[3*hinc*nsize] = i3; +% endif +} +""" + +# }}} + + +# {{{ B8 + +ParallelBitonic_B8 = """//CL// +// N/8 threads +//ParallelBitonic_B8 +__kernel void run(__global data_t * data\\ +% if argsort: +, __global idx_t * index) +% else: +) +% endif +{ + int t = get_global_id(0) % (dsize>>3); // thread index + int gt = get_global_id(0) / (dsize>>3); + int low = t & (qinc - 1); // low order bits (below INC) + int i = ((t - low) << 3) + low; // insert 000 at position INC + bool reverse = ((dir & i) == 0); // asc/desc order + int offset = (gt/nsize)*nsize*dsize+(gt%nsize); + + data += i*nsize + offset; // translate to first value +% if argsort: + index += i*nsize + offset; // translate to first value +% endif + + // Load + data_t x[8]; +% if argsort: + idx_t y[8]; +% endif + for (int k=0;k<8;k++) x[k] = data[k*qinc*nsize]; +% if argsort: + for (int k=0;k<8;k++) y[k] = index[k*qinc*nsize]; +% endif + + // Sort +% if argsort: + B8V(x,y,0) +% else: + B8V(x,0) +% endif + + // Store + for (int k=0;k<8;k++) data[k*qinc*nsize] = x[k]; +% if argsort: + for (int k=0;k<8;k++) index[k*qinc*nsize] = y[k]; +% endif +} +""" + +# }}} + + +# {{{ B16 + +ParallelBitonic_B16 = """//CL// +// N/16 threads +//ParallelBitonic_B16 +__kernel void run(__global data_t * data\\ +% if argsort: +, __global idx_t * index) +% else: +) +% endif +{ + int t = get_global_id(0) % (dsize>>4); // thread index + int gt = get_global_id(0) / (dsize>>4); + int low = t & (einc - 1); // low order bits (below INC) + int i = ((t - low) << 4) + low; // insert 0000 at position INC + bool reverse = ((dir & i) == 0); // asc/desc order + int offset = (gt/nsize)*nsize*dsize+(gt%nsize); + + data += i*nsize + offset; // translate to first value +% if argsort: + index += i*nsize + offset; // translate to first value +% endif + + // Load + data_t x[16]; +% if argsort: + idx_t y[16]; +% endif + for (int k=0;k<16;k++) x[k] = data[k*einc*nsize]; +% if argsort: + for (int k=0;k<16;k++) y[k] = index[k*einc*nsize]; +% endif + + // Sort +% if argsort: + B16V(x,y,0) +% else: + B16V(x,0) +% endif + + // Store + for (int k=0;k<16;k++) data[k*einc*nsize] = x[k]; +% if argsort: + for (int k=0;k<16;k++) index[k*einc*nsize] = y[k]; +% endif +} +""" + +# }}} + + +# {{{ C4 + +ParallelBitonic_C4 = """//CL// +//ParallelBitonic_C4 +__kernel void run\\ +% if argsort: +(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy) +% else: +(__global data_t * data, __local data_t * aux) +% endif +{ + int t = get_global_id(0); // thread index + int wgBits = 4*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 4*WG) + int linc,low,i; + bool reverse; + data_t x[4]; +% if argsort: + idx_t y[4]; +% endif + + // First iteration, global input, local output + linc = hinc; + low = t & (linc - 1); // low order bits (below INC) + i = ((t - low) << 2) + low; // insert 00 at position INC + reverse = ((dir & i) == 0); // asc/desc order + for (int k=0;k<4;k++) x[k] = data[i+k*linc]; +% if argsort: + for (int k=0;k<4;k++) y[k] = index[i+k*linc]; + B4V(x,y,0); + for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k]; +% else: + B4V(x,0); +% endif + for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k]; + barrier(CLK_LOCAL_MEM_FENCE); + + // Internal iterations, local input and output + for ( ;linc>1;linc>>=2) + { + low = t & (linc - 1); // low order bits (below INC) + i = ((t - low) << 2) + low; // insert 00 at position INC + reverse = ((dir & i) == 0); // asc/desc order + for (int k=0;k<4;k++) x[k] = aux[(i+k*linc) & wgBits]; +% if argsort: + for (int k=0;k<4;k++) y[k] = auy[(i+k*linc) & wgBits]; + B4V(x,y,0); + barrier(CLK_LOCAL_MEM_FENCE); + for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k]; +% else: + B4V(x,0); + barrier(CLK_LOCAL_MEM_FENCE); +% endif + for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Final iteration, local input, global output, INC=1 + i = t << 2; + reverse = ((dir & i) == 0); // asc/desc order + for (int k=0;k<4;k++) x[k] = aux[(i+k) & wgBits]; +% if argsort: + for (int k=0;k<4;k++) y[k] = auy[(i+k) & wgBits]; + B4V(x,y,0); + for (int k=0;k<4;k++) index[i+k] = y[k]; +% else: + B4V(x,0); +% endif + for (int k=0;k<4;k++) data[i+k] = x[k]; +} +""" + +# }}} + + +# {{{ local merge + +ParallelMerge_Local = """//CL// +// N threads, WG is workgroup size. Sort WG input blocks in each workgroup. +__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux) +{ + int i = get_local_id(0); // index in workgroup + int wg = get_local_size(0); // workgroup size = block size, power of 2 + + // Move IN, OUT to block start + int offset = get_group_id(0) * wg; + in += offset; out += offset; + + // Load block in AUX[WG] + aux[i] = in[i]; + barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date + + // Now we will merge sub-sequences of length 1,2,...,WG/2 + for (int length=1;length<wg;length<<=1) + { + data_t iData = aux[i]; + data_t iKey = getKey(iData); + int ii = i & (length-1); // index in our sequence in 0..length-1 + int sibling = (i - ii) ^ length; // beginning of the sibling sequence + int pos = 0; + for (int pinc=length;pinc>0;pinc>>=1) // increment for dichotomic search + { + int j = sibling+pos+pinc-1; + data_t jKey = getKey(aux[j]); + bool smaller = (jKey < iKey) || ( jKey == iKey && j < i ); + pos += (smaller)?pinc:0; + pos = min(pos,length); + } + int bits = 2*length-1; // mask for destination + int dest = ((ii + pos) & bits) | (i & ~bits); // destination index in merged sequence + barrier(CLK_LOCAL_MEM_FENCE); + aux[dest] = iData; + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Write output + out[i] = aux[i]; +} +""" + +# }}} + + +# {{{ + +ParallelBitonic_Local = """//CL// +// N threads, WG is workgroup size. Sort WG input blocks in each workgroup. +__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux) +{ + int i = get_local_id(0); // index in workgroup + int wg = get_local_size(0); // workgroup size = block size, power of 2 + + // Move IN, OUT to block start + int offset = get_group_id(0) * wg; + in += offset; out += offset; + + // Load block in AUX[WG] + aux[i] = in[i]; + barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date + + // Loop on sorted sequence length + for (int length=1;length<wg;length<<=1) + { + bool direction = ((i & (length<<1)) != 0); // direction of sort: 0=asc, 1=desc + // Loop on comparison distance (between keys) + for (int pinc=length;pinc>0;pinc>>=1) + { + int j = i + pinc; // sibling to compare + data_t iData = aux[i]; + uint iKey = getKey(iData); + data_t jData = aux[j]; + uint jKey = getKey(jData); + bool smaller = (jKey < iKey) || ( jKey == iKey && j < i ); + bool swap = smaller ^ (j < i) ^ direction; + barrier(CLK_LOCAL_MEM_FENCE); + aux[i] = (swap)?jData:iData; + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Write output + out[i] = aux[i]; +} +""" + +# }}} + + +# {{{ A + +ParallelBitonic_A = """//CL// +__kernel void ParallelBitonic_A(__global const data_t * in) +{ + int i = get_global_id(0); // thread index + int j = i ^ inc; // sibling to compare + + // Load values at I and J + data_t iData = in[i]; + uint iKey = getKey(iData); + data_t jData = in[j]; + uint jKey = getKey(jData); + + // Compare + bool smaller = (jKey < iKey) || ( jKey == iKey && j < i ); + bool swap = smaller ^ (j < i) ^ ((dir & i) != 0); + + // Store + in[i] = (swap)?jData:iData; +} +""" + +# }}} + + +# {{{ local optim + +ParallelBitonic_Local_Optim = """//CL// +__kernel void run\\ +% if argsort: +(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy) +% else: +(__global data_t * data, __local data_t * aux) +% endif +{ + int t = get_global_id(0) % dsize; // thread index + int gt = get_global_id(0) / dsize; + int offset = (gt/nsize)*nsize*dsize+(gt%nsize); + + int i = get_local_id(0); // index in workgroup + int wg = get_local_size(0); // workgroup size = block size, power of 2 + + // Move IN, OUT to block start + //int offset = get_group_id(0) * wg; + data += offset; + // Load block in AUX[WG] + data_t iData = data[t*nsize]; + aux[i] = iData; +% if argsort: + index += offset; + // Load block in AUY[WG] + idx_t iidx = index[t*nsize]; + auy[i] = iidx; +% endif + barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date + + // Loop on sorted sequence length + for (int pwg=1;pwg<=wg;pwg<<=1){ + int loffset = pwg*(i/pwg); + int ii = i%pwg; + for (int length=1;length<pwg;length<<=1){ + bool direction = ii & (length<<1); // direction of sort: 0=asc, 1=desc + // Loop on comparison distance (between keys) + for (int pinc=length;pinc>0;pinc>>=1){ + int j = ii ^ pinc; // sibling to compare + data_t jData = aux[loffset+j]; +% if argsort: + idx_t jidx = auy[loffset+j]; +% endif + data_t iKey = getKey(iData); + data_t jKey = getKey(jData); + bool smaller = (jKey < iKey) || ( jKey == iKey && j < ii ); + bool swap = smaller ^ (ii>j) ^ direction; + iData = (swap)?jData:iData; // update iData +% if argsort: + iidx = (swap)?jidx:iidx; // update iidx +% endif + barrier(CLK_LOCAL_MEM_FENCE); + aux[loffset+ii] = iData; +% if argsort: + auy[loffset+ii] = iidx; +% endif + barrier(CLK_LOCAL_MEM_FENCE); + } + } + } + + // Write output + data[t*nsize] = iData; +% if argsort: + index[t*nsize] = iidx; +% endif +} +""" + +# }}} + +# vim: filetype=pyopencl:fdm=marker diff --git a/test/test_algorithm.py b/test/test_algorithm.py index b55c850e6681bd738b45378817a28e8c90ac1eb4..c9811cd8105d5816525323af5ce1768b5803288f 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -1,10 +1,6 @@ #! /usr/bin/env python -from __future__ import division, with_statement -from __future__ import absolute_import -from __future__ import print_function -from six.moves import range -from six.moves import zip +from __future__ import division, with_statement, absolute_import, print_function __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" @@ -28,6 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from six.moves import range, zip import numpy as np import numpy.linalg as la import sys @@ -839,6 +836,69 @@ def test_key_value_sorter(ctx_factory): # }}} +# {{{ bitonic sort + +@pytest.mark.parametrize("size", [ + 512, + 4, + 16 + ]) +@pytest.mark.parametrize("dtype", [ + np.int32, + np.float32, + # np.float64 + ]) +@pytest.mark.bitonic +def test_bitonic_sort(ctx_factory, size, dtype): + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + import pyopencl.clrandom as clrandom + from pyopencl.bitonic_sort import BitonicSort + + s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333) + sorter = BitonicSort(ctx) + sgs, evt = sorter(s.copy(), axis=1) + assert np.array_equal(np.sort(s.get(), axis=1), sgs.get()) + + +@pytest.mark.parametrize("size", [ + 0, + 4, + 2**14, + 2**18, + ]) +@pytest.mark.parametrize("dtype", [ + np.int32, + np.float32, + # np.float64 + ]) +@pytest.mark.bitonic +def test_bitonic_argsort(ctx_factory, size, dtype): + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + import pyopencl.clrandom as clrandom + from pyopencl.bitonic_sort import BitonicSort + + index = cl_array.arange(queue, 0, size, 1, dtype=np.int32) + m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234) + + sorterm = BitonicSort(ctx) + + ms, evt = sorterm(m.copy(), idx=index, axis=0) + + assert np.array_equal(np.sort(m.get()), ms.get()) + + # may be False because of identical values in array + # assert np.array_equal(np.argsort(m.get()), index.get()) + + # Check values by indices + assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()]) + +# }}} + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])