diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index fa3c3fad81ce929479ddafa5c80958055f799fdb..3ad9c53edc6c81bc7b6f0b804930f8cf60a73721 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -271,4 +271,9 @@ Building many variable-size lists
 
 .. autoclass:: ListOfListsBuilder
 
-    .. automethod:: __call__
+Bitonic Sort
+------------
+
+.. module:: pyopencl.bitonic_sort
+
+.. autoclass:: BitonicSort
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index e0533a2d2c3ed9f43bcf75a1d77d31d0a0815844..9f300c1dbf13a8f4af5f17735aaae52a92f54c25 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -395,6 +395,7 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL//
 
 
 # {{{ driver
+
 # import hoisted here to be used as a default argument in the constructor
 from pyopencl.scan import GenericScanKernel
 
@@ -403,6 +404,8 @@ class RadixSort(object):
     """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_
     on the compute device.
 
+    .. seealso:: :class:`pyopencl.algorithm.BitonicSort`
+
     .. versionadded:: 2013.1
     """
     def __init__(self, context, arguments, key_expr, sort_arg_names,
diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4beaba2c7aee15c4cf82ba5fc2f07daed8bcebe
--- /dev/null
+++ b/pyopencl/bitonic_sort.py
@@ -0,0 +1,236 @@
+from __future__ import division, with_statement, absolute_import, print_function
+
+__copyright__ = """
+Copyright (c) 2011, Eric Bainville
+Copyright (c) 2015, Ilya Efimoff
+All rights reserved.
+"""
+
+# based on code at http://www.bealto.com/gpu-sorting_intro.html
+
+__license__ = """
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import pyopencl as cl
+from pyopencl.tools import dtype_to_ctype
+from operator import mul
+from functools import reduce
+from pytools import memoize_method
+from mako.template import Template
+
+import pyopencl.bitonic_sort_templates as _tmpl
+
+
+def _is_power_of_2(n):
+    from pyopencl.tools import bitlog2
+    return n == 0 or 2**bitlog2(n) == n
+
+
+class BitonicSort(object):
+    """Sort an array (or one axis of one) using a sorting network.
+
+    Will only work if the axis of the array to be sorted has a length
+    that is a power of 2.
+
+    .. versionadded:: 2015.2
+
+    .. seealso:: :class:`pyopencl.algorithm.RadixSort`
+
+    .. autofunction:: __call__
+    """
+
+    kernels_srcs = {
+            'B2': _tmpl.ParallelBitonic_B2,
+            'B4': _tmpl.ParallelBitonic_B4,
+            'B8': _tmpl.ParallelBitonic_B8,
+            'B16': _tmpl.ParallelBitonic_B16,
+            'C4': _tmpl.ParallelBitonic_C4,
+            'BL': _tmpl.ParallelBitonic_Local,
+            'BLO': _tmpl.ParallelBitonic_Local_Optim,
+            'PML': _tmpl.ParallelMerge_Local
+            }
+
+    def __init__(self, context):
+        self.context = context
+
+    def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0):
+        """
+        :arg arr: the array to be sorted. Will be overwritten with the sorted array.
+        :arg idx: an array of indices to be tracked along with the sorting of *arr*
+        :arg queue: a :class:`pyopencl.CommandQueue`, defaults to the array's queue
+            if None
+        :arg wait_for: a list of :class:`pyopencl.Event` instances or None
+        :arg axis: the axis of the array by which to sort
+
+        :returns: a tuple (sorted_array, event)
+        """
+
+        if queue is None:
+            queue = arr.queue
+
+        if wait_for is None:
+            wait_for = []
+        wait_for = wait_for + arr.events
+
+        last_evt = cl.enqueue_marker(queue, wait_for=wait_for)
+
+        if arr.shape[axis] == 0:
+            return arr, last_evt
+
+        if not _is_power_of_2(arr.shape[axis]):
+            raise ValueError("sorted array axis length must be a power of 2")
+
+        if idx is None:
+            argsort = 0
+        else:
+            argsort = 1
+
+        run_queue = self.sort_b_prepare_wl(
+                argsort,
+                arr.dtype,
+                idx.dtype if idx is not None else None, arr.shape,
+                axis)
+
+        knl, nt, wg, aux = run_queue[0]
+
+        if idx is not None:
+            if aux:
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data, idx.data,
+                        cl.LocalMemory(wg[0]*arr.dtype.itemsize),
+                        cl.LocalMemory(wg[0]*idx.dtype.itemsize),
+                        wait_for=[last_evt])
+            for knl, nt, wg, _ in run_queue[1:]:
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data, idx.data,
+                        wait_for=[last_evt])
+
+        else:
+            if aux:
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data,
+                        cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),
+                        wait_for=[last_evt])
+            for knl, nt, wg, _ in run_queue[1:]:
+                last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt])
+
+        return arr, last_evt
+
+    @memoize_method
+    def get_program(self, letter, argsort, params):
+        defstpl = Template(_tmpl.defines)
+
+        defs = defstpl.render(
+                NS="\\", argsort=argsort, inc=params[0], dir=params[1],
+                dtype=params[2], idxtype=params[3],
+                dsize=params[4], nsize=params[5])
+
+        kid = Template(self.kernels_srcs[letter]).render(argsort=argsort)
+
+        prg = cl.Program(self.context, defs + kid).build()
+        return prg
+
+    @memoize_method
+    def sort_b_prepare_wl(self, argsort, key_dtype, idx_dtype, shape, axis):
+        key_ctype = dtype_to_ctype(key_dtype)
+
+        if idx_dtype is None:
+            idx_ctype = 'uint'  # Dummy
+
+        else:
+            idx_ctype = dtype_to_ctype(idx_dtype)
+
+        run_queue = []
+        ds = int(shape[axis])
+        size = reduce(mul, shape)
+        ndim = len(shape)
+
+        ns = reduce(mul, shape[(axis+1):]) if axis < ndim-1 else 1
+
+        ds = int(shape[axis])
+        allowb4 = True
+        allowb8 = True
+        allowb16 = True
+
+        dev = self.context.devices[0]
+
+        # {{{ find workgroup size
+
+        wg = min(ds, dev.max_work_group_size)
+
+        available_lmem = dev.local_mem_size
+        while True:
+            lmem_size = wg*key_dtype.itemsize
+            if argsort:
+                lmem_size += wg*idx_dtype.itemsize
+
+            if lmem_size + 512 > available_lmem:
+                wg //= 2
+
+                if not wg:
+                    raise RuntimeError(
+                        "too little local memory available on '%s'"
+                        % dev)
+
+            else:
+                break
+
+        # }}}
+
+        length = wg >> 1
+        prg = self.get_program(
+                'BLO', argsort, (1, 1, key_ctype, idx_ctype, ds, ns))
+        run_queue.append((prg.run, size, (wg,), True))
+
+        while length < ds:
+            inc = length
+            while inc > 0:
+                ninc = 0
+                direction = length << 1
+                if allowb16 and inc >= 8 and ninc == 0:
+                    letter = 'B16'
+                    ninc = 4
+                elif allowb8 and inc >= 4 and ninc == 0:
+                    letter = 'B8'
+                    ninc = 3
+                elif allowb4 and inc >= 2 and ninc == 0:
+                    letter = 'B4'
+                    ninc = 2
+                elif inc >= 0:
+                    letter = 'B2'
+                    ninc = 1
+
+                nthreads = size >> ninc
+
+                prg = self.get_program(letter, argsort,
+                        (inc, direction, key_ctype, idx_ctype,  ds, ns))
+                run_queue.append((prg.run, nthreads, None, False,))
+                inc >>= ninc
+
+            length <<= 1
+
+        return run_queue
diff --git a/pyopencl/bitonic_sort_templates.py b/pyopencl/bitonic_sort_templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b4f14e82a5943698cdbf5ff75e1b6522c09a8ed
--- /dev/null
+++ b/pyopencl/bitonic_sort_templates.py
@@ -0,0 +1,584 @@
+__copyright__ = """
+Copyright (c) 2011, Eric Bainville
+Copyright (c) 2015, Ilya Efimoff
+All rights reserved.
+"""
+
+__license__ = """
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+
+# {{{ defines
+
+defines = """//CL//
+typedef ${dtype} data_t;
+typedef ${idxtype} idx_t;
+typedef ${idxtype}2 idx_t2;
+#if CONFIG_USE_VALUE
+#define getKey(a) ((a).x)
+#define getValue(a) ((a).y)
+#define makeData(k,v) ((${dtype}2)((k),(v)))
+#else
+#define getKey(a) (a)
+#define getValue(a) (0)
+#define makeData(k,v) (k)
+#endif
+
+#ifndef BLOCK_FACTOR
+#define BLOCK_FACTOR 1
+#endif
+
+#define inc  ${inc}
+#define hinc ${inc>>1} //Half inc
+#define qinc ${inc>>2} //Quarter inc
+#define einc ${inc>>3} //Eighth of inc
+#define dir  ${dir}
+
+% if argsort:
+#define ORDER(a,b,ay,by) { bool swap = reverse ^ (getKey(a)<getKey(b));${NS}
+                         data_t auxa = a; data_t auxb = b;${NS}
+                         idx_t auya = ay; idx_t auyb = by;${NS}
+                         a = (swap)?auxb:auxa; b = (swap)?auxa:auxb;${NS}
+                         ay = (swap)?auyb:auya; by = (swap)?auya:auyb;}
+#define ORDERV(x,y,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
+                        data_t auxa = x[a]; data_t auxb = x[b];${NS}
+                        idx_t auya = y[a]; idx_t auyb = y[b];${NS}
+                        x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb;${NS}
+                        y[a] = (swap)?auyb:auya; y[b] = (swap)?auya:auyb;}
+#define B2V(x,y,a)  { ORDERV(x,y,a,a+1) }
+#define B4V(x,y,a)  { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) }
+#define B8V(x,y,a)  { for (int i8=0;i8<4;i8++) { ORDERV(x,y,a+i8,a+i8+4) } B4V(x,y,a) B4V(x,y,a+4) }
+#define B16V(x,y,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,y,a+i16,a+i16+8) } B8V(x,y,a) B8V(x,y,a+8) }
+% else:
+#define ORDER(a,b) { bool swap = reverse ^ (getKey(a)<getKey(b)); data_t auxa = a; data_t auxb = b; a = (swap)?auxb:auxa; b = (swap)?auxa:auxb; }
+#define ORDERV(x,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b]));${NS}
+      data_t auxa = x[a]; data_t auxb = x[b];${NS}
+      x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb; }
+#define B2V(x,a) { ORDERV(x,a,a+1) }
+#define B4V(x,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,a+i4,a+i4+2) } B2V(x,a) B2V(x,a+2) }
+#define B8V(x,a) { for (int i8=0;i8<4;i8++) { ORDERV(x,a+i8,a+i8+4) } B4V(x,a) B4V(x,a+4) }
+#define B16V(x,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,a+i16,a+i16+8) } B8V(x,a) B8V(x,a+8) }
+% endif
+#define nsize ${nsize}   //Total next dimensions sizes sum. (Block size)
+#define dsize ${dsize}   //Dimension size
+"""
+
+# }}}
+
+
+# {{{ B2
+
+ParallelBitonic_B2 = """//CL//
+// N/2 threads
+//ParallelBitonic_B2
+__kernel void run(__global data_t * data\\
+% if argsort:
+, __global idx_t * index)
+% else:
+)
+% endif
+{
+  int t  = get_global_id(0) % (dsize>>1); // thread index
+  int gt = get_global_id(0) / (dsize>>1);
+  int low = t & (inc - 1); // low order bits (below INC)
+  int i = (t<<1) - low; // insert 0 at position INC
+  int gi = i/dsize; // block index
+  bool reverse = ((dir & i) == 0);// ^ (gi%2); // asc/desc order
+
+  int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
+  data  += i*nsize + offset; // translate to first value
+% if argsort:
+  index += i*nsize + offset; // translate to first value
+% endif
+
+  // Load data
+  data_t x0 = data[  0];
+  data_t x1 = data[inc*nsize];
+% if argsort:
+  // Load index
+  idx_t i0 = index[  0];
+  idx_t i1 = index[inc*nsize];
+% endif
+
+  // Sort
+% if argsort:
+  ORDER(x0,x1,i0,i1)
+% else:
+  ORDER(x0,x1)
+% endif
+
+  // Store data
+  data[0  ] = x0;
+  data[inc*nsize] = x1;
+% if argsort:
+  // Store index
+  index[  0] = i0;
+  index[inc*nsize] = i1;
+% endif
+}
+"""
+
+# }}}
+
+
+# {{{ B4
+
+ParallelBitonic_B4 = """//CL//
+// N/4 threads
+//ParallelBitonic_B4
+__kernel void run(__global data_t * data\\
+% if argsort:
+, __global idx_t * index)
+% else:
+)
+% endif
+{
+  int t  = get_global_id(0) % (dsize>>2); // thread index
+  int gt = get_global_id(0) / (dsize>>2);
+  int low = t & (hinc - 1); // low order bits (below INC)
+  int i = ((t - low) << 2) + low; // insert 00 at position INC
+  bool reverse = ((dir & i) == 0); // asc/desc order
+  int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
+  data  += i*nsize + offset; // translate to first value
+% if argsort:
+  index += i*nsize + offset; // translate to first value
+% endif
+
+  // Load data
+  data_t x0 = data[     0];
+  data_t x1 = data[  hinc*nsize];
+  data_t x2 = data[2*hinc*nsize];
+  data_t x3 = data[3*hinc*nsize];
+% if argsort:
+  // Load index
+  idx_t i0 = index[     0];
+  idx_t i1 = index[  hinc*nsize];
+  idx_t i2 = index[2*hinc*nsize];
+  idx_t i3 = index[3*hinc*nsize];
+% endif
+
+  // Sort
+% if argsort:
+  ORDER(x0,x2,i0,i2)
+  ORDER(x1,x3,i1,i3)
+  ORDER(x0,x1,i0,i1)
+  ORDER(x2,x3,i2,i3)
+% else:
+  ORDER(x0,x2)
+  ORDER(x1,x3)
+  ORDER(x0,x1)
+  ORDER(x2,x3)
+% endif
+
+  // Store data
+  data[     0] = x0;
+  data[  hinc*nsize] = x1;
+  data[2*hinc*nsize] = x2;
+  data[3*hinc*nsize] = x3;
+% if argsort:
+  // Store index
+  index[     0] = i0;
+  index[  hinc*nsize] = i1;
+  index[2*hinc*nsize] = i2;
+  index[3*hinc*nsize] = i3;
+% endif
+}
+"""
+
+# }}}
+
+
+# {{{ B8
+
+ParallelBitonic_B8 = """//CL//
+// N/8 threads
+//ParallelBitonic_B8
+__kernel void run(__global data_t * data\\
+% if argsort:
+, __global idx_t * index)
+% else:
+)
+% endif
+{
+  int t  = get_global_id(0) % (dsize>>3); // thread index
+  int gt = get_global_id(0) / (dsize>>3);
+  int low = t & (qinc - 1); // low order bits (below INC)
+  int i = ((t - low) << 3) + low; // insert 000 at position INC
+  bool reverse = ((dir & i) == 0); // asc/desc order
+  int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
+
+  data  += i*nsize + offset; // translate to first value
+% if argsort:
+  index += i*nsize + offset; // translate to first value
+% endif
+
+  // Load
+  data_t x[8];
+% if argsort:
+  idx_t y[8];
+% endif
+  for (int k=0;k<8;k++) x[k] = data[k*qinc*nsize];
+% if argsort:
+  for (int k=0;k<8;k++) y[k] = index[k*qinc*nsize];
+% endif
+
+  // Sort
+% if argsort:
+  B8V(x,y,0)
+% else:
+  B8V(x,0)
+% endif
+
+  // Store
+  for (int k=0;k<8;k++) data[k*qinc*nsize] = x[k];
+% if argsort:
+  for (int k=0;k<8;k++) index[k*qinc*nsize] = y[k];
+% endif
+}
+"""
+
+# }}}
+
+
+# {{{ B16
+
+ParallelBitonic_B16 = """//CL//
+// N/16 threads
+//ParallelBitonic_B16
+__kernel void run(__global data_t * data\\
+% if argsort:
+, __global idx_t * index)
+% else:
+)
+% endif
+{
+  int t  = get_global_id(0) % (dsize>>4); // thread index
+  int gt = get_global_id(0) / (dsize>>4);
+  int low = t & (einc - 1); // low order bits (below INC)
+  int i = ((t - low) << 4) + low; // insert 0000 at position INC
+  bool reverse = ((dir & i) == 0); // asc/desc order
+  int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
+
+  data  += i*nsize + offset; // translate to first value
+% if argsort:
+  index += i*nsize + offset; // translate to first value
+% endif
+
+  // Load
+  data_t x[16];
+% if argsort:
+  idx_t y[16];
+% endif
+  for (int k=0;k<16;k++) x[k] = data[k*einc*nsize];
+% if argsort:
+  for (int k=0;k<16;k++) y[k] = index[k*einc*nsize];
+% endif
+
+  // Sort
+% if argsort:
+  B16V(x,y,0)
+% else:
+  B16V(x,0)
+% endif
+
+  // Store
+  for (int k=0;k<16;k++) data[k*einc*nsize] = x[k];
+% if argsort:
+  for (int k=0;k<16;k++) index[k*einc*nsize] = y[k];
+% endif
+}
+"""
+
+# }}}
+
+
+# {{{ C4
+
+ParallelBitonic_C4 = """//CL//
+//ParallelBitonic_C4
+__kernel void run\\
+% if argsort:
+(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
+% else:
+(__global data_t * data, __local data_t * aux)
+% endif
+{
+  int t = get_global_id(0); // thread index
+  int wgBits = 4*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 4*WG)
+  int linc,low,i;
+  bool reverse;
+  data_t x[4];
+% if argsort:
+  idx_t y[4];
+% endif
+
+  // First iteration, global input, local output
+  linc = hinc;
+  low = t & (linc - 1); // low order bits (below INC)
+  i = ((t - low) << 2) + low; // insert 00 at position INC
+  reverse = ((dir & i) == 0); // asc/desc order
+  for (int k=0;k<4;k++) x[k] = data[i+k*linc];
+% if argsort:
+  for (int k=0;k<4;k++) y[k] = index[i+k*linc];
+  B4V(x,y,0);
+  for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
+% else:
+  B4V(x,0);
+% endif
+  for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Internal iterations, local input and output
+  for ( ;linc>1;linc>>=2)
+  {
+    low = t & (linc - 1); // low order bits (below INC)
+    i = ((t - low) << 2) + low; // insert 00 at position INC
+    reverse = ((dir & i) == 0); // asc/desc order
+    for (int k=0;k<4;k++) x[k] = aux[(i+k*linc) & wgBits];
+% if argsort:
+    for (int k=0;k<4;k++) y[k] = auy[(i+k*linc) & wgBits];
+    B4V(x,y,0);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int k=0;k<4;k++) auy[(i+k*linc) & wgBits] = y[k];
+% else:
+    B4V(x,0);
+    barrier(CLK_LOCAL_MEM_FENCE);
+% endif
+    for (int k=0;k<4;k++) aux[(i+k*linc) & wgBits] = x[k];
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  // Final iteration, local input, global output, INC=1
+  i = t << 2;
+  reverse = ((dir & i) == 0); // asc/desc order
+  for (int k=0;k<4;k++) x[k] = aux[(i+k) & wgBits];
+% if argsort:
+  for (int k=0;k<4;k++) y[k] = auy[(i+k) & wgBits];
+  B4V(x,y,0);
+  for (int k=0;k<4;k++) index[i+k] = y[k];
+% else:
+  B4V(x,0);
+% endif
+  for (int k=0;k<4;k++) data[i+k] = x[k];
+}
+"""
+
+# }}}
+
+
+# {{{ local merge
+
+ParallelMerge_Local = """//CL//
+// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
+__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
+{
+  int i = get_local_id(0); // index in workgroup
+  int wg = get_local_size(0); // workgroup size = block size, power of 2
+
+  // Move IN, OUT to block start
+  int offset = get_group_id(0) * wg;
+  in += offset; out += offset;
+
+  // Load block in AUX[WG]
+  aux[i] = in[i];
+  barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date
+
+  // Now we will merge sub-sequences of length 1,2,...,WG/2
+  for (int length=1;length<wg;length<<=1)
+  {
+    data_t iData = aux[i];
+    data_t iKey = getKey(iData);
+    int ii = i & (length-1);  // index in our sequence in 0..length-1
+    int sibling = (i - ii) ^ length; // beginning of the sibling sequence
+    int pos = 0;
+    for (int pinc=length;pinc>0;pinc>>=1) // increment for dichotomic search
+    {
+      int j = sibling+pos+pinc-1;
+      data_t jKey = getKey(aux[j]);
+      bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
+      pos += (smaller)?pinc:0;
+      pos = min(pos,length);
+    }
+    int bits = 2*length-1; // mask for destination
+    int dest = ((ii + pos) & bits) | (i & ~bits); // destination index in merged sequence
+    barrier(CLK_LOCAL_MEM_FENCE);
+    aux[dest] = iData;
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  // Write output
+  out[i] = aux[i];
+}
+"""
+
+# }}}
+
+
+# {{{
+
+ParallelBitonic_Local = """//CL//
+// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
+__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
+{
+  int i = get_local_id(0); // index in workgroup
+  int wg = get_local_size(0); // workgroup size = block size, power of 2
+
+  // Move IN, OUT to block start
+  int offset = get_group_id(0) * wg;
+  in += offset; out += offset;
+
+  // Load block in AUX[WG]
+  aux[i] = in[i];
+  barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date
+
+  // Loop on sorted sequence length
+  for (int length=1;length<wg;length<<=1)
+  {
+    bool direction = ((i & (length<<1)) != 0); // direction of sort: 0=asc, 1=desc
+    // Loop on comparison distance (between keys)
+    for (int pinc=length;pinc>0;pinc>>=1)
+    {
+      int j = i + pinc; // sibling to compare
+      data_t iData = aux[i];
+      uint iKey = getKey(iData);
+      data_t jData = aux[j];
+      uint jKey = getKey(jData);
+      bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
+      bool swap = smaller ^ (j < i) ^ direction;
+      barrier(CLK_LOCAL_MEM_FENCE);
+      aux[i] = (swap)?jData:iData;
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+
+  // Write output
+  out[i] = aux[i];
+}
+"""
+
+# }}}
+
+
+# {{{ A
+
+ParallelBitonic_A = """//CL//
+__kernel void ParallelBitonic_A(__global const data_t * in)
+{
+  int i = get_global_id(0); // thread index
+  int j = i ^ inc; // sibling to compare
+
+  // Load values at I and J
+  data_t iData = in[i];
+  uint iKey = getKey(iData);
+  data_t jData = in[j];
+  uint jKey = getKey(jData);
+
+  // Compare
+  bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
+  bool swap = smaller ^ (j < i) ^ ((dir & i) != 0);
+
+  // Store
+  in[i] = (swap)?jData:iData;
+}
+"""
+
+# }}}
+
+
+# {{{ local optim
+
+ParallelBitonic_Local_Optim = """//CL//
+__kernel void run\\
+% if argsort:
+(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
+% else:
+(__global data_t * data, __local data_t * aux)
+% endif
+{
+  int t  = get_global_id(0) % dsize; // thread index
+  int gt = get_global_id(0) / dsize;
+  int offset = (gt/nsize)*nsize*dsize+(gt%nsize);
+
+  int i = get_local_id(0); // index in workgroup
+  int wg = get_local_size(0); // workgroup size = block size, power of 2
+
+  // Move IN, OUT to block start
+  //int offset = get_group_id(0) * wg;
+  data += offset;
+  // Load block in AUX[WG]
+  data_t iData = data[t*nsize];
+  aux[i] = iData;
+% if argsort:
+  index += offset;
+  // Load block in AUY[WG]
+  idx_t iidx = index[t*nsize];
+  auy[i] = iidx;
+% endif
+  barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date
+
+  // Loop on sorted sequence length
+  for (int pwg=1;pwg<=wg;pwg<<=1){
+      int loffset = pwg*(i/pwg);
+      int ii = i%pwg;
+      for (int length=1;length<pwg;length<<=1){
+        bool direction = ii & (length<<1); // direction of sort: 0=asc, 1=desc
+        // Loop on comparison distance (between keys)
+        for (int pinc=length;pinc>0;pinc>>=1){
+          int j = ii ^ pinc; // sibling to compare
+          data_t jData = aux[loffset+j];
+% if argsort:
+          idx_t jidx = auy[loffset+j];
+% endif
+          data_t iKey = getKey(iData);
+          data_t jKey = getKey(jData);
+          bool smaller = (jKey < iKey) || ( jKey == iKey && j < ii );
+          bool swap = smaller ^ (ii>j) ^ direction;
+          iData = (swap)?jData:iData; // update iData
+% if argsort:
+          iidx = (swap)?jidx:iidx; // update iidx
+% endif
+          barrier(CLK_LOCAL_MEM_FENCE);
+          aux[loffset+ii] = iData;
+% if argsort:
+          auy[loffset+ii] = iidx;
+% endif
+          barrier(CLK_LOCAL_MEM_FENCE);
+        }
+      }
+  }
+
+  // Write output
+  data[t*nsize] = iData;
+% if argsort:
+  index[t*nsize] = iidx;
+% endif
+}
+"""
+
+# }}}
+
+# vim: filetype=pyopencl:fdm=marker
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index b55c850e6681bd738b45378817a28e8c90ac1eb4..c9811cd8105d5816525323af5ce1768b5803288f 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -1,10 +1,6 @@
 #! /usr/bin/env python
 
-from __future__ import division, with_statement
-from __future__ import absolute_import
-from __future__ import print_function
-from six.moves import range
-from six.moves import zip
+from __future__ import division, with_statement, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
@@ -28,6 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from six.moves import range, zip
 import numpy as np
 import numpy.linalg as la
 import sys
@@ -839,6 +836,69 @@ def test_key_value_sorter(ctx_factory):
 # }}}
 
 
+# {{{ bitonic sort
+
+@pytest.mark.parametrize("size", [
+    512,
+    4,
+    16
+    ])
+@pytest.mark.parametrize("dtype", [
+    np.int32,
+    np.float32,
+    # np.float64
+    ])
+@pytest.mark.bitonic
+def test_bitonic_sort(ctx_factory, size, dtype):
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.clrandom as clrandom
+    from pyopencl.bitonic_sort import BitonicSort
+
+    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
+    sorter = BitonicSort(ctx)
+    sgs, evt = sorter(s.copy(), axis=1)
+    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
+
+
+@pytest.mark.parametrize("size", [
+    0,
+    4,
+    2**14,
+    2**18,
+    ])
+@pytest.mark.parametrize("dtype", [
+    np.int32,
+    np.float32,
+    # np.float64
+    ])
+@pytest.mark.bitonic
+def test_bitonic_argsort(ctx_factory, size, dtype):
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.clrandom as clrandom
+    from pyopencl.bitonic_sort import BitonicSort
+
+    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
+    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)
+
+    sorterm = BitonicSort(ctx)
+
+    ms, evt = sorterm(m.copy(), idx=index, axis=0)
+
+    assert np.array_equal(np.sort(m.get()), ms.get())
+
+    # may be False because of identical values in array
+    # assert np.array_equal(np.argsort(m.get()), index.get())
+
+    # Check values by indices
+    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
+
+# }}}
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])