diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index fa3c3fad81ce929479ddafa5c80958055f799fdb..3ad9c53edc6c81bc7b6f0b804930f8cf60a73721 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -271,4 +271,9 @@ Building many variable-size lists
 
 .. autoclass:: ListOfListsBuilder
 
-    .. automethod:: __call__
+Bitonic Sort
+------------
+
+.. module:: pyopencl.bitonic_sort
+
+.. autoclass:: BitonicSort
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index e0533a2d2c3ed9f43bcf75a1d77d31d0a0815844..9f300c1dbf13a8f4af5f17735aaae52a92f54c25 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -395,6 +395,7 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL//
 
 
 # {{{ driver
+
 # import hoisted here to be used as a default argument in the constructor
 from pyopencl.scan import GenericScanKernel
 
@@ -403,6 +404,8 @@ class RadixSort(object):
     """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_
     on the compute device.
 
+    .. seealso:: :class:`pyopencl.algorithm.BitonicSort`
+
     .. versionadded:: 2013.1
     """
     def __init__(self, context, arguments, key_expr, sort_arg_names,
diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py
index f8431d6e17f75480adb1b248fd7d26d121443779..1408168fd21f605ed346ae6bd3d23d26c211ed0e 100644
--- a/pyopencl/bitonic_sort.py
+++ b/pyopencl/bitonic_sort.py
@@ -6,7 +6,7 @@ Copyright (c) 2015, Ilya Efimoff
 All rights reserved.
 """
 
-# based on code at
+# based on code at http://www.bealto.com/gpu-sorting_intro.html
 
 __license__ = """
 Redistribution and use in source and binary forms, with or without
@@ -37,13 +37,25 @@ OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pyopencl as cl
 from pyopencl.tools import dtype_to_ctype
-from mako.template import Template
 from operator import mul
 from functools import reduce
 from pytools import memoize_method
+from mako.template import Template
+
+
+def _is_power_of_2(n):
+    from pyopencl.tools import bitlog2
+    return n == 0 or 2**bitlog2(n) == n
 
 
 class BitonicSort(object):
+    """Sort an array (or one axis of one) using a sorting network.
+
+    Will only work if the axis of the array to be sorted has a length
+    that is a power of 2.
+
+    .. versionadded:: 2015.2
+    """
     def __init__(self, context, shape, key_dtype, idx_dtype=None, axis=0):
         import pyopencl.bitonic_sort_templates as tmpl
 
@@ -65,31 +77,60 @@ class BitonicSort(object):
         if idx_dtype is None:
             self.argsort = 0
             self.idx_t = 'uint'  # Dummy
+
         else:
             self.argsort = 1
             self.idx_t = dtype_to_ctype(idx_dtype)
+
         self.defstpl = Template(tmpl.defines)
-        self.rq = self.sort_b_prepare_wl(shape, self.axis)
+        self.run_queue = self.sort_b_prepare_wl(shape, self.axis)
+
+    def __call__(self, arr, idx=None, mkcpy=True, queue=None, wait_for=None):
+        if queue is None:
+            queue = arr.queue
 
-    def __call__(self, _arr, idx=None, mkcpy=True):
-        arr = _arr.copy() if mkcpy else _arr
-        rq = self.rq
-        p, nt, wg, aux = rq[0]
-        if self.argsort and not type(idx)==type(None):
+        if wait_for is None:
+            wait_for = []
+        wait_for = wait_for + arr.events
+
+        last_evt = cl.enqueue_marker(queue, wait_for=wait_for)
+
+        if arr.shape[self.axis] == 0:
+            return arr, last_evt
+
+        if not _is_power_of_2(arr.shape[self.axis]):
+            raise ValueError("sorted array axis length must be a power of 2")
+
+        arr = arr.copy() if mkcpy else arr
+
+        run_queue = self.run_queue
+        knl, nt, wg, aux = run_queue[0]
+
+        if self.argsort and idx is not None:
             if aux:
-                p.run(arr.queue, (nt,), wg, arr.data, idx.data, cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),\
-                                                                cl.LocalMemory(wg[0]*4*idx.dtype.itemsize))
-            for p, nt, wg,_ in rq[1:]:
-                p.run(arr.queue, (nt,), wg, arr.data, idx.data)
-        elif self.argsort==0:
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data, idx.data,
+                        cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),
+                        cl.LocalMemory(wg[0]*4*idx.dtype.itemsize),
+                        wait_for=[last_evt])
+            for knl, nt, wg, _ in run_queue[1:]:
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data, idx.data,
+                        wait_for=[last_evt])
+
+        elif not self.argsort:
             if aux:
-                p.run(arr.queue, (nt,), wg, arr.data, cl.LocalMemory(wg[0]*4*arr.dtype.itemsize))
-            for p, nt, wg,_ in rq[1:]:
-                p.run(arr.queue, (nt,), wg, arr.data)
+                last_evt = knl(
+                        queue, (nt,), wg, arr.data,
+                        cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),
+                        wait_for=[last_evt])
+            for knl, nt, wg, _ in run_queue[1:]:
+                last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt])
+
         else:
             raise ValueError("Array of indexes required for this sorter. If argsort is not needed,\
                               recreate sorter witout index datatype provided.")
-        return arr
+        return arr, last_evt
 
     @memoize_method
     def get_program(self, letter, params):
@@ -102,7 +143,9 @@ class BitonicSort(object):
                     dsize=params[4], nsize=params[5])
 
             self.cached_defs[params] = defs
+
         kid = Template(self.kernels_srcs[letter]).render(argsort=self.argsort)
+
         prg = cl.Program(self.context, defs + kid).build()
         return prg
 
@@ -122,7 +165,7 @@ class BitonicSort(object):
         wg = min(ds, self.context.devices[0].max_work_group_size)
         length = wg >> 1
         prg = self.get_program('BLO', (1, 1, self.dtype, self.idx_t, ds, ns))
-        run_queue.append((prg, size, (wg,), True))
+        run_queue.append((prg.run, size, (wg,), True))
 
         while length < ds:
             inc = length
@@ -146,7 +189,7 @@ class BitonicSort(object):
 
                 prg = self.get_program(letter,
                         (inc, direction, self.dtype, self.idx_t,  ds, ns))
-                run_queue.append((prg, nthreads, None, False,))
+                run_queue.append((prg.run, nthreads, None, False,))
                 inc >>= ninc
 
             length <<= 1
diff --git a/pyopencl/bitonic_sort_templates.py b/pyopencl/bitonic_sort_templates.py
index eaaa2a7a1e01ae97a022358b32d0d3936a786e1f..9b4f14e82a5943698cdbf5ff75e1b6522c09a8ed 100644
--- a/pyopencl/bitonic_sort_templates.py
+++ b/pyopencl/bitonic_sort_templates.py
@@ -3,18 +3,38 @@ Copyright (c) 2011, Eric Bainville
 Copyright (c) 2015, Ilya Efimoff
 All rights reserved.
 """
-__license__ = """
-Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+__license__ = """
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 
 
-defines = """
+# {{{ defines
+
+defines = """//CL//
 typedef ${dtype} data_t;
 typedef ${idxtype} idx_t;
 typedef ${idxtype}2 idx_t2;
@@ -67,9 +87,14 @@ typedef ${idxtype}2 idx_t2;
 #define dsize ${dsize}   //Dimension size
 """
 
-ParallelBitonic_B2 = """
+# }}}
+
+
+# {{{ B2
+
+ParallelBitonic_B2 = """//CL//
 // N/2 threads
-//ParallelBitonic_B2 
+//ParallelBitonic_B2
 __kernel void run(__global data_t * data\\
 % if argsort:
 , __global idx_t * index)
@@ -117,9 +142,14 @@ __kernel void run(__global data_t * data\\
 }
 """
 
-ParallelBitonic_B4 = """
+# }}}
+
+
+# {{{ B4
+
+ParallelBitonic_B4 = """//CL//
 // N/4 threads
-//ParallelBitonic_B4 
+//ParallelBitonic_B4
 __kernel void run(__global data_t * data\\
 % if argsort:
 , __global idx_t * index)
@@ -179,9 +209,14 @@ __kernel void run(__global data_t * data\\
 }
 """
 
-ParallelBitonic_B8 = """
+# }}}
+
+
+# {{{ B8
+
+ParallelBitonic_B8 = """//CL//
 // N/8 threads
-//ParallelBitonic_B8 
+//ParallelBitonic_B8
 __kernel void run(__global data_t * data\\
 % if argsort:
 , __global idx_t * index)
@@ -226,9 +261,14 @@ __kernel void run(__global data_t * data\\
 }
 """
 
-ParallelBitonic_B16 = """
+# }}}
+
+
+# {{{ B16
+
+ParallelBitonic_B16 = """//CL//
 // N/16 threads
-//ParallelBitonic_B16 
+//ParallelBitonic_B16
 __kernel void run(__global data_t * data\\
 % if argsort:
 , __global idx_t * index)
@@ -273,8 +313,13 @@ __kernel void run(__global data_t * data\\
 }
 """
 
-ParallelBitonic_C4 = """
-//ParallelBitonic_C4 
+# }}}
+
+
+# {{{ C4
+
+ParallelBitonic_C4 = """//CL//
+//ParallelBitonic_C4
 __kernel void run\\
 % if argsort:
 (__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
@@ -342,8 +387,12 @@ __kernel void run\\
 }
 """
 
+# }}}
+
+
+# {{{ local merge
 
-ParallelMerge_Local = """
+ParallelMerge_Local = """//CL//
 // N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
 __kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
 {
@@ -386,7 +435,12 @@ __kernel void run(__global const data_t * in,__global data_t * out,__local data_
 }
 """
 
-ParallelBitonic_Local = """
+# }}}
+
+
+# {{{
+
+ParallelBitonic_Local = """//CL//
 // N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
 __kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
 {
@@ -426,7 +480,12 @@ __kernel void run(__global const data_t * in,__global data_t * out,__local data_
 }
 """
 
-ParallelBitonic_A = """
+# }}}
+
+
+# {{{ A
+
+ParallelBitonic_A = """//CL//
 __kernel void ParallelBitonic_A(__global const data_t * in)
 {
   int i = get_global_id(0); // thread index
@@ -447,7 +506,12 @@ __kernel void ParallelBitonic_A(__global const data_t * in)
 }
 """
 
-ParallelBitonic_Local_Optim = """
+# }}}
+
+
+# {{{ local optim
+
+ParallelBitonic_Local_Optim = """//CL//
 __kernel void run\\
 % if argsort:
 (__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
@@ -464,7 +528,7 @@ __kernel void run\\
 
   // Move IN, OUT to block start
   //int offset = get_group_id(0) * wg;
-  data += offset; 
+  data += offset;
   // Load block in AUX[WG]
   data_t iData = data[t*nsize];
   aux[i] = iData;
@@ -513,4 +577,8 @@ __kernel void run\\
   index[t*nsize] = iidx;
 % endif
 }
-"""
\ No newline at end of file
+"""
+
+# }}}
+
+# vim: filetype=pyopencl:fdm=marker
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 3f135a40744e3c9b892f09035bce0a1cf037efe6..56c649e9d456b6be0cdd167d8e123839d515ed11 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -838,26 +838,53 @@ def test_key_value_sorter(ctx_factory):
 
 # {{{ bitonic sort
 
-def test_bitonic_sort(ctx_factory):
+@pytest.mark.parametrize("size", [
+    512,
+    4,
+    16
+    ])
+@pytest.mark.parametrize("dtype", [
+    np.int32,
+    np.float32,
+    # np.float64
+    ])
+def test_bitonic_sort(ctx_factory, size, dtype):
     ctx = cl.create_some_context()
     queue = cl.CommandQueue(ctx)
 
     import pyopencl.clrandom as clrandom
     from pyopencl.bitonic_sort import BitonicSort
 
-    s = clrandom.rand(queue, (4, 512, 5,), np.float32, luxury=None, a=0, b=1.0)
+    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=1.0)
     sorter = BitonicSort(ctx, s.shape, s.dtype, axis=1)
-    sgs = sorter(s)
+    sgs, evt = sorter(s)
     assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
 
-    size = 2**18
+
+@pytest.mark.parametrize("size", [
+    0,
+    4,
+    2**14,
+    2**18,
+    ])
+@pytest.mark.parametrize("dtype", [
+    np.int32,
+    np.float32,
+    # np.float64
+    ])
+def test_bitonic_argsort(ctx_factory, size, dtype):
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.clrandom as clrandom
+    from pyopencl.bitonic_sort import BitonicSort
 
     index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
     m = clrandom.rand(queue, (size,), np.float32, luxury=None, a=0, b=1.0)
 
     sorterm = BitonicSort(ctx, m.shape, m.dtype, idx_dtype=index.dtype, axis=0)
 
-    ms = sorterm(m, idx=index)
+    ms, evt = sorterm(m, idx=index)
 
     assert np.array_equal(np.sort(m.get()), ms.get())