diff --git a/pyopencl/bitonic_sort.py b/pyopencl/bitonic_sort.py index d4beaba2c7aee15c4cf82ba5fc2f07daed8bcebe..151174e52dcf6f74a937ff1f95691d992b40cf2e 100644 --- a/pyopencl/bitonic_sort.py +++ b/pyopencl/bitonic_sort.py @@ -121,8 +121,10 @@ class BitonicSort(object): if aux: last_evt = knl( queue, (nt,), wg, arr.data, idx.data, - cl.LocalMemory(wg[0]*arr.dtype.itemsize), - cl.LocalMemory(wg[0]*idx.dtype.itemsize), + cl.LocalMemory( + _tmpl.LOCAL_MEM_FACTOR*wg[0]*arr.dtype.itemsize), + cl.LocalMemory( + _tmpl.LOCAL_MEM_FACTOR*wg[0]*idx.dtype.itemsize), wait_for=[last_evt]) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl( @@ -133,7 +135,8 @@ class BitonicSort(object): if aux: last_evt = knl( queue, (nt,), wg, arr.data, - cl.LocalMemory(wg[0]*4*arr.dtype.itemsize), + cl.LocalMemory( + _tmpl.LOCAL_MEM_FACTOR*wg[0]*4*arr.dtype.itemsize), wait_for=[last_evt]) for knl, nt, wg, _ in run_queue[1:]: last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt]) @@ -184,9 +187,9 @@ class BitonicSort(object): available_lmem = dev.local_mem_size while True: - lmem_size = wg*key_dtype.itemsize + lmem_size = _tmpl.LOCAL_MEM_FACTOR*wg*key_dtype.itemsize if argsort: - lmem_size += wg*idx_dtype.itemsize + lmem_size += _tmpl.LOCAL_MEM_FACTOR*wg*idx_dtype.itemsize if lmem_size + 512 > available_lmem: wg //= 2 diff --git a/pyopencl/bitonic_sort_templates.py b/pyopencl/bitonic_sort_templates.py index 9b4f14e82a5943698cdbf5ff75e1b6522c09a8ed..4aafa439530e01d149b34a3af28d5cde36b0a48a 100644 --- a/pyopencl/bitonic_sort_templates.py +++ b/pyopencl/bitonic_sort_templates.py @@ -31,10 +31,17 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ +LOCAL_MEM_FACTOR = 1 + # {{{ defines defines = """//CL// + +% if dtype == "double": + #pragma OPENCL EXTENSION cl_khr_fp64: enable +% endif + typedef ${dtype} data_t; typedef ${idxtype} idx_t; typedef ${idxtype}2 idx_t2; @@ -318,6 +325,8 @@ __kernel void run(__global data_t * data\\ # {{{ C4 +# IF YOU REENABLE THIS, YOU NEED TO ADJUST LOCAL_MEM_FACTOR TO 4 + ParallelBitonic_C4 = """//CL// //ParallelBitonic_C4 __kernel void run\\ diff --git a/test/test_algorithm.py b/test/test_algorithm.py index c9811cd8105d5816525323af5ce1768b5803288f..8f44cc594bab73126af98849811b35d99eebd059 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -846,13 +846,17 @@ def test_key_value_sorter(ctx_factory): @pytest.mark.parametrize("dtype", [ np.int32, np.float32, - # np.float64 + np.float64 ]) @pytest.mark.bitonic def test_bitonic_sort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + if (ctx.devices[0].platform.name == "Portable Computing Language" + and dtype == np.float64): + pytest.xfail("Double precision bitonic sort doesn't work on POCL") + import pyopencl.clrandom as clrandom from pyopencl.bitonic_sort import BitonicSort @@ -871,13 +875,17 @@ def test_bitonic_sort(ctx_factory, size, dtype): @pytest.mark.parametrize("dtype", [ np.int32, np.float32, - # np.float64 + np.float64 ]) @pytest.mark.bitonic def test_bitonic_argsort(ctx_factory, size, dtype): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + if (ctx.devices[0].platform.name == "Portable Computing Language" + and dtype == np.float64): + pytest.xfail("Double precision bitonic sort doesn't work on POCL") + import pyopencl.clrandom as clrandom from pyopencl.bitonic_sort import BitonicSort