diff --git a/test/test_array.py b/test/test_array.py index a1cc402507974ebe13ba715541087da2fee36add..b4d2aa63900b213fc5cb06343e53c682eaf510b6 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -22,64 +22,26 @@ if have_cl(): from pyopencl.characterize import has_double_support -@pytools.test.mark_test.opencl -def test_pow_array(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) - - a = np.array([1, 2, 3, 4, 5]).astype(np.float32) - a_gpu = cl_array.to_device(queue, a) - - result = pow(a_gpu, a_gpu).get() - assert (np.abs(a ** a - result) < 1e-3).all() - - result = (a_gpu ** a_gpu).get() - assert (np.abs(pow(a, a) - result) < 1e-3).all() - - -@pytools.test.mark_test.opencl -def test_pow_number(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) - - a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - a_gpu = cl_array.to_device(queue, a) - - result = pow(a_gpu, 2).get() - assert (np.abs(a ** 2 - result) < 1e-3).all() - - -@pytools.test.mark_test.opencl -def test_absrealimag(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) - def real(x): return x.real - def imag(x): return x.imag - def conj(x): return x.conj() - - n = 111 - for func in [abs, real, imag, conj]: - for dtype in [np.int32, np.float32, np.complex64]: - print(func, dtype) - a = -make_random_array(queue, dtype, n) - - host_res = func(a.get()) - dev_res = func(a).get() - - correct = np.allclose(dev_res, host_res) - if not correct: - print(dev_res) - print(host_res) - print(dev_res-host_res) - assert correct +# {{{ helpers TO_REAL = { np.dtype(np.complex64): np.float32, np.dtype(np.complex128): np.float64 } +def general_clrand(queue, shape, dtype): + from pyopencl.clrandom import rand as clrand + + dtype = np.dtype(dtype) + if dtype.kind == "c": + real_dtype = dtype.type(0).real.dtype + return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype) + else: + return clrand(queue, shape, dtype) + + def make_random_array(queue, dtype, size): from pyopencl.clrandom import rand @@ -92,6 +54,10 @@ def make_random_array(queue, dtype, size): else: return rand(queue, shape=(size,), dtype=dtype) +# }}} + +# {{{ dtype-related + @pytools.test.mark_test.opencl def test_basic_complex(ctx_factory): context = ctx_factory() @@ -179,13 +145,71 @@ def test_mix_complex(ctx_factory): assert correct @pytools.test.mark_test.opencl -def test_len(ctx_factory): +def test_vector_fill(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a_gpu = cl_array.Array(queue, 100, dtype=cl_array.vec.float4) + a_gpu.fill(cl_array.vec.make_float4(0.0, 0.0, 1.0, 0.0)) + a = a_gpu.get() + assert a.dtype is cl_array.vec.float4 + + a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4) + +@pytools.test.mark_test.opencl +def test_absrealimag(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + def real(x): return x.real + def imag(x): return x.imag + def conj(x): return x.conj() + + n = 111 + for func in [abs, real, imag, conj]: + for dtype in [np.int32, np.float32, np.complex64]: + print(func, dtype) + a = -make_random_array(queue, dtype, n) + + host_res = func(a.get()) + dev_res = func(a).get() + + correct = np.allclose(dev_res, host_res) + if not correct: + print(dev_res) + print(host_res) + print(dev_res-host_res) + assert correct + +# }}} + +# {{{ operands + +@pytools.test.mark_test.opencl +def test_pow_array(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a = np.array([1, 2, 3, 4, 5]).astype(np.float32) + a_gpu = cl_array.to_device(queue, a) + + result = pow(a_gpu, a_gpu).get() + assert (np.abs(a ** a - result) < 1e-3).all() + + result = (a_gpu ** a_gpu).get() + assert (np.abs(pow(a, a) - result) < 1e-3).all() + + +@pytools.test.mark_test.opencl +def test_pow_number(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - a_cpu = cl_array.to_device(queue, a) - assert len(a_cpu) == 10 + a_gpu = cl_array.to_device(queue, a) + + result = pow(a_gpu, 2).get() + assert (np.abs(a ** 2 - result) < 1e-3).all() @pytools.test.mark_test.opencl @@ -331,6 +355,9 @@ def test_divide_array(ctx_factory): a_divide = (b_gpu / a_gpu).get() assert (np.abs(b / a - a_divide) < 1e-3).all() +# }}} + +# {{{ RNG @pytools.test.mark_test.opencl def test_random(ctx_factory): @@ -371,35 +398,9 @@ def test_random(ctx_factory): #pt.hist(ran.get()) #pt.show() +# }}} - - - -@pytools.test.mark_test.opencl -def test_nan_arithmetic(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) - - def make_nan_contaminated_vector(size): - shape = (size,) - a = np.random.randn(*shape).astype(np.float32) - from random import randrange - for i in range(size // 10): - a[randrange(0, size)] = float('nan') - return a - - size = 1 << 20 - - a = make_nan_contaminated_vector(size) - a_gpu = cl_array.to_device(queue, a) - b = make_nan_contaminated_vector(size) - b_gpu = cl_array.to_device(queue, b) - - ab = a * b - ab_gpu = (a_gpu * b_gpu).get() - - assert (np.isnan(ab) == np.isnan(ab_gpu)).all() - +# {{{ elementwise @pytools.test.mark_test.opencl def test_elwise_kernel(ctx_factory): @@ -489,20 +490,80 @@ def test_reverse(ctx_factory): assert (a[::-1] == a_gpu.get()).all() +@pytools.test.mark_test.opencl +def test_if_positive(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) -def general_clrand(queue, shape, dtype): from pyopencl.clrandom import rand as clrand - dtype = np.dtype(dtype) - if dtype.kind == "c": - real_dtype = dtype.type(0).real.dtype - return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype) - else: - return clrand(queue, shape, dtype) + l = 20000 + a_gpu = clrand(queue, (l,), np.float32) + b_gpu = clrand(queue, (l,), np.float32) + a = a_gpu.get() + b = b_gpu.get() + + max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu) + min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu) + print(max_a_b_gpu) + print(np.maximum(a, b)) + assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 + assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 +@pytools.test.mark_test.opencl +def test_take_put(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + for n in [5, 17, 333]: + one_field_size = 8 + buf_gpu = cl_array.zeros(queue, + n * one_field_size, dtype=np.float32) + dest_indices = cl_array.to_device(queue, + np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) + read_map = cl_array.to_device(queue, + np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) + + cl_array.multi_take_put( + arrays=[buf_gpu for i in range(n)], + dest_indices=dest_indices, + src_indices=read_map, + src_offsets=[i * one_field_size for i in range(n)], + dest_shape=(96,)) + + +@pytools.test.mark_test.opencl +def test_astype(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + from pyopencl.clrandom import rand as clrand + + if not has_double_support(context.devices[0]): + return + + a_gpu = clrand(queue, (2000,), dtype=np.float32) + + a = a_gpu.get().astype(np.float64) + a2 = a_gpu.astype(np.float64).get() + + assert a2.dtype == np.float64 + assert la.norm(a - a2) == 0, (a, a2) + + a_gpu = clrand(queue, (2000,), dtype=np.float64) + + a = a_gpu.get().astype(np.float32) + a2 = a_gpu.astype(np.float32).get() + + assert a2.dtype == np.float32 + assert la.norm(a - a2) / la.norm(a) < 1e-7 +# }}} + +# {{{ reduction + @pytools.test.mark_test.opencl def test_sum(ctx_factory): context = ctx_factory() @@ -600,98 +661,82 @@ def test_dot(ctx_factory): assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 +mmc_dtype = np.dtype([ + ("cur_min", np.int32), + ("cur_max", np.int32), + ("pad", np.int32), + ]) -if False: - @pytools.test.mark_test.opencl - def test_slice(ctx_factory): - from pyopencl.clrandom import rand as clrand - - l = 20000 - a_gpu = clrand(queue, (l,)) - a = a_gpu.get() - - from random import randrange - for i in range(200): - start = randrange(l) - end = randrange(start, l) - - a_gpu_slice = a_gpu[start:end] - a_slice = a[start:end] - - assert la.norm(a_gpu_slice.get() - a_slice) == 0 - +from pyopencl.tools import register_dtype +register_dtype(mmc_dtype, "minmax_collector", alias_ok=True) +register_dtype(mmc_dtype, "minmax_collector", alias_ok=True) @pytools.test.mark_test.opencl -def test_if_positive(ctx_factory): +def test_struct_reduce(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - from pyopencl.clrandom import rand as clrand - - l = 20000 - a_gpu = clrand(queue, (l,), np.float32) - b_gpu = clrand(queue, (l,), np.float32) - a = a_gpu.get() - b = b_gpu.get() - - max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu) - min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu) - - print(max_a_b_gpu) - print(np.maximum(a, b)) - - assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 - assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 + preamble = r"""//CL// + struct minmax_collector + { + int cur_min; + int cur_max; + // Workaround for OS X Lion GPU CL. Mystifying. + int pad; + }; + typedef struct minmax_collector minmax_collector; -@pytools.test.mark_test.opencl -def test_take_put(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) + minmax_collector mmc_neutral() + { + // FIXME: needs infinity literal in real use, ok here + minmax_collector result; + result.cur_min = 1<<30; + result.cur_max = -(1<<30); + return result; + } - for n in [5, 17, 333]: - one_field_size = 8 - buf_gpu = cl_array.zeros(queue, - n * one_field_size, dtype=np.float32) - dest_indices = cl_array.to_device(queue, - np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) - read_map = cl_array.to_device(queue, - np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) + minmax_collector mmc_from_scalar(float x) + { + minmax_collector result; + result.cur_min = x; + result.cur_max = x; + return result; + } - cl_array.multi_take_put( - arrays=[buf_gpu for i in range(n)], - dest_indices=dest_indices, - src_indices=read_map, - src_offsets=[i * one_field_size for i in range(n)], - dest_shape=(96,)) + minmax_collector agg_mmc(minmax_collector a, minmax_collector b) + { + minmax_collector result = a; + if (b.cur_min < result.cur_min) + result.cur_min = b.cur_min; + if (b.cur_max > result.cur_max) + result.cur_max = b.cur_max; + return result; + } + """ -@pytools.test.mark_test.opencl -def test_astype(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand + a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) + a = a_gpu.get() - if not has_double_support(context.devices[0]): - return - - a_gpu = clrand(queue, (2000,), dtype=np.float32) - - a = a_gpu.get().astype(np.float64) - a2 = a_gpu.astype(np.float64).get() - - assert a2.dtype == np.float64 - assert la.norm(a - a2) == 0, (a, a2) + from pyopencl.reduction import ReductionKernel + red = ReductionKernel(context, mmc_dtype, + neutral="mmc_neutral()", + reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", + arguments="__global int *x", preamble=preamble) - a_gpu = clrand(queue, (2000,), dtype=np.float64) + minmax = red(a_gpu).get() + #print minmax["cur_min"], minmax["cur_max"] + #print np.min(a), np.max(a) - a = a_gpu.get().astype(np.float32) - a2 = a_gpu.astype(np.float32).get() + assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 + assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 - assert a2.dtype == np.float32 - assert la.norm(a - a2) / la.norm(a) < 1e-7 +# }}} +# {{{ scan-related def summarize_error(obtained, desired, orig, thresh=1e-5): err = obtained - desired @@ -822,6 +867,39 @@ def test_unique(ctx_factory): assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all() +@pytools.test.mark_test.opencl +def test_segmented_scan(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + from random import randrange + from pyopencl.clrandom import rand as clrand + for n in scan_test_counts: + a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) + a = a_dev.get() + + seg_boundary_count = min(100, randrange(0, int(0.4*n))) + seg_boundaries = np.fromiter(sorted(randrange(n) for i in xrange(seg_boundary_count)), + dtype=np.intp) + print seg_boundaries + + seg_boundary_flags = np.zeros(n, dtype=np.uint8) + seg_boundary_flags[seg_boundaries] = 1 + seg_boundary_flags_dev = cl_array.to_device(queue, seg_boundary_flags) + +# }}} + +# {{{ misc + +@pytools.test.mark_test.opencl +def test_len(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + a_cpu = cl_array.to_device(queue, a) + assert len(a_cpu) == 10 + @pytools.test.mark_test.opencl def test_stride_preservation(ctx_factory): context = ctx_factory() @@ -834,18 +912,30 @@ def test_stride_preservation(ctx_factory): print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous) assert np.allclose(AT_GPU.get(), AT) - @pytools.test.mark_test.opencl -def test_vector_fill(ctx_factory): +def test_nan_arithmetic(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - a_gpu = cl_array.Array(queue, 100, dtype=cl_array.vec.float4) - a_gpu.fill(cl_array.vec.make_float4(0.0, 0.0, 1.0, 0.0)) - a = a_gpu.get() - assert a.dtype is cl_array.vec.float4 + def make_nan_contaminated_vector(size): + shape = (size,) + a = np.random.randn(*shape).astype(np.float32) + from random import randrange + for i in range(size // 10): + a[randrange(0, size)] = float('nan') + return a - a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4) + size = 1 << 20 + + a = make_nan_contaminated_vector(size) + a_gpu = cl_array.to_device(queue, a) + b = make_nan_contaminated_vector(size) + b_gpu = cl_array.to_device(queue, b) + + ab = a * b + ab_gpu = (a_gpu * b_gpu).get() + + assert (np.isnan(ab) == np.isnan(ab_gpu)).all() @pytools.test.mark_test.opencl def test_mem_pool_with_arrays(ctx_factory): @@ -881,78 +971,26 @@ def test_view(ctx_factory): view = a_dev.view(np.int16) assert view.shape == (8, 32) and view.dtype == np.int16 -mmc_dtype = np.dtype([ - ("cur_min", np.int32), - ("cur_max", np.int32), - ("pad", np.int32), - ]) - -from pyopencl.tools import register_dtype -register_dtype(mmc_dtype, "minmax_collector", alias_ok=True) -register_dtype(mmc_dtype, "minmax_collector", alias_ok=True) +# }}} @pytools.test.mark_test.opencl -def test_struct_reduce(ctx_factory): - context = ctx_factory() - queue = cl.CommandQueue(context) - - preamble = r"""//CL// - struct minmax_collector - { - int cur_min; - int cur_max; - // Workaround for OS X Lion GPU CL. Mystifying. - int pad; - }; - - typedef struct minmax_collector minmax_collector; - - minmax_collector mmc_neutral() - { - // FIXME: needs infinity literal in real use, ok here - minmax_collector result; - result.cur_min = 1<<30; - result.cur_max = -(1<<30); - return result; - } - - minmax_collector mmc_from_scalar(float x) - { - minmax_collector result; - result.cur_min = x; - result.cur_max = x; - return result; - } - - minmax_collector agg_mmc(minmax_collector a, minmax_collector b) - { - minmax_collector result = a; - if (b.cur_min < result.cur_min) - result.cur_min = b.cur_min; - if (b.cur_max > result.cur_max) - result.cur_max = b.cur_max; - return result; - } - - """ - - +def no_test_slice(ctx_factory): from pyopencl.clrandom import rand as clrand - a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) + + l = 20000 + a_gpu = clrand(queue, (l,)) a = a_gpu.get() - from pyopencl.reduction import ReductionKernel - red = ReductionKernel(context, mmc_dtype, - neutral="mmc_neutral()", - reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", - arguments="__global int *x", preamble=preamble) + from random import randrange + for i in range(200): + start = randrange(l) + end = randrange(start, l) - minmax = red(a_gpu).get() - #print minmax["cur_min"], minmax["cur_max"] - #print np.min(a), np.max(a) + a_gpu_slice = a_gpu[start:end] + a_slice = a[start:end] + + assert la.norm(a_gpu_slice.get() - a_slice) == 0 - assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 - assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 @@ -969,4 +1007,4 @@ if __name__ == "__main__": from py.test.cmdline import main main([__file__]) -# vim: filetype=pyopencl +# vim: filetype=pyopencl:fdm=marker