diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 8e77fb80023bd0c4d913674eae610d29358a6e29..eeb6dde0e8283b0304884ff48d97b78b65f1e6a5 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -28,18 +28,14 @@ OTHER DEALINGS IN THE SOFTWARE. """ - - from pyopencl.tools import context_dependent_memoize import numpy as np import pyopencl as cl from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg - - def get_elwise_program(context, arguments, operation, - name="elwise_kernel", keep=False, options=[], + name="elwise_kernel", options=[], preamble="", loop_prep="", after_loop=""): from pyopencl import Program source = (""" @@ -73,10 +69,8 @@ def get_elwise_program(context, arguments, operation, return Program(context, source).build(options) - - def get_elwise_kernel_and_types(context, arguments, operation, - name="elwise_kernel", keep=False, options=[], preamble="", **kwargs): + name="elwise_kernel", options=[], preamble="", **kwargs): if isinstance(arguments, str): from pyopencl.tools import parse_c_arg parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")] @@ -84,7 +78,7 @@ def get_elwise_kernel_and_types(context, arguments, operation, parsed_args = arguments for arg in parsed_args: - if np.float64 == arg.dtype: + if np.float64 == arg.dtype: preamble = ( "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n\n" + preamble) @@ -92,8 +86,9 @@ def get_elwise_kernel_and_types(context, arguments, operation, parsed_args.append(ScalarArg(np.uintp, "n")) - prg = get_elwise_program(context, parsed_args, operation, name, - keep, options, preamble, **kwargs) + prg = get_elwise_program( + context, parsed_args, operation, + name=name, options=options, preamble=preamble, **kwargs) scalar_arg_dtypes = [] for arg in parsed_args: @@ -108,26 +103,27 @@ def get_elwise_kernel_and_types(context, arguments, operation, return kernel, parsed_args - - def get_elwise_kernel(context, arguments, operation, name="elwise_kernel", options=[], **kwargs): """Return a L{pyopencl.Kernel} that performs the same scalar operation on one or several vectors. """ - func, arguments = get_elwise_kernel_and_types(context, - arguments, operation, name, options, **kwargs) + func, arguments = get_elwise_kernel_and_types( + context, arguments, operation, + name=name, options=options, **kwargs) return func - - class ElementwiseKernel: + def __init__(self, context, arguments, operation, name="elwise_kernel", options=[], **kwargs): - self.kernel, self.arguments = get_elwise_kernel_and_types(context, - arguments, operation, name, options, **kwargs) + + self.kernel, self.arguments = get_elwise_kernel_and_types( + context, arguments, operation, + name=name, options=options, + **kwargs) if not [i for i, arg in enumerate(self.arguments) if isinstance(arg, VectorArg)]: @@ -165,8 +161,6 @@ class ElementwiseKernel: return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls) - - @context_dependent_memoize def get_take_kernel(context, dtype, idx_dtype, vec_count=1): ctx = { @@ -174,8 +168,10 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1): "tp": dtype_to_ctype(dtype), } - args = ([VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] - + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] + args = ([VectorArg(dtype, "dest" + str(i)) + for i in range(vec_count)] + + [VectorArg(dtype, "src" + str(i)) + for i in range(vec_count)] + [VectorArg(idx_dtype, "idx")]) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) @@ -183,9 +179,7 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1): "dest%d[i] = src%d[src_idx];" % (i, i) for i in range(vec_count))) - return get_elwise_kernel(context, args, body, "take") - - + return get_elwise_kernel(context, args, body, name="take") @context_dependent_memoize @@ -223,9 +217,7 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - return get_elwise_kernel(context, args, body, "take_put") - - + return get_elwise_kernel(context, args, body, name="take_put") @context_dependent_memoize @@ -240,7 +232,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): for i in range(vec_count) ] + [ VectorArg(idx_dtype, "gmem_dest_idx"), - ] + [ + ] + [ VectorArg(dtype, "src%d" % i) for i in range(vec_count) ] @@ -250,9 +242,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count))) - return get_elwise_kernel(args, body, "put") - - + return get_elwise_kernel(args, body, name="put") @context_dependent_memoize @@ -263,8 +253,7 @@ def get_copy_kernel(context, dtype_dest, dtype_src): "tp_src": dtype_to_ctype(dtype_src), }, "dest[i] = src[i]", - "copy") - + name="copy") @context_dependent_memoize @@ -273,12 +262,12 @@ def get_linear_combination_kernel(summand_descriptors, # TODO: Port this! raise NotImplementedError - from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import \ + from pyopencl.tools import dtype_to_ctype + from pyopencl.elementwise import \ VectorArg, ScalarArg, get_elwise_module args = [] - preamble = [ "#include \n\n" ] + preamble = [] loop_prep = [] summands = [] tex_names = [] @@ -312,13 +301,11 @@ def get_linear_combination_kernel(summand_descriptors, func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] func.prepare("".join(arg.struct_char for arg in args), - (1,1,1), texrefs=tex_src) + (1, 1, 1), texrefs=tex_src) return func, tex_src - - @context_dependent_memoize def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): return get_elwise_kernel(context, @@ -328,7 +315,8 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = a*x[i] + b*y[i]", - "axpbyz") + name="axpbyz") + @context_dependent_memoize def get_axpbz_kernel(context, dtype): @@ -336,7 +324,8 @@ def get_axpbz_kernel(context, dtype): "%(tp)s *z, %(tp)s a, %(tp)s *x,%(tp)s b" % { "tp": dtype_to_ctype(dtype)}, "z[i] = a * x[i] + b", - "axpb") + name="axpb") + @context_dependent_memoize def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): @@ -347,7 +336,8 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] * y[i]", - "multiply") + name="multiply") + @context_dependent_memoize def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): @@ -358,7 +348,8 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] / y[i]", - "divide") + name="divide") + @context_dependent_memoize def get_rdivide_elwise_kernel(context, dtype): @@ -367,7 +358,8 @@ def get_rdivide_elwise_kernel(context, dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = y / x[i]", - "divide_r") + name="divide_r") + @context_dependent_memoize def get_fill_kernel(context, dtype): @@ -376,7 +368,8 @@ def get_fill_kernel(context, dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = a", - "fill") + name="fill") + @context_dependent_memoize def get_reverse_kernel(context, dtype): @@ -385,7 +378,8 @@ def get_reverse_kernel(context, dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = y[n-1-i]", - "reverse") + name="reverse") + @context_dependent_memoize def get_arange_kernel(context, dtype): @@ -394,7 +388,7 @@ def get_arange_kernel(context, dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = start + i*step", - "arange") + name="arange") @context_dependent_memoize @@ -404,7 +398,8 @@ def get_pow_kernel(context, dtype): "tp": dtype_to_ctype(dtype), }, "z[i] = pow(y[i], value)", - "pow_method") + name="pow_method") + @context_dependent_memoize def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z): @@ -415,21 +410,24 @@ def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = pow(x[i], y[i])", - "pow_method") + name="pow_method") + @context_dependent_memoize def get_fmod_kernel(context): return get_elwise_kernel(context, "float *z, float *arg, float *mod", "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + name="fmod_kernel") + @context_dependent_memoize def get_modf_kernel(context): return get_elwise_kernel(context, "float *intpart ,float *fracpart, float *x", "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + name="modf_kernel") + @context_dependent_memoize def get_frexp_kernel(context): @@ -440,14 +438,16 @@ def get_frexp_kernel(context): significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + name="frexp_kernel") + @context_dependent_memoize def get_ldexp_kernel(context): return get_elwise_kernel(context, "float *z, float *sig, float *expt", "z[i] = ldexp(sig[i], (int) expt[i])", - "ldexp_kernel") + name="ldexp_kernel") + @context_dependent_memoize def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): @@ -460,9 +460,7 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) - - + name="%s_kernel" % func_name) @context_dependent_memoize @@ -474,4 +472,4 @@ def get_if_positive_kernel(context, crit_dtype, dtype): VectorArg(dtype, "else_"), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", - "if_positive") + name="if_positive") diff --git a/test/test_array.py b/test/test_array.py index 4269eb855e38d92ca86fc6c37d8ab11be904e591..591b7e7ea7c0be785d3a631fc337716235c74950 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -5,8 +5,6 @@ import sys import pytools.test - - def have_cl(): try: import pyopencl @@ -22,36 +20,31 @@ if have_cl(): from pyopencl.characterize import has_double_support - - @pytools.test.mark_test.opencl def test_pow_array(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) - result = pow(a_gpu,a_gpu).get() - assert (np.abs(a**a - result) < 1e-3).all() + result = pow(a_gpu, a_gpu).get() + assert (np.abs(a ** a - result) < 1e-3).all() - result = (a_gpu**a_gpu).get() + result = (a_gpu ** a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() - - @pytools.test.mark_test.opencl def test_pow_number(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) result = pow(a_gpu, 2).get() - assert (np.abs(a**2 - result) < 1e-3).all() - + assert (np.abs(a ** 2 - result) < 1e-3).all() @pytools.test.mark_test.opencl @@ -69,7 +62,7 @@ def test_abs(ctx_getter): res = a.get() - for i in range (111): + for i in range(111): assert abs(res[i]) >= 0 assert res[i] == i @@ -79,13 +72,11 @@ def test_len(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_cpu = cl_array.to_device(queue, a) assert len(a_cpu) == 10 - - @pytools.test.mark_test.opencl def test_multiply(ctx_getter): """Test the muliplication of an array with a scalar. """ @@ -93,11 +84,9 @@ def test_multiply(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - for sz in [10, 50000]: for dtype, scalars in [ (np.float32, [2]), - #(np.complex64, [2, 2j]) ]: for scalar in scalars: a = np.arange(sz).astype(dtype) @@ -106,6 +95,7 @@ def test_multiply(ctx_getter): assert (a * scalar == a_doubled).all() + @pytools.test.mark_test.opencl def test_multiply_array(ctx_getter): """Test the multiplication of two arrays.""" @@ -113,16 +103,14 @@ def test_multiply_array(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, a) - a_squared = (b_gpu*a_gpu).get() - - assert (a*a == a_squared).all() - + a_squared = (b_gpu * a_gpu).get() + assert (a * a == a_squared).all() @pytools.test.mark_test.opencl @@ -132,13 +120,11 @@ def test_addition_array(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) - a_added = (a_gpu+a_gpu).get() - - assert (a+a == a_added).all() - + a_added = (a_gpu + a_gpu).get() + assert (a + a == a_added).all() @pytools.test.mark_test.opencl @@ -148,21 +134,20 @@ def test_addition_scalar(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) - a_added = (7+a_gpu).get() - - assert (7+a == a_added).all() - + a_added = (7 + a_gpu).get() + assert (7 + a == a_added).all() @pytools.test.mark_test.opencl def test_substract_array(ctx_getter): """Test the substraction of two arrays.""" #test data - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) - b = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + b = np.array([10, 20, 30, 40, 50, + 60, 70, 80, 90, 100]).astype(np.float32) context = ctx_getter() queue = cl.CommandQueue(context) @@ -170,13 +155,11 @@ def test_substract_array(ctx_getter): a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, b) - result = (a_gpu-b_gpu).get() - assert (a-b == result).all() - - result = (b_gpu-a_gpu).get() - assert (b-a == result).all() - + result = (a_gpu - b_gpu).get() + assert (a - b == result).all() + result = (b_gpu - a_gpu).get() + assert (b - a == result).all() @pytools.test.mark_test.opencl @@ -187,18 +170,16 @@ def test_substract_scalar(ctx_getter): queue = cl.CommandQueue(context) #test data - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) #convert a to a gpu object a_gpu = cl_array.to_device(queue, a) - result = (a_gpu-7).get() - assert (a-7 == result).all() - - result = (7-a_gpu).get() - assert (7-a == result).all() - + result = (a_gpu - 7).get() + assert (a - 7 == result).all() + result = (7 - a_gpu).get() + assert (7 - a == result).all() @pytools.test.mark_test.opencl @@ -208,16 +189,14 @@ def test_divide_scalar(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32) + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) - result = (a_gpu/2).get() - assert (a/2 == result).all() - - result = (2/a_gpu).get() - assert (np.abs(2/a - result) < 1e-5).all() - + result = (a_gpu / 2).get() + assert (a / 2 == result).all() + result = (2 / a_gpu).get() + assert (np.abs(2 / a - result) < 1e-5).all() @pytools.test.mark_test.opencl @@ -228,19 +207,17 @@ def test_divide_array(ctx_getter): queue = cl.CommandQueue(context) #test data - a = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32) - b = np.array([10,10,10,10,10,10,10,10,10,10]).astype(np.float32) + a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) + b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) a_gpu = cl_array.to_device(queue, a) b_gpu = cl_array.to_device(queue, b) - a_divide = (a_gpu/b_gpu).get() - assert (np.abs(a/b - a_divide) < 1e-3).all() - - a_divide = (b_gpu/a_gpu).get() - assert (np.abs(b/a - a_divide) < 1e-3).all() - + a_divide = (a_gpu / b_gpu).get() + assert (np.abs(a / b - a_divide) < 1e-3).all() + a_divide = (b_gpu / a_gpu).get() + assert (np.abs(b / a - a_divide) < 1e-3).all() @pytools.test.mark_test.opencl @@ -262,8 +239,6 @@ def test_random(ctx_getter): assert (a < 1).all() - - @pytools.test.mark_test.opencl def test_nan_arithmetic(ctx_getter): context = ctx_getter() @@ -272,10 +247,8 @@ def test_nan_arithmetic(ctx_getter): def make_nan_contaminated_vector(size): shape = (size,) a = np.random.randn(*shape).astype(np.float32) - #for i in range(0, shape[0], 3): - #a[i] = float('nan') from random import randrange - for i in range(size//10): + for i in range(size // 10): a[randrange(0, size)] = float('nan') return a @@ -286,14 +259,12 @@ def test_nan_arithmetic(ctx_getter): b = make_nan_contaminated_vector(size) b_gpu = cl_array.to_device(queue, b) - ab = a*b - ab_gpu = (a_gpu*b_gpu).get() + ab = a * b + ab_gpu = (a_gpu * b_gpu).get() assert (np.isnan(ab) == np.isnan(ab_gpu)).all() - - @pytools.test.mark_test.opencl def test_elwise_kernel(ctx_getter): context = ctx_getter() @@ -313,9 +284,38 @@ def test_elwise_kernel(ctx_getter): c_gpu = cl_array.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) - assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 + assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5 +@pytools.test.mark_test.opencl +def test_elwise_kernel_with_options(ctx_getter): + from pyopencl.clrandom import rand as clrand + from pyopencl.elementwise import ElementwiseKernel + + context = ctx_getter() + queue = cl.CommandQueue(context) + + in_gpu = clrand(context, queue, (50,), np.float32) + + options = ['-DADD_ONE'] + add_one = ElementwiseKernel( + context, + "float* out, const float *in", + """ + out[i] = in[i]; + #ifdef ADD_ONE + out[i]++; + #endif + """, + options=options, + ) + + out_gpu = cl_array.empty_like(in_gpu) + add_one(out_gpu, in_gpu) + + gt = in_gpu.get() + 1 + gv = out_gpu.get() + assert la.norm(gv - gt) < 1e-5 @pytools.test.mark_test.opencl @@ -326,9 +326,7 @@ def test_take(ctx_getter): idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32) a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32) result = cl_array.take(a, idx) - assert ((3*idx).get() == result.get()).all() - - + assert ((3 * idx).get() == result.get()).all() @pytools.test.mark_test.opencl @@ -341,8 +339,6 @@ def test_arange(ctx_getter): assert (np.arange(n, dtype=np.float32) == a.get()).all() - - @pytools.test.mark_test.opencl def test_reverse(ctx_getter): context = ctx_getter() @@ -357,8 +353,6 @@ def test_reverse(ctx_getter): assert (a[::-1] == a_gpu.get()).all() - - @pytools.test.mark_test.opencl def test_sum(ctx_getter): context = ctx_getter() @@ -372,9 +366,7 @@ def test_sum(ctx_getter): sum_a = np.sum(a) sum_a_gpu = cl_array.sum(a_gpu).get() - assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4 - - + assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 @pytools.test.mark_test.opencl @@ -400,8 +392,6 @@ def test_minmax(ctx_getter): assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what) - - @pytools.test.mark_test.opencl def test_subset_minmax(ctx_getter): context = ctx_getter() @@ -442,8 +432,6 @@ def test_subset_minmax(ctx_getter): assert min_a_gpu == min_a - - @pytools.test.mark_test.opencl def test_dot(ctx_getter): context = ctx_getter() @@ -459,9 +447,7 @@ def test_dot(ctx_getter): dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get() - assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4 - - + assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 if False: @@ -481,7 +467,8 @@ if False: a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 + @pytools.test.mark_test.opencl def test_if_positive(ctx_getter): @@ -502,8 +489,9 @@ def test_if_positive(ctx_getter): print(max_a_b_gpu) print(np.maximum(a, b)) - assert la.norm(max_a_b_gpu.get()- np.maximum(a, b)) == 0 - assert la.norm(min_a_b_gpu.get()- np.minimum(a, b)) == 0 + assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 + assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 + @pytools.test.mark_test.opencl def test_take_put(ctx_getter): @@ -513,9 +501,9 @@ def test_take_put(ctx_getter): for n in [5, 17, 333]: one_field_size = 8 buf_gpu = cl_array.zeros(queue, - n*one_field_size, dtype=np.float32) + n * one_field_size, dtype=np.float32) dest_indices = cl_array.to_device(queue, - np.array([ 0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) + np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) read_map = cl_array.to_device(queue, np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) @@ -523,9 +511,10 @@ def test_take_put(ctx_getter): arrays=[buf_gpu for i in range(n)], dest_indices=dest_indices, src_indices=read_map, - src_offsets=[i*one_field_size for i in range(n)], + src_offsets=[i * one_field_size for i in range(n)], dest_shape=(96,)) + @pytools.test.mark_test.opencl def test_astype(ctx_getter): context = ctx_getter() @@ -550,9 +539,7 @@ def test_astype(ctx_getter): a2 = a_gpu.astype(np.float32).get() assert a2.dtype == np.float32 - assert la.norm(a - a2)/la.norm(a) < 1e-7 - - + assert la.norm(a - a2) / la.norm(a) < 1e-7 @pytools.test.mark_test.opencl @@ -567,13 +554,12 @@ def test_scan(ctx_getter): knl = cls(context, dtype, "a+b", "0") for n in [ - 10, 2**10-5, 2**10, - 2**20-2**18, - 2**20-2**18+5, - 2**10+5, - 2**20+5, - 2**20, 2**24 - ]: + 10, 2 ** 10 - 5, 2 ** 10, + 2 ** 20 - 2 ** 18, + 2 ** 20 - 2 ** 18 + 5, + 2 ** 10 + 5, + 2 ** 20 + 5, + 2 ** 20, 2 ** 24]: host_data = np.random.randint(0, 10, n).astype(dtype) dev_data = cl_array.to_device(queue, host_data) @@ -588,21 +574,17 @@ def test_scan(ctx_getter): collect() - - @pytools.test.mark_test.opencl def test_stride_preservation(ctx_getter): context = ctx_getter() queue = cl.CommandQueue(context) - A = np.random.rand(3,3) + A = np.random.rand(3, 3) AT = A.T print(AT.flags.f_contiguous, AT.flags.c_contiguous) AT_GPU = cl_array.to_device(queue, AT) print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous) - assert np.allclose(AT_GPU.get(),AT) - - + assert np.allclose(AT_GPU.get(), AT) @pytools.test.mark_test.opencl @@ -618,10 +600,9 @@ def test_vector_fill(ctx_getter): a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4) - - if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the tests. + # make sure that import failures get reported, instead of skipping the + # tests. import pyopencl as cl import sys