Newer
Older
import sys

import numpy as np
import numpy.linalg as la

import pytools.test
from pytools import memoize
def have_cl():
    """Return True if pyopencl is importable, False otherwise.

    Used at module scope to decide whether the pyopencl-dependent names
    should be imported at all.
    """
    try:
        import pyopencl  # noqa: F401
        return True
    except ImportError:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and genuine errors inside pyopencl.
        return False
if have_cl():
import pyopencl as cl
import pyopencl.array as cl_array
import pyopencl.tools as cl_tools
from pyopencl.tools import pytest_generate_tests_for_pyopencl \
as pytest_generate_tests
from pyopencl.characterize import has_double_support
# Map each complex dtype to the real dtype of its components; used when
# synthesizing random complex test data from two real random arrays.
TO_REAL = {
        np.dtype(np.complex64): np.float32,
        np.dtype(np.complex128): np.float64
        }
def general_clrand(queue, shape, dtype):
    """Return a random device array of the given *shape* and *dtype*.

    Complex dtypes are produced by drawing independent random real and
    imaginary parts of the matching real dtype.
    """
    from pyopencl.clrandom import rand as clrand

    dtype = np.dtype(dtype)
    if dtype.kind != "c":
        return clrand(queue, shape, dtype)

    real_dtype = dtype.type(0).real.dtype
    return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype)
def make_random_array(queue, dtype, size):
    """Return a 1-D random device array of length *size* and type *dtype*.

    For complex dtypes the real and imaginary parts are drawn separately
    (via the TO_REAL mapping) and combined on the device.
    """
    from pyopencl.clrandom import rand

    dtype = np.dtype(dtype)
    if dtype.kind != "c":
        return rand(queue, shape=(size,), dtype=dtype)

    real_dtype = TO_REAL[dtype]
    real_part = rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
    imag_part = rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
    return real_part + dtype.type(1j) * imag_part
@pytools.test.mark_test.opencl
def test_basic_complex(ctx_factory):
    """Scalar-times-complex-array on the device must match numpy on the host."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand

    n = 500
    real_part = rand(queue, shape=(n,), dtype=np.float32).astype(np.complex64)
    imag_part = rand(queue, shape=(n,), dtype=np.float32).astype(np.complex64)
    dev_ary = real_part + 1j*imag_part

    scalar = np.complex64(5+7j)
    host_ary = dev_ary.get()
    tolerance = 1e-5 * la.norm(host_ary)
    assert la.norm((scalar*dev_ary).get() - scalar*host_ary) < tolerance
@pytools.test.mark_test.opencl
def test_mix_complex(ctx_factory):
    """Mixed real/complex, scalar/array binary operations vs. numpy.

    BUG FIX: the original computed the relative error ``err`` and printed
    diagnostics, but never asserted anything -- the test could not fail.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    dtypes = [
            (np.float32, np.complex64),
            #(np.int32, np.complex64),
            ]
    if has_double_support(context.devices[0]):
        dtypes.extend([
                (np.float32, np.float64),
                (np.float32, np.complex128),
                (np.float64, np.complex64),
                (np.float64, np.complex128),
                ])

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for dtype_a0, dtype_b0 in dtypes:
            # Try each pair in both operand orders.
            for dtype_a, dtype_b in [
                    (dtype_a0, dtype_b0),
                    (dtype_b0, dtype_a0),
                    ]:
                for is_scalar_a, is_scalar_b in [
                        (False, False),
                        (False, True),
                        (True, False),
                        ]:
                    if is_scalar_a:
                        ary_a = make_random_array(queue, dtype_a, 1).get()[0]
                        host_ary_a = ary_a
                    else:
                        ary_a = make_random_array(queue, dtype_a, size)
                        host_ary_a = ary_a.get()

                    if is_scalar_b:
                        ary_b = make_random_array(queue, dtype_b, 1).get()[0]
                        host_ary_b = ary_b
                    else:
                        ary_b = make_random_array(queue, dtype_b, size)
                        host_ary_b = ary_b.get()

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)

                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.
                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
                                host_result.dtype, dev_result.dtype))
                        dev_result = dev_result.astype(host_result.dtype)

                    err = la.norm(host_result-dev_result)/la.norm(host_result)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)
                    assert correct, (err, op, dtype_a, dtype_b,
                            is_scalar_a, is_scalar_b)
@pytools.test.mark_test.opencl
def test_pow_neg1_vs_inv(ctx_factory):
    """``a ** -1`` and ``1/a`` must both match the host-side reciprocal."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    if not has_double_support(device):
        from py.test import skip
        skip("double precision not supported on %s" % device)

    a_dev = make_random_array(queue, np.complex128, 20000)

    via_pow = (a_dev ** (-1)).get()
    via_div = (1/a_dev).get()
    ref = 1/a_dev.get()

    for candidate in (via_pow, via_div):
        assert la.norm(candidate-ref, np.inf) / la.norm(ref) < 1e-13
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
@pytools.test.mark_test.opencl
def test_vector_fill(ctx_factory):
    """fill() and zeros() must accept vector types (float4).

    CONSISTENCY FIX: added the ``opencl`` test marker that every sibling
    test in this file carries; this one was missing it.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a_gpu = cl_array.Array(queue, 100, dtype=cl_array.vec.float4)
    a_gpu.fill(cl_array.vec.make_float4(0.0, 0.0, 1.0, 0.0))
    a = a_gpu.get()
    assert a.dtype is cl_array.vec.float4

    # zeros() must also accept the vector dtype without raising.
    a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4)
@pytools.test.mark_test.opencl
def test_absrealimag(ctx_factory):
    """abs/real/imag/conj on device arrays must match numpy on the host."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # Accessors that work identically on numpy and pyopencl arrays.
    def real(x):
        return x.real

    def imag(x):
        return x.imag

    def conj(x):
        return x.conj()

    n = 111

    for func in [abs, real, imag, conj]:
        for dtype in [np.int32, np.float32, np.complex64]:
            print(func, dtype)
            a = -make_random_array(queue, dtype, n)

            host_res = func(a.get())
            dev_res = func(a).get()

            correct = np.allclose(dev_res, host_res)
            if not correct:
                print(dev_res)
                print(host_res)
                print(dev_res-host_res)
            assert correct
# }}}
# {{{ operands
@pytools.test.mark_test.opencl
def test_pow_array(ctx_factory):
    """Elementwise array**array via pow() and via ** must match numpy."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    host_ary = np.array([1, 2, 3, 4, 5]).astype(np.float32)
    dev_ary = cl_array.to_device(queue, host_ary)

    via_pow = pow(dev_ary, dev_ary).get()
    assert (np.abs(host_ary ** host_ary - via_pow) < 1e-3).all()

    via_op = (dev_ary ** dev_ary).get()
    assert (np.abs(pow(host_ary, host_ary) - via_op) < 1e-3).all()
@pytools.test.mark_test.opencl
def test_pow_number(ctx_factory):
    """Raising a device array to a scalar power must match numpy."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    host_ary = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    dev_ary = cl_array.to_device(queue, host_ary)

    squared = pow(dev_ary, 2).get()
    assert (np.abs(host_ary ** 2 - squared) < 1e-3).all()
@pytools.test.mark_test.opencl
def test_multiply(ctx_factory):
    """Test the muliplication of an array with a scalar. """
    # NOTE(review): this block is corrupted -- the literal list of
    # (dtype, scalars) pairs is opened but never populated or closed, and
    # the inner `for scalar in scalars:` loop the body relies on is missing,
    # leaving `scalar` undefined. Restore from upstream before running.
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    for sz in [10, 50000]:
        for dtype, scalars in [
            a_gpu = make_random_array(queue, dtype, sz)
            a = a_gpu.get()
            a_mult = (scalar * a_gpu).get()
            assert (a * scalar == a_mult).all()
# NOTE(review): truncated -- the elementwise product and its assertion are
# missing, and the @pytools.test.mark_test.opencl decorator used by every
# sibling test is absent. Restore from upstream before running.
def test_multiply_array(ctx_factory):
    """Test the multiplication of two arrays."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)
    b_gpu = cl_array.to_device(queue, a)
@pytools.test.mark_test.opencl
def test_addition_array(ctx_factory):
    """Test the addition of two arrays."""
    # NOTE(review): truncated -- the (a_gpu + a_gpu) computation and its
    # assertion are missing. Restore from upstream before running.
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)
@pytools.test.mark_test.opencl
def test_addition_scalar(ctx_factory):
    """Test the addition of an array and a scalar."""
    # NOTE(review): truncated -- the (scalar + a_gpu) computation and its
    # assertion are missing. Restore from upstream before running.
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)
@pytools.test.mark_test.opencl
def test_substract_array(ctx_factory):
    """Elementwise subtraction of two device arrays, in both operand orders."""
    #test data
    host_a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    host_b = np.array([10, 20, 30, 40, 50,
            60, 70, 80, 90, 100]).astype(np.float32)

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev_a = cl_array.to_device(queue, host_a)
    dev_b = cl_array.to_device(queue, host_b)

    assert (host_a - host_b == (dev_a - dev_b).get()).all()
    assert (host_b - host_a == (dev_b - dev_a).get()).all()
@pytools.test.mark_test.opencl
def test_substract_scalar(ctx_factory):
    """Subtraction between a device array and a scalar, in both orders."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    #test data
    host_ary = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    dev_ary = cl_array.to_device(queue, host_ary)

    assert (host_ary - 7 == (dev_ary - 7).get()).all()
    assert (7 - host_ary == (7 - dev_ary).get()).all()
@pytools.test.mark_test.opencl
def test_divide_scalar(ctx_factory):
    """Division between a device array and a scalar, in both orders."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    host_ary = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
    dev_ary = cl_array.to_device(queue, host_ary)

    # array / scalar is exact; scalar / array only up to rounding.
    assert (host_ary / 2 == (dev_ary / 2).get()).all()
    assert (np.abs(2 / host_ary - (2 / dev_ary).get()) < 1e-5).all()
@pytools.test.mark_test.opencl
def test_divide_array(ctx_factory):
    """Elementwise division of two device arrays, in both operand orders."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    #test data
    host_a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
    host_b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)

    dev_a = cl_array.to_device(queue, host_a)
    dev_b = cl_array.to_device(queue, host_b)

    assert (np.abs(host_a / host_b - (dev_a / dev_b).get()) < 1e-3).all()
    assert (np.abs(host_b / host_a - (dev_b / dev_a).get()) < 1e-3).all()
@pytools.test.mark_test.opencl
def test_random(ctx_factory):
    # NOTE(review): corrupted -- RanluxGenerator is imported but `gen` is
    # never constructed, and the `dtypes` list the loop iterates over is
    # missing (presumably [np.float32] plus np.float64 under double support
    # -- confirm against upstream before running).
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    if has_double_support(context.devices[0]):
        for ary_size in [300, 301, 302, 303, 10007]:
            for dtype in dtypes:
                ran = cl_array.zeros(queue, ary_size, dtype)
                gen.fill_uniform(ran)
                assert (0 < ran.get()).all()
                assert (ran.get() < 1).all()

                ran = cl_array.zeros(queue, ary_size, dtype)
                gen.fill_uniform(ran, a=4, b=7)
                assert (4 < ran.get()).all()
                assert (ran.get() < 7).all()

                ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
    assert (200 <= ran.get()).all()
    assert (ran.get() < 300).all()

    #from matplotlib import pyplot as pt
    #pt.hist(ran.get())
    #pt.show()
@pytools.test.mark_test.opencl
def test_elwise_kernel(ctx_factory):
    """A custom ElementwiseKernel linear combination matches array ops."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    x_gpu = clrand(queue, (50,), np.float32)
    y_gpu = clrand(queue, (50,), np.float32)

    lin_comb = ElementwiseKernel(context,
            "float a, float *x, float b, float *y, float *z",
            "z[i] = a*x[i] + b*y[i]",
            "linear_combination")

    z_gpu = cl_array.empty_like(x_gpu)
    lin_comb(5, x_gpu, 6, y_gpu, z_gpu)

    # The kernel result must agree with the same expression built from
    # pyopencl array arithmetic.
    assert la.norm((z_gpu - (5 * x_gpu + 6 * y_gpu)).get()) < 1e-5
@pytools.test.mark_test.opencl
def test_elwise_kernel_with_options(ctx_factory):
    """Build options (a -D define) must reach the elementwise kernel source.

    RECONSTRUCTED: the preprocessor lines of the kernel body and the
    definition of ``options`` were lost to commit-log artifacts; restored
    from upstream. The kernel only adds 1 when ADD_ONE is defined, which
    the ``options`` list does.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

    options = ['-D', 'ADD_ONE']
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
        )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5
@pytools.test.mark_test.opencl
def test_ranged_elwise_kernel(ctx_factory):
    """The slice= argument of an elementwise kernel touches exactly the slice."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.elementwise import ElementwiseKernel
    set_to_seven = ElementwiseKernel(context,
            "float *z", "z[i] = 7", "set_to_seven")

    tested_slices = [
            slice(5, 20000),
            slice(5, 20000, 17),
            slice(3000, 5, -1),
            slice(1000, -1),
            ]

    for i, slc in enumerate(tested_slices):
        a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32)

        # Mirror the operation on the host with numpy slicing.
        a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)
        a_cpu[slc] = 7
        set_to_seven(a_gpu, slice=slc)

        assert (a_cpu == a_gpu.get()).all()
@pytools.test.mark_test.opencl
def test_take(ctx_factory):
    """take() with a stride-2 index set into a stride-3 arange.

    RECONSTRUCTED: the block was truncated after the two arange calls; the
    take() call and the assertion (element k of the result is 3*idx[k])
    are restored from upstream, along with the missing test marker.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
    result = cl_array.take(a, idx)
    assert ((3 * idx).get() == result.get()).all()
@pytools.test.mark_test.opencl
def test_arange(ctx_factory):
    """cl_array.arange must agree elementwise with np.arange."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    count = 5000
    dev_seq = cl_array.arange(queue, count, dtype=np.float32)
    host_seq = np.arange(count, dtype=np.float32)
    assert (host_seq == dev_seq.get()).all()
@pytools.test.mark_test.opencl
def test_reverse(ctx_factory):
    """Array.reverse() must equal numpy's ``[::-1]``.

    BUG FIX: the host array ``a`` was used but never defined; restored the
    random-input setup from upstream.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 5000
    a = np.random.rand(n).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)

    a_gpu = a_gpu.reverse()

    assert (a[::-1] == a_gpu.get()).all()
@pytools.test.mark_test.opencl
def test_if_positive(ctx_factory):
    """Device maximum/minimum must match numpy exactly."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l = 20000
    a_gpu = clrand(queue, (l,), np.float32)
    b_gpu = clrand(queue, (l,), np.float32)
    a = a_gpu.get()
    b = b_gpu.get()

    for dev_result, host_result in [
            (cl_array.maximum(a_gpu, b_gpu), np.maximum(a, b)),
            (cl_array.minimum(a_gpu, b_gpu), np.minimum(a, b)),
            ]:
        assert la.norm(dev_result.get() - host_result) == 0
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
@pytools.test.mark_test.opencl
def test_take_put(ctx_factory):
    """Exercise multi_take_put across several array counts."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for n in [5, 17, 333]:
        one_field_size = 8
        buf_gpu = cl_array.zeros(queue,
                n * one_field_size, dtype=np.float32)

        dest_indices = cl_array.to_device(queue,
                np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32))
        read_map = cl_array.to_device(queue,
                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))

        # Gather from n offset views of the same buffer and scatter into a
        # 96-entry destination; success is the call completing cleanly.
        cl_array.multi_take_put(
                arrays=[buf_gpu] * n,
                dest_indices=dest_indices,
                src_indices=read_map,
                src_offsets=[field * one_field_size for field in range(n)],
                dest_shape=(96,))
@pytools.test.mark_test.opencl
def test_astype(ctx_factory):
    """Round-trip dtype conversion float32 <-> float64 on the device."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        # BUG FIX: `skip` was called without being imported (NameError on
        # devices without double support); import it like the sibling
        # test_pow_neg1_vs_inv does.
        from py.test import skip
        skip("double precision not supported on %s" % context.devices[0])

    # Upcast: must be lossless.
    a_gpu = clrand(queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    assert la.norm(a - a2) == 0, (a, a2)

    # Downcast: only accurate to float32 precision.
    a_gpu = clrand(queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2) / la.norm(a) < 1e-7
@pytools.test.mark_test.opencl
def test_sum(ctx_factory):
    """cl_array.sum must agree with np.sum to a relative 1e-4.

    BUG FIX: ``queue`` and the host copy ``a`` were used but never created;
    restored from upstream, and added the missing opencl test marker for
    consistency with the sibling tests.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)
        a = a_gpu.get()

        sum_a = np.sum(a)
        sum_a_gpu = cl_array.sum(a_gpu).get()

        assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
# NOTE(review): corrupted -- missing the opencl test marker, the queue
# creation, the clrand import, the `dtypes` list, and the host-side
# computation of `op_a` that the assertion compares against. Restore from
# upstream before running.
def test_minmax(ctx_factory):
    context = ctx_factory()
    for what in ["min", "max"]:
        for dtype in dtypes:
            a_gpu = clrand(queue, (200000,), dtype)
            op_a_gpu = getattr(cl_array, what)(a_gpu).get()
            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
# NOTE(review): corrupted -- missing queue creation, the clrand import and
# dtype selection, `min_a` computation, and the cl_array.to_device(...) call
# is cut off mid-expression; `meaningful_indices_gpu` is also read before it
# is created. Restore from upstream before running.
def test_subset_minmax(ctx_factory):
    context = ctx_factory()
    l_a = 200000
    gran = 5
    # l_m: number of "meaningful" (kept) indices after dropping every
    # gran-th position.
    l_m = l_a - l_a // gran + 1
    if has_double_support(context.devices[0]):
        a_gpu = clrand(queue, (l_a,), dtype)
        meaningful_indices = meaningful_indices_gpu.get()
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1
        meaningful_indices_gpu = cl_array.to_device(
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
        assert min_a_gpu == min_a
@pytools.test.mark_test.opencl
def test_dot(ctx_factory):
    """cl_array.dot vs. np.dot for every real/complex dtype pairing.

    BUG FIX: ``queue`` was used but never created; also added the missing
    opencl test marker for consistency with the sibling tests.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
@memoize
def make_mmc_dtype(device):
    """Build and register the ``minmax_collector`` struct dtype for *device*.

    Returns a ``(numpy dtype, C declaration string)`` pair; memoized so the
    struct is only matched/registered once per device.
    """
    fields = [
            ("cur_min", np.int32),
            ("cur_max", np.int32),
            ("pad", np.int32),
            ]
    dtype = np.dtype(fields)

    name = "minmax_collector"
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
    dtype = get_or_register_dtype(name, dtype)

    return dtype, c_decl
@pytools.test.mark_test.opencl
def test_struct_reduce(ctx_factory):
    """Reduce an int array into a struct collecting both min and max.

    RECONSTRUCTED: the ``def`` line and the closing quotes of the preamble
    string were missing from this block; restored from upstream.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    red = ReductionKernel(context, mmc_dtype,
            neutral="mmc_neutral()",
            reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
            arguments="__global int *x", preamble=preamble)

    minmax = red(a_gpu).get()
    #print minmax["cur_min"], minmax["cur_max"]
    #print np.min(a), np.max(a)

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
def summarize_error(obtained, desired, orig, thresh=1e-5):
    """Return a compact textual diff between *obtained* and *desired*.

    Elements whose absolute error exceeds *thresh* are listed individually
    (up to ``bad_limit`` entries); runs of acceptable values are collapsed
    into ``<N ok>`` markers. *orig* supplies the pre-scan input for context.

    BUG FIX: the original never flushed the pending counters at the end of
    the loop (and lacked the good-value branch), so trailing runs were
    silently dropped from the summary.
    """
    err = obtained - desired
    ok_count = 0
    bad_count = 0
    bad_limit = 200

    entries = []

    def summarize_counts():
        # Flush the pending run counters into the entry list.
        if ok_count:
            entries.append("<%d ok>" % ok_count)
        if bad_count >= bad_limit:
            entries.append("<%d more bad>" % (bad_count-bad_limit))

    for i, val in enumerate(err):
        if abs(val) > thresh:
            # Close out a preceding run of good values.
            if ok_count:
                summarize_counts()
                ok_count = 0

            bad_count += 1

            if bad_count < bad_limit:
                entries.append("%r (want: %r, got: %r, orig: %r)" % (obtained[i], desired[i],
                    obtained[i], orig[i]))
        else:
            # Close out a preceding run of bad values.
            if bad_count:
                summarize_counts()
                bad_count = 0
            ok_count += 1

    summarize_counts()

    return " ".join(entries)
# Sizes exercised by the scan/copy_if/partition/unique tests below: values
# straddling power-of-two boundaries, where work-group partitioning edge
# cases live.
# BUG FIX: the `scan_test_counts = [` assignment line was missing, leaving
# the name (used by several tests below) undefined; the leading small size
# (10) is restored per upstream -- TODO confirm against upstream history.
scan_test_counts = [
    10,
    2 ** 8 - 1,
    2 ** 8,
    2 ** 8 + 1,
    2 ** 10 - 5,
    2 ** 10,
    2 ** 10 + 5,
    2 ** 12 - 5,
    2 ** 12,
    2 ** 12 + 5,
    2 ** 20 - 2 ** 18,
    2 ** 20 - 2 ** 18 + 5,
    2 ** 20 + 1,
    ]
@pytools.test.mark_test.opencl
def test_scan(ctx_factory):
    """Inclusive and exclusive prefix sums vs. np.cumsum.

    RECONSTRUCTED: the kernel instantiation, the ``for n`` loop header and
    the final assertion were missing from this block; restored from
    upstream.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel

    dtype = np.int32
    for cls in [
            InclusiveScanKernel,
            ExclusiveScanKernel
            ]:
        knl = cls(context, dtype, "a+b", "0")

        for n in scan_test_counts:
            host_data = np.random.randint(0, 10, n).astype(dtype)
            dev_data = cl_array.to_device(queue, host_data)

            assert (host_data == dev_data.get()).all()  # /!\ fails on Nv GT2?? for some drivers

            knl(dev_data)

            desired_result = np.cumsum(host_data, axis=0)
            if cls is ExclusiveScanKernel:
                # Exclusive scan: shift by excluding each element itself.
                desired_result -= host_data

            is_ok = (dev_data.get() == desired_result).all()
            if 1 and not is_ok:
                print("something went wrong, summarizing error...")
                print(summarize_error(dev_data.get(), desired_result, host_data))

            print("n:%d %s worked:%s" % (n, cls, is_ok))
            assert is_ok

            from gc import collect
            collect()
@pytools.test.mark_test.opencl
def test_copy_if(ctx_factory):
    """copy_if must extract exactly the elements matching the predicate.

    CLEANUP: removed commit-log artifact lines that had been spliced into
    the body of this function.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        from pyopencl.algorithm import copy_if

        crit = a_dev.dtype.type(300)
        selected = a[a>crit]
        selected_dev, count_dev = copy_if(a_dev, "ary[i] > myval", [("myval", crit)])

        assert (selected_dev.get()[:count_dev.get()] == selected).all()
        from gc import collect
        collect()
Andreas Klöckner
committed
@pytools.test.mark_test.opencl
def test_partition(ctx_factory):
    """partition must split elements into true/false lists by the predicate.

    CLEANUP: removed commit-log artifact lines that had been spliced into
    the body of this function.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        true_host = a[a>crit]
        false_host = a[a<=crit]

        from pyopencl.algorithm import partition
        true_dev, false_dev, count_true_dev = partition(a_dev, "ary[i] > myval", [("myval", crit)])

        count_true_dev = count_true_dev.get()

        assert (true_dev.get()[:count_true_dev] == true_host).all()
        assert (false_dev.get()[:n-count_true_dev] == false_host).all()
Andreas Klöckner
committed
@pytools.test.mark_test.opencl
def test_unique(ctx_factory):
    """unique() on a sorted array must match np.unique.

    CLEANUP: removed commit-log artifact lines that had been spliced into
    the body of this function.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()
        # unique() requires sorted input; sort on the host and re-upload.
        a = np.sort(a)
        a_dev = cl_array.to_device(queue, a)

        a_unique_host = np.unique(a)

        from pyopencl.algorithm import unique
        a_unique_dev, count_unique_dev = unique(a_dev)

        count_unique_dev = count_unique_dev.get()

        assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all()

        from gc import collect
        collect()
Andreas Klöckner
committed
@pytools.test.mark_test.opencl
def test_index_preservation(ctx_factory):
    """A scan whose input expression is the index ``i`` must emit 0..n-1."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel

    classes = [GenericScanKernel]
    dev = context.devices[0]
    # The debug scan kernel is only exercised on CPU devices.
    if dev.type == cl.device_type.CPU:
        classes.append(GenericDebugScanKernel)

    for cls in classes:
        for n in scan_test_counts:
            knl = cls(
                    context, np.int32,
                    arguments="__global int *out",
                    input_expr="i",
                    scan_expr="b", neutral="0",
                    output_statement="""
                        out[i] = item;
                        """)

            out = cl_array.empty(queue, n, dtype=np.int32)
            knl(out)

            assert (out.get() == np.arange(n)).all()

    from gc import collect
    collect()
@pytools.test.mark_test.opencl
def test_segmented_scan(ctx_factory):
context = ctx_factory()
queue = cl.CommandQueue(context)
from pyopencl.tools import dtype_to_ctype
dtype = np.int32
ctype = dtype_to_ctype(dtype)
#for is_exclusive in [False, True]:
for is_exclusive in [True, False]:
Andreas Klöckner
committed
if is_exclusive:
output_statement = "out[i] = prev_item"
Andreas Klöckner
committed
output_statement = "out[i] = item"
from pyopencl.scan import GenericScanKernel
knl = GenericScanKernel(context, dtype,
arguments="__global %s *ary, __global char *segflags, __global %s *out"
% (ctype, ctype),
input_expr="ary[i]",
scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",