    a_dev = clrand(queue, (5, 15, 20), dtype=np.float32)
    b_dev = clrand(queue, (4, 15, 20), dtype=np.float32)
    c_dev = clrand(queue, (3, 15, 20), dtype=np.float32)
    a = a_dev.get()
    b = b_dev.get()
    c = c_dev.get()

    cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev))
    cat = np.concatenate((a, b, c))

    assert la.norm(cat - cat_dev.get()) == 0

# }}}

# {{{ conditionals, any, all

def test_comparisons(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    ary_len = 20000
    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
    b_dev = clrand(queue, (ary_len,), dtype=np.float32)

    a = a_dev.get()
    b = b_dev.get()

    import operator as o
    for op in [o.eq, o.ne, o.le, o.lt, o.ge, o.gt]:
        res_dev = op(a_dev, b_dev)
        res = op(a, b)
        assert (res_dev.get() == res).all()

        res_dev = op(a_dev, 0)
        res = op(a, 0)
        assert (res_dev.get() == res).all()

        res_dev = op(0, b_dev)
        res = op(0, b)
        assert (res_dev.get() == res).all()

        res2_dev = op(0, res_dev)
        res2 = op(0, res)
        assert (res2_dev.get() == res2).all()


def test_any_all(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    ary_len = 20000
    a_dev = cl_array.zeros(queue, (ary_len,), dtype=np.int8)

    assert not a_dev.all().get()
    assert not a_dev.any().get()

    # set a single nonzero entry: any() becomes true, all() stays false
    a_dev[ary_len // 2] = 1
    assert not a_dev.all().get()
    assert a_dev.any().get()

    # make every entry nonzero: now both all() and any() hold
    a_dev.fill(1)
    assert a_dev.all().get()
    assert a_dev.any().get()
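

# map_to_host() maps the array's backing buffer into host memory; writes made
# on either side become visible to the other once the mapping is released and
# the queue has finished.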
def test_map_to_host(ctx_factory):
    if _PYPY:
        pytest.skip("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if context.devices[0].type & cl.device_type.GPU:
        mf = cl.mem_flags
        allocator = cl_tools.DeferredAllocator(
                context, mf.READ_WRITE | mf.ALLOC_HOST_PTR)
    else:
        allocator = None

    a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator)
    a_dev[3, 2, 1] = 10
    a_host = a_dev.map_to_host()
    a_host[1, 2, 3] = 10

    # keep a snapshot of the mapped host view before releasing the mapping
    a_host_saved = a_host.copy()
    a_host.base.release(queue)

    a_dev.finish()

    print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3])
    print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1])

    assert (a_host_saved == a_dev.get()).all()


def test_view_and_strides(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: no array creation from __array_interface__")
        return

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    x = clrand(queue, (5, 10), dtype=np.float32)
    y = x[:3, :5]

    yv = y.view()

    assert yv.shape == y.shape
    assert yv.strides == y.strides
    with pytest.raises(AssertionError):
        # fetching this non-contiguous view is not expected to match the slice
        assert (yv.get() == x.get()[:3, :5]).all()


def test_meshmode_view(ctx_factory):
    if _PYPY:
        # https://bitbucket.org/pypy/numpy/issue/28/indexerror-on-ellipsis-slice
        pytest.xfail("numpypy bug #28")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2
    result = cl.array.empty(queue, (2, n*6), np.float32)

    def view(z):
        return z[..., n*3:n*6].reshape(z.shape[:-1] + (n, 3))

    result = result.with_queue(queue)
    result.fill(0)
    view(result)[0].fill(1)
    view(result)[1].fill(1)

    x = result.get()
    assert (view(x) == 1).all()
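

# Every operation on an Array appends a cl.Event to its .events list;
# finish() waits on and clears them, and the list is trimmed so it does not
# grow without bound.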
def test_event_management(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    x = clrand(queue, (5, 10), dtype=np.float32)
    assert len(x.events) == 1, len(x.events)

    x.finish()
    assert len(x.events) == 0

    y = x + x
    assert len(y.events) == 1
    y = x * x
    assert len(y.events) == 1
    y = 2 * x
    assert len(y.events) == 1
    y = 2 / x
    assert len(y.events) == 1
    y = x / 2
    assert len(y.events) == 1
    y = x**2
    assert len(y.events) == 1
    y = 2**x
    assert len(y.events) == 1

    for i in range(10):
        x.fill(0)

    assert len(x.events) == 10

    for i in range(1000):
        x.fill(0)

    assert len(x.events) < 100


def test_reshape(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = np.arange(128).reshape(8, 16).astype(np.float32)
    a_dev = cl_array.to_device(queue, a)

    # different ways to specify the shape
    a_dev.reshape(4, 32)
    a_dev.reshape((4, 32))
    a_dev.reshape([4, 32])

    # using -1 as unknown dimension
    assert a_dev.reshape(-1, 32).shape == (4, 32)
    assert a_dev.reshape((32, -1)).shape == (32, 4)
    assert a_dev.reshape((8, -1, 4)).shape == (8, 4, 4)

    import pytest
    with pytest.raises(ValueError):
        a_dev.reshape(-1, -1, 4)


def test_skip_slicing(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a_host = np.arange(16).reshape((4, 4))
    b_host = a_host[::3]
    a = cl_array.to_device(queue, a_host)
    b = a[::3]

    assert b.shape == b_host.shape
    assert np.array_equal(b[1].get(), b_host[1])  # noqa pylint:disable=unsubscriptable-object


def test_transpose(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    # FIXME: not contiguous
    # assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0)))
    assert np.array_equal(a_gpu.T.get(), a.T)


def test_newaxis(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    b_gpu = a_gpu[:, np.newaxis]
    b = a[:, np.newaxis]

    assert b_gpu.shape == b.shape
    for i in range(b.ndim):
        if b.shape[i] > 1:
            assert b_gpu.strides[i] == b.strides[i]
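

# squeeze() removes length-1 axes; the result is only contiguous when the
# sliced source was contiguous to begin with.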
def test_squeeze(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    shape = (40, 2, 5, 100)
    a_cpu = np.random.random(size=shape)
    a_gpu = cl_array.to_device(queue, a_cpu)

    # Slice with length 1 on dimensions 0 and 1
    a_gpu_slice = a_gpu[0:1, 1:2, :, :]
    assert a_gpu_slice.shape == (1, 1, shape[2], shape[3])
    assert a_gpu_slice.flags.c_contiguous

    # Squeeze it and obtain contiguity
    a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze()
    assert a_gpu_squeezed_slice.shape == (shape[2], shape[3])
    assert a_gpu_squeezed_slice.flags.c_contiguous

    # Check that we get the original values out
    # assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())

    # Slice with length 1 on dimension 2
    a_gpu_slice = a_gpu[:, :, 2:3, :]
    assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3])
    assert not a_gpu_slice.flags.c_contiguous

    # Squeeze it; no contiguity here
    a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze()
    assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3])
    assert not a_gpu_squeezed_slice.flags.c_contiguous

    # Check that we get the original values out
    # assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())


def test_fancy_fill(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    numpy_dest = np.zeros((4,), np.int32)
    numpy_idx = np.arange(3, dtype=np.int32)
    numpy_src = np.arange(8, 9, dtype=np.int32)
    numpy_dest[numpy_idx] = numpy_src

    cl_dest = cl_array.zeros(queue, (4,), np.int32)
    cl_idx = cl_array.arange(queue, 3, dtype=np.int32)
    cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())
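

# Fancy (integer-array) indexing on device arrays, both as an assignment
# target and as a gather source, should match the corresponding numpy
# operations.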
def test_fancy_indexing(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2**20 + 2**18 + 22
    numpy_dest = np.zeros(n, dtype=np.int32)
    numpy_idx = np.arange(n, dtype=np.int32)
    np.random.shuffle(numpy_idx)
    numpy_src = 20000 + np.arange(n, dtype=np.int32)

    cl_dest = cl_array.to_device(queue, numpy_dest)
    cl_idx = cl_array.to_device(queue, numpy_idx)
    cl_src = cl_array.to_device(queue, numpy_src)

    numpy_dest[numpy_idx] = numpy_src
    cl_dest[cl_idx] = cl_src
    assert np.array_equal(numpy_dest, cl_dest.get())

    numpy_dest = numpy_src[numpy_idx]
    cl_dest = cl_src[cl_idx]
    assert np.array_equal(numpy_dest, cl_dest.get())


def test_multi_put(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # nine source arrays, each supplying the six values scattered to idx
    cl_arrays = [
        cl_array.arange(queue, 0, 6, dtype=np.float32)
        for i in range(9)
    ]
    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
    out_arrays = [
        cl_array.zeros(queue, (10,), np.float32)
        for i in range(9)
    ]

    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
    for i, ary in enumerate(out_compare):
        ary[idx.get()] = np.arange(0, 6, dtype=np.float32)

    cl_array.multi_put(cl_arrays, idx, out=out_arrays)

    # compare each output against its numpy reference; use the builtin all(),
    # since np.all() over a generator is vacuously true
    assert all(np.array_equal(out_compare[i], out_arrays[i].get())
               for i in range(9))
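

# get_async() returns a (host_array, event) pair; the transfer is only
# complete once the event has been waited on. get(async_=True) is the
# deprecated spelling of the same operation.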
def test_get_async(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("the async get test fails on POCL + Nvidia, "
                "at least the K40, as of pocl 1.6, 2021-01-20")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    b = a + a**5 + 1
    b_gpu = a_gpu + a_gpu**5 + 1

    # deprecated, but still test
    b1 = b_gpu.get(async_=True)  # testing that this waits for events
    b_gpu.finish()
    assert np.abs(b1 - b).mean() < 1e-5

    b1, evt = b_gpu.get_async()  # testing that this waits for events
    evt.wait()
    assert np.abs(b1 - b).mean() < 1e-5

    wait_event = cl.UserEvent(context)
    b_gpu.add_event(wait_event)
    b, evt = b_gpu.get_async()  # testing that this doesn't hang
    wait_event.set_status(cl.command_execution_status.COMPLETE)
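

# The tests below request an out-of-order command queue (skipping if the
# device does not provide one) and check that the implicit dependencies
# recorded in Array.events keep results correct when commands may execute
# out of order.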
def test_outoforderqueue_get(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    b_gpu = a_gpu + a_gpu**5 + 1
    b1 = b_gpu.get()  # testing that this waits for events
    b = a + a**5 + 1
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_copy(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    c_gpu = a_gpu**2 - 7
    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
    b_gpu *= 10
    queue.finish()
    b1 = b_gpu.get()
    b = 10 * (a**2 - 7)
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_indexing(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype("int32"))
    a_gpu = cl_array.to_device(queue, a)
    i_gpu = cl_array.to_device(queue, i)
    c_gpu = (a_gpu**2)[i_gpu - 10000]
    b_gpu = 10 - a_gpu
    b_gpu[:] = 8 * a_gpu
    b_gpu[i_gpu + 10000] = c_gpu - 10
    queue.finish()
    b1 = b_gpu.get()
    c = (a**2)[i - 10000]
    b = 8 * a
    b[i + 10000] = c - 10
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_reductions(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype("float32"))
    a[800000] = 10  # all<5 looks true until near the end
    a_gpu = cl_array.to_device(queue, a)
    b1 = cl_array.sum(a_gpu).get()
    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
    b3 = (a_gpu < 5).all().get()
    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0


def test_negative_dim_rejection(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    with pytest.raises(ValueError):
        cl_array.Array(queue, shape=-10, dtype=np.float64)

    with pytest.raises(ValueError):
        cl_array.Array(queue, shape=(-10,), dtype=np.float64)

    for left_dim in (-1, 0, 1):
        with pytest.raises(ValueError):
            cl_array.Array(queue, shape=(left_dim, -1), dtype=np.float64)

    for right_dim in (-1, 0, 1):
        with pytest.raises(ValueError):
            cl_array.Array(queue, shape=(-1, right_dim), dtype=np.float64)
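

# Zero-size arrays should behave like their numpy counterparts: arithmetic,
# round-tripping through get()/to_device(), reshape and the contiguity flags
# all need to agree.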
@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2), (0, 5), (5, 0)])
def test_zero_size_array(ctx_factory, empty_shape):
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    if queue.device.platform.name == "Intel(R) OpenCL":
        pytest.xfail("size-0 arrays fail on Intel CL")

    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b.fill(1)
    c = a + b
    c_host = c.get()
    cl_array.to_device(queue, c_host)

    assert c.flags.c_contiguous == c_host.flags.c_contiguous
    assert c.flags.f_contiguous == c_host.flags.f_contiguous

    for order in "CF":
        c_flat = c.reshape(-1, order=order)
        c_host_flat = c_host.reshape(-1, order=order)

        assert c_flat.shape == c_host_flat.shape
        assert c_flat.strides == c_host_flat.strides
        assert c_flat.flags.c_contiguous == c_host_flat.flags.c_contiguous
        assert c_flat.flags.f_contiguous == c_host_flat.flags.f_contiguous


def test_str_without_queue(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = cl_array.zeros(queue, 10, dtype=np.float32).with_queue(None)
    print(str(a))
    print(repr(a))
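

# test_stack replicates pytato's stack test: inputs with 1, 2 and 3 dimensions
# (matching the (2, 2, 2) shape prefix selected below) are stacked in both
# C and F order and compared against np.stack.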
@pytest.mark.parametrize("order", ("F", "C"))
@pytest.mark.parametrize("input_dims", (1, 2, 3))
def test_stack(ctx_factory, input_dims, order):
    # Replicates pytato/test/test_codegen.py::test_stack
    import pyopencl.array as cla

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    shape = (2, 2, 2)[:input_dims]
    axis = -1 if order == "F" else 0

    from numpy.random import default_rng
    rng = default_rng()
    x_in = rng.random(size=shape)
    y_in = rng.random(size=shape)
    x_in = x_in if order == "C" else np.asfortranarray(x_in)
    y_in = y_in if order == "C" else np.asfortranarray(y_in)

    x = cla.to_device(queue, x_in)
    y = cla.to_device(queue, y_in)

    np.testing.assert_allclose(cla.stack((x, y), axis=axis).get(),
                               np.stack((x_in, y_in), axis=axis))


def test_assign_different_strides(ctx_factory):
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    from pyopencl.clrandom import rand as clrand
    a = clrand(queue, (20, 30), dtype=np.float32)
    b = cl_array.empty(queue, (20, 30), dtype=np.float32, order="F")
    with pytest.raises(NotImplementedError):
        b[:] = a


def test_branch_operations_on_pure_scalars(ctx_factory):
    x = np.random.rand()
    y = np.random.rand()
    cond = np.random.choice([False, True])

    np.testing.assert_allclose(np.maximum(x, y),
                               cl_array.maximum(x, y))
    np.testing.assert_allclose(np.minimum(x, y),
                               cl_array.minimum(x, y))
    np.testing.assert_allclose(np.where(cond, x, y),
                               cl_array.if_positive(cond, x, y))


if __name__ == "__main__":
    if len(sys.argv) > 1: