    a_dev = clrand(queue, (5, 15, 20), dtype=np.float32)
    b_dev = clrand(queue, (4, 15, 20), dtype=np.float32)
    c_dev = clrand(queue, (3, 15, 20), dtype=np.float32)
    a = a_dev.get()
    b = b_dev.get()
    c = c_dev.get()

    cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev))
    cat = np.concatenate((a, b, c))

    assert la.norm(cat - cat_dev.get()) == 0

# }}}

# {{{ conditionals, any, all

def test_comparisons(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    ary_len = 20000
    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
    b_dev = clrand(queue, (ary_len,), dtype=np.float32)

    a = a_dev.get()
    b = b_dev.get()

    import operator as o
    for op in [o.eq, o.ne, o.le, o.lt, o.ge, o.gt]:
        res_dev = op(a_dev, b_dev)
        res = op(a, b)
        assert (res_dev.get() == res).all()

        res_dev = op(a_dev, 0)
        res = op(a, 0)
        assert (res_dev.get() == res).all()

        res_dev = op(0, b_dev)
        res = op(0, b)
        assert (res_dev.get() == res).all()

        res2_dev = op(0, res_dev)
        res2 = op(0, res)
        assert (res2_dev.get() == res2).all()


def test_any_all(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    ary_len = 20000
    a_dev = cl_array.zeros(queue, (ary_len,), dtype=np.int8)

    assert not a_dev.all().get()
    assert not a_dev.any().get()

    # set a single nonzero entry: any() becomes true, all() stays false
    a_dev[ary_len // 2] = 1
    assert not a_dev.all().get()
    assert a_dev.any().get()

    # make every entry nonzero: now both all() and any() hold
    a_dev.fill(1)
    assert a_dev.all().get()
    assert a_dev.any().get()
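

# map_to_host() maps the array's backing buffer into host memory; writes made
# on either side become visible to the other once the mapping is released and
# the queue has finished.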
def test_map_to_host(ctx_factory):
    if _PYPY:
        pytest.skip("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if context.devices[0].type & cl.device_type.GPU:
        mf = cl.mem_flags
        allocator = cl_tools.DeferredAllocator(
                context, mf.READ_WRITE | mf.ALLOC_HOST_PTR)
    else:
        allocator = None

    a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator)
    a_dev[3, 2, 1] = 10
    a_host = a_dev.map_to_host()
    a_host[1, 2, 3] = 10

    # keep a snapshot of the mapped host view before releasing the mapping
    a_host_saved = a_host.copy()
    a_host.base.release(queue)

    a_dev.finish()

    print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3])
    print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1])

    assert (a_host_saved == a_dev.get()).all()


def test_view_and_strides(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: no array creation from __array_interface__")
        return

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    x = clrand(queue, (5, 10), dtype=np.float32)
    y = x[:3, :5]

    yv = y.view()

    assert yv.shape == y.shape
    assert yv.strides == y.strides
    with pytest.raises(AssertionError):
        # fetching this non-contiguous view is not expected to match the slice
        assert (yv.get() == x.get()[:3, :5]).all()


def test_meshmode_view(ctx_factory):
    if _PYPY:
        # https://bitbucket.org/pypy/numpy/issue/28/indexerror-on-ellipsis-slice
        pytest.xfail("numpypy bug #28")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2
    result = cl.array.empty(queue, (2, n*6), np.float32)

    def view(z):
        return z[..., n*3:n*6].reshape(z.shape[:-1] + (n, 3))

    result = result.with_queue(queue)
    result.fill(0)
    view(result)[0].fill(1)
    view(result)[1].fill(1)

    x = result.get()
    assert (view(x) == 1).all()
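

# Every operation on an Array appends a cl.Event to its .events list;
# finish() waits on and clears them, and the list is trimmed so it does not
# grow without bound.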
def test_event_management(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    x = clrand(queue, (5, 10), dtype=np.float32)
    assert len(x.events) == 1, len(x.events)

    x.finish()
    assert len(x.events) == 0

    y = x + x
    assert len(y.events) == 1
    y = x * x
    assert len(y.events) == 1
    y = 2 * x
    assert len(y.events) == 1
    y = 2 / x
    assert len(y.events) == 1
    y = x / 2
    assert len(y.events) == 1
    y = x**2
    assert len(y.events) == 1
    y = 2**x
    assert len(y.events) == 1

    for i in range(10):
        x.fill(0)

    assert len(x.events) == 10

    for i in range(1000):
        x.fill(0)

    assert len(x.events) < 100


def test_reshape(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = np.arange(128).reshape(8, 16).astype(np.float32)
    a_dev = cl_array.to_device(queue, a)

    # different ways to specify the shape
    a_dev.reshape(4, 32)
    a_dev.reshape((4, 32))
    a_dev.reshape([4, 32])

    # using -1 as unknown dimension
    assert a_dev.reshape(-1, 32).shape == (4, 32)
    assert a_dev.reshape((32, -1)).shape == (32, 4)
    assert a_dev.reshape((8, -1, 4)).shape == (8, 4, 4)

    import pytest
    with pytest.raises(ValueError):
        a_dev.reshape(-1, -1, 4)


def test_skip_slicing(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a_host = np.arange(16).reshape((4, 4))
    b_host = a_host[::3]
    a = cl_array.to_device(queue, a_host)
    b = a[::3]

    assert b.shape == b_host.shape
    assert np.array_equal(b[1].get(), b_host[1])  # noqa pylint:disable=unsubscriptable-object


def test_transpose(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    # FIXME: not contiguous
    # assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0)))
    assert np.array_equal(a_gpu.T.get(), a.T)


def test_newaxis(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    b_gpu = a_gpu[:, np.newaxis]
    b = a[:, np.newaxis]

    assert b_gpu.shape == b.shape
    for i in range(b.ndim):
        if b.shape[i] > 1:
            assert b_gpu.strides[i] == b.strides[i]
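

# squeeze() removes length-1 axes; the result is only contiguous when the
# sliced source was contiguous to begin with.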
def test_squeeze(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    shape = (40, 2, 5, 100)
    a_cpu = np.random.random(size=shape)
    a_gpu = cl_array.to_device(queue, a_cpu)

    # Slice with length 1 on dimensions 0 and 1
    a_gpu_slice = a_gpu[0:1, 1:2, :, :]
    assert a_gpu_slice.shape == (1, 1, shape[2], shape[3])
    assert a_gpu_slice.flags.c_contiguous

    # Squeeze it and obtain contiguity
    a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze()
    assert a_gpu_squeezed_slice.shape == (shape[2], shape[3])
    assert a_gpu_squeezed_slice.flags.c_contiguous

    # Check that we get the original values out
    # assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())

    # Slice with length 1 on dimension 2
    a_gpu_slice = a_gpu[:, :, 2:3, :]
    assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3])
    assert not a_gpu_slice.flags.c_contiguous

    # Squeeze it; no contiguity here
    a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze()
    assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3])
    assert not a_gpu_squeezed_slice.flags.c_contiguous

    # Check that we get the original values out
    # assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())


def test_fancy_fill(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    numpy_dest = np.zeros((4,), np.int32)
    numpy_idx = np.arange(3, dtype=np.int32)
    numpy_src = np.arange(8, 9, dtype=np.int32)
    numpy_dest[numpy_idx] = numpy_src

    cl_dest = cl_array.zeros(queue, (4,), np.int32)
    cl_idx = cl_array.arange(queue, 3, dtype=np.int32)
    cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())
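

# Fancy (integer-array) indexing on device arrays, both as an assignment
# target and as a gather source, should match the corresponding numpy
# operations.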
def test_fancy_indexing(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2**20 + 2**18 + 22
    numpy_dest = np.zeros(n, dtype=np.int32)
    numpy_idx = np.arange(n, dtype=np.int32)
    np.random.shuffle(numpy_idx)
    numpy_src = 20000 + np.arange(n, dtype=np.int32)

    cl_dest = cl_array.to_device(queue, numpy_dest)
    cl_idx = cl_array.to_device(queue, numpy_idx)
    cl_src = cl_array.to_device(queue, numpy_src)

    numpy_dest[numpy_idx] = numpy_src
    cl_dest[cl_idx] = cl_src
    assert np.array_equal(numpy_dest, cl_dest.get())

    numpy_dest = numpy_src[numpy_idx]
    cl_dest = cl_src[cl_idx]
    assert np.array_equal(numpy_dest, cl_dest.get())


def test_multi_put(ctx_factory):
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # nine source arrays, each supplying the six values scattered to idx
    cl_arrays = [
        cl_array.arange(queue, 0, 6, dtype=np.float32)
        for i in range(9)
    ]
    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
    out_arrays = [
        cl_array.zeros(queue, (10,), np.float32)
        for i in range(9)
    ]

    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
    for i, ary in enumerate(out_compare):
        ary[idx.get()] = np.arange(0, 6, dtype=np.float32)

    cl_array.multi_put(cl_arrays, idx, out=out_arrays)

    # compare each output against its numpy reference; use the builtin all(),
    # since np.all() over a generator is vacuously true
    assert all(np.array_equal(out_compare[i], out_arrays[i].get())
               for i in range(9))
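

# get_async() returns a (host_array, event) pair; the transfer is only
# complete once the event has been waited on. get(async_=True) is the
# deprecated spelling of the same operation.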
def test_get_async(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("the async get test fails on POCL + Nvidia, "
                "at least the K40, as of pocl 1.6, 2021-01-20")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    b = a + a**5 + 1
    b_gpu = a_gpu + a_gpu**5 + 1

    # deprecated, but still test
    b1 = b_gpu.get(async_=True)  # testing that this waits for events
    b_gpu.finish()
    assert np.abs(b1 - b).mean() < 1e-5

    b1, evt = b_gpu.get_async()  # testing that this waits for events
    evt.wait()
    assert np.abs(b1 - b).mean() < 1e-5

    wait_event = cl.UserEvent(context)
    b_gpu.add_event(wait_event)
    b, evt = b_gpu.get_async()  # testing that this doesn't hang
    wait_event.set_status(cl.command_execution_status.COMPLETE)
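

# The tests below request an out-of-order command queue (skipping if the
# device does not provide one) and check that the implicit dependencies
# recorded in Array.events keep results correct when commands may execute
# out of order.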
def test_outoforderqueue_get(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    b_gpu = a_gpu + a_gpu**5 + 1
    b1 = b_gpu.get()  # testing that this waits for events
    b = a + a**5 + 1
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_copy(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)
    c_gpu = a_gpu**2 - 7
    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
    b_gpu *= 10
    queue.finish()
    b1 = b_gpu.get()
    b = 10 * (a**2 - 7)
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_indexing(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype("int32"))
    a_gpu = cl_array.to_device(queue, a)
    i_gpu = cl_array.to_device(queue, i)
    c_gpu = (a_gpu**2)[i_gpu - 10000]
    b_gpu = 10 - a_gpu
    b_gpu[:] = 8 * a_gpu
    b_gpu[i_gpu + 10000] = c_gpu - 10
    queue.finish()
    b1 = b_gpu.get()
    c = (a**2)[i - 10000]
    b = 8 * a
    b[i + 10000] = c - 10
    assert np.abs(b1 - b).mean() < 1e-5


def test_outoforderqueue_reductions(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
                properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")

    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype("float32"))
    a[800000] = 10  # all<5 looks true until near the end
    a_gpu = cl_array.to_device(queue, a)
    b1 = cl_array.sum(a_gpu).get()
    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
    b3 = (a_gpu < 5).all().get()
    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0


def test_negative_dim_rejection(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    with pytest.raises(ValueError):
        cl_array.Array(queue, shape=-10, dtype=np.float64)

    with pytest.raises(ValueError):
        cl_array.Array(queue, shape=(-10,), dtype=np.float64)

    for left_dim in (-1, 0, 1):
        with pytest.raises(ValueError):
            cl_array.Array(queue, shape=(left_dim, -1), dtype=np.float64)

    for right_dim in (-1, 0, 1):
        with pytest.raises(ValueError):
            cl_array.Array(queue, shape=(-1, right_dim), dtype=np.float64)
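

# Zero-size arrays should behave like their numpy counterparts: arithmetic,
# round-tripping through get()/to_device(), reshape and the contiguity flags
# all need to agree.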
@pytest.mark.parametrize("empty_shape", [0, (), (3, 0, 2), (0, 5), (5, 0)])
def test_zero_size_array(ctx_factory, empty_shape):
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    if queue.device.platform.name == "Intel(R) OpenCL":
        pytest.xfail("size-0 arrays fail on Intel CL")

    a = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b = cl_array.zeros(queue, empty_shape, dtype=np.float32)
    b.fill(1)
    c = a + b
    c_host = c.get()
    cl_array.to_device(queue, c_host)

    assert c.flags.c_contiguous == c_host.flags.c_contiguous
    assert c.flags.f_contiguous == c_host.flags.f_contiguous

    for order in "CF":
        c_flat = c.reshape(-1, order=order)
        c_host_flat = c_host.reshape(-1, order=order)

        assert c_flat.shape == c_host_flat.shape
        assert c_flat.strides == c_host_flat.strides
        assert c_flat.flags.c_contiguous == c_host_flat.flags.c_contiguous
        assert c_flat.flags.f_contiguous == c_host_flat.flags.f_contiguous


def test_str_without_queue(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = cl_array.zeros(queue, 10, dtype=np.float32).with_queue(None)
    print(str(a))
    print(repr(a))
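

# test_stack replicates pytato's stack test: inputs with 1, 2 and 3 dimensions
# (matching the (2, 2, 2) shape prefix selected below) are stacked in both
# C and F order and compared against np.stack.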
@pytest.mark.parametrize("order", ("F", "C"))
@pytest.mark.parametrize("input_dims", (1, 2, 3))
def test_stack(ctx_factory, input_dims, order):
    # Replicates pytato/test/test_codegen.py::test_stack
    import pyopencl.array as cla

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    shape = (2, 2, 2)[:input_dims]
    axis = -1 if order == "F" else 0

    from numpy.random import default_rng
    rng = default_rng()
    x_in = rng.random(size=shape)
    y_in = rng.random(size=shape)
    x_in = x_in if order == "C" else np.asfortranarray(x_in)
    y_in = y_in if order == "C" else np.asfortranarray(y_in)

    x = cla.to_device(queue, x_in)
    y = cla.to_device(queue, y_in)

    np.testing.assert_allclose(cla.stack((x, y), axis=axis).get(),
                               np.stack((x_in, y_in), axis=axis))


def test_assign_different_strides(ctx_factory):
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    from pyopencl.clrandom import rand as clrand
    a = clrand(queue, (20, 30), dtype=np.float32)
    b = cl_array.empty(queue, (20, 30), dtype=np.float32, order="F")
    with pytest.raises(NotImplementedError):
        b[:] = a


def test_branch_operations_on_pure_scalars(ctx_factory):
    x = np.random.rand()
    y = np.random.rand()
    cond = np.random.choice([False, True])

    np.testing.assert_allclose(np.maximum(x, y),
                               cl_array.maximum(x, y))
    np.testing.assert_allclose(np.minimum(x, y),
                               cl_array.minimum(x, y))
    np.testing.assert_allclose(np.where(cond, x, y),
                               cl_array.if_positive(cond, x, y))


if __name__ == "__main__":
    if len(sys.argv) > 1: