Skip to content

Non-contiguous pyopencl buffer created for 'vec'-tagged global-arrays

test.py

import loopy as lp
import numpy as np
import pyopencl as cl
ctx = cl.create_some_context()

knl = lp.make_kernel(['{[j_outer]: 0 <= j_outer < 2048}',
					  '{[j_inner]: 0 <= j_inner < 4}',
					  '{[i]: 0 <= i < 39}'],
	"""
	for j_outer
		for j_inner
			<> V = phi[j_outer, i, j_inner]
			for i
				dphi[j_outer, i + 2, j_inner] = V * w[j_outer, i, j_inner]
			end
		end
	end
	""",
	[lp.GlobalArg('phi', shape=(2048, 41, 4), dtype=np.float64),
	 lp.GlobalArg('dphi', shape=(2048, 41, 4), dtype=np.float64),
	 lp.GlobalArg('w', shape=(2048, 39, 4), dtype=np.float64)]
	)

knl = lp.tag_inames(knl, {'j_inner': 'vec', 'j_outer': 'g.0'})
knl = lp.tag_array_axes(knl, 'phi,dphi,w', 'C,C,vec')

queue = cl.CommandQueue(ctx)

# write wrapper
knl = lp.set_options(knl, write_wrapper=True)
print(knl(queue, phi=np.ones((2048, 41, 4)), w=np.ones((2048, 39, 4))))

The above kernel generates the following wrapper:

from __future__ import division

import numpy as _lpy_np
import pyopencl as _lpy_cl
import pyopencl.array as _lpy_cl_array
import pyopencl.tools as _lpy_cl_tools

def _lpy_host_loopy_kernel(_lpy_cl_kernels, queue, phi, dphi, w, wait_for=None, allocator=None):
    from struct import pack as _lpy_pack
    import pyopencl as _lpy_cl
    import pyopencl.tools

    if allocator is None:
        allocator = _lpy_cl_tools.DeferredAllocator(queue.context)


    _global_temporaries = []

    pass
    # {{{ enqueue loopy_kernel

    _lpy_knl = _lpy_cl_kernels.loopy_kernel
    assert _lpy_knl.num_args == 3

    pass
    _lpy_knl.set_arg(0, phi)
    _lpy_knl.set_arg(1, dphi)
    _lpy_knl.set_arg(2, w)
    _lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, _lpy_knl, (2048,), (1,),  wait_for=wait_for, g_times_l=True)
    wait_for = [_lpy_evt]

    # }}}


    for _tv in _global_temporaries:
        _tv.release()

    return _lpy_evt

def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, phi=None, dphi=None, w=None):
    if allocator is None:
        allocator = _lpy_cl_tools.DeferredAllocator(queue.context)

    # {{{ find integer arguments from shapes

    # }}}

    # {{{ find integer arguments from offsets

    # }}}

    # {{{ find integer arguments from strides

    # }}}

    # {{{ check that value args are present

    # }}}

    # {{{ set up array arguments

    _lpy_encountered_numpy = False
    _lpy_encountered_dev = False

    # {{{ process phi

    if isinstance(phi, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        phi = _lpy_cl_array.to_device(queue, phi, allocator=allocator)
        _lpy_encountered_numpy = True
    elif phi is not None:
        _lpy_encountered_dev = True

    if phi is None:
        raise RuntimeError("input argument 'phi' must be supplied")

    if True:
        if phi.dtype != _lpy_np.float64:
            raise TypeError("dtype mismatch on argument 'phi' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % phi.dtype)
        if phi.shape != (2048, 41, 4):
            raise TypeError("shape mismatch on argument 'phi' (got: %s, expected: %s)" % (phi.shape, (2048, 41, 4,)))
        (_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = phi.shape
        (_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = phi.strides
        if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1312) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
            _lpy_got = tuple(stride for (dim, stride) in zip(phi.shape, phi.strides) if dim > 1)
            _lpy_expected = tuple(stride for (dim, stride) in zip(phi.shape, (1312, 32, 8,)) if dim > 1)
            raise TypeError("strides mismatch on argument 'phi' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
        if hasattr(phi, 'offset') and phi.offset:
            raise ValueError("Argument 'phi' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")


    # }}}

    # {{{ process dphi

    if isinstance(dphi, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        dphi = _lpy_cl_array.to_device(queue, dphi, allocator=allocator)
        _lpy_encountered_numpy = True
    elif dphi is not None:
        _lpy_encountered_dev = True

    _lpy_made_by_loopy = False

    if dphi is None:
        _lpy_shape_0 = 2048
        _lpy_shape_1 = 41
        _lpy_strides_0 = 1312
        _lpy_strides_1 = 32
        assert _lpy_strides_0 > 0, "'dphi' has negative stride in axis 0"
        assert _lpy_strides_1 > 0, "'dphi' has negative stride in axis 1"
        _lpy_alloc_size = _lpy_strides_0*(_lpy_shape_0 + -1) + _lpy_strides_1*(_lpy_shape_1 + -1) + 8
        dphi = _lpy_cl_array.Array(queue, (_lpy_shape_0, _lpy_shape_1), _lpy_np.float64, strides=(_lpy_strides_0, _lpy_strides_1), data=allocator(_lpy_alloc_size), allocator=allocator)
        del _lpy_shape_0
        del _lpy_strides_0
        del _lpy_shape_1
        del _lpy_strides_1
        del _lpy_alloc_size

        _lpy_made_by_loopy = True

    if not _lpy_made_by_loopy:
        if dphi.dtype != _lpy_np.float64:
            raise TypeError("dtype mismatch on argument 'dphi' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % dphi.dtype)
        if dphi.shape != (2048, 41, 4):
            raise TypeError("shape mismatch on argument 'dphi' (got: %s, expected: %s)" % (dphi.shape, (2048, 41, 4,)))
        (_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = dphi.shape
        (_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = dphi.strides
        if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1312) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
            _lpy_got = tuple(stride for (dim, stride) in zip(dphi.shape, dphi.strides) if dim > 1)
            _lpy_expected = tuple(stride for (dim, stride) in zip(dphi.shape, (1312, 32, 8,)) if dim > 1)
            raise TypeError("strides mismatch on argument 'dphi' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
        if hasattr(dphi, 'offset') and dphi.offset:
            raise ValueError("Argument 'dphi' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")

    del _lpy_made_by_loopy


    # }}}

    # {{{ process w

    if isinstance(w, _lpy_np.ndarray):
        # synchronous, nothing to worry about
        w = _lpy_cl_array.to_device(queue, w, allocator=allocator)
        _lpy_encountered_numpy = True
    elif w is not None:
        _lpy_encountered_dev = True

    if w is None:
        raise RuntimeError("input argument 'w' must be supplied")

    if True:
        if w.dtype != _lpy_np.float64:
            raise TypeError("dtype mismatch on argument 'w' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % w.dtype)
        if w.shape != (2048, 39, 4):
            raise TypeError("shape mismatch on argument 'w' (got: %s, expected: %s)" % (w.shape, (2048, 39, 4,)))
        (_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = w.shape
        (_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = w.strides
        if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1248) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
            _lpy_got = tuple(stride for (dim, stride) in zip(w.shape, w.strides) if dim > 1)
            _lpy_expected = tuple(stride for (dim, stride) in zip(w.shape, (1248, 32, 8,)) if dim > 1)
            raise TypeError("strides mismatch on argument 'w' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
        if hasattr(w, 'offset') and w.offset:
            raise ValueError("Argument 'w' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")


    # }}}

    # }}}

    _lpy_evt = _lpy_host_loopy_kernel(_lpy_cl_kernels, queue, phi.base_data, dphi.base_data, w.base_data, wait_for=wait_for)
    if out_host is None and (_lpy_encountered_numpy and not _lpy_encountered_dev):
        out_host = True
    if out_host:
        pass
        dphi = dphi.get(queue=queue)

    return _lpy_evt, (dphi,)

which generates an assertion error in pyopencl:

assert self.flags.forc, "Array in get() must be contiguous"

I will try to check out the dphi alloc when I get a chance -- not really familiar with straight pyopencl Array creation however. As a workaround, one can simply pass a properly allocated numpy array to the kernel

edit: on 382e89e9

Edited by Nick Curtis