Non-contiguous pyopencl buffer is created for 'vec'-tagged global arrays
test.py
import loopy as lp
import numpy as np
import pyopencl as cl
# Minimal reproducer: build a loopy kernel whose arrays have a 'vec'-tagged
# last axis, then call it without supplying the output array dphi.
ctx = cl.create_some_context()
# Three iname domains: j_outer in [0, 2048), j_inner in [0, 4), i in [0, 39).
knl = lp.make_kernel(['{[j_outer]: 0 <= j_outer < 2048}',
'{[j_inner]: 0 <= j_inner < 4}',
'{[i]: 0 <= i < 39}'],
# NOTE(review): in the instructions below, V is read from phi[j_outer, i, j_inner]
# on the line before the `for i` block opens -- confirm this placement is intended.
"""
for j_outer
for j_inner
<> V = phi[j_outer, i, j_inner]
for i
dphi[j_outer, i + 2, j_inner] = V * w[j_outer, i, j_inner]
end
end
end
""",
# All three arguments are float64 with a trailing axis of length 4;
# phi and dphi have 41 entries on axis 1, w has 39.
[lp.GlobalArg('phi', shape=(2048, 41, 4), dtype=np.float64),
lp.GlobalArg('dphi', shape=(2048, 41, 4), dtype=np.float64),
lp.GlobalArg('w', shape=(2048, 39, 4), dtype=np.float64)]
)
# Tag the length-4 iname as 'vec' and the length-2048 iname as 'g.0'.
knl = lp.tag_inames(knl, {'j_inner': 'vec', 'j_outer': 'g.0'})
# Tag the axes of phi, dphi and w as 'C,C,vec' (last axis is the 'vec' one).
knl = lp.tag_array_axes(knl, 'phi,dphi,w', 'C,C,vec')
queue = cl.CommandQueue(ctx)
# Turn on the write_wrapper option; presumably this is what emits the
# generated Python wrapper quoted later in this report.
knl = lp.set_options(knl, write_wrapper=True)
# phi and w are passed as host numpy arrays; dphi is omitted, so the
# generated wrapper has to allocate it itself.
print(knl(queue, phi=np.ones((2048, 41, 4)), w=np.ones((2048, 39, 4))))
The above kernel generates the following wrapper:
from __future__ import division
import numpy as _lpy_np
import pyopencl as _lpy_cl
import pyopencl.array as _lpy_cl_array
import pyopencl.tools as _lpy_cl_tools
# Generated host-side launcher: binds phi, dphi and w as kernel arguments
# 0..2, enqueues the kernel once and returns the resulting event.
# NOTE(review): leading indentation was lost when this code was pasted; the
# lines are kept exactly as quoted.
def _lpy_host_loopy_kernel(_lpy_cl_kernels, queue, phi, dphi, w, wait_for=None, allocator=None):
from struct import pack as _lpy_pack
import pyopencl as _lpy_cl
import pyopencl.tools
# A default allocator is created here, but nothing else in this function uses it.
if allocator is None:
allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
# No global temporaries are registered, so the release loop at the end has nothing to do.
_global_temporaries = []
pass
# {{{ enqueue loopy_kernel
_lpy_knl = _lpy_cl_kernels.loopy_kernel
assert _lpy_knl.num_args == 3
pass
_lpy_knl.set_arg(0, phi)
_lpy_knl.set_arg(1, dphi)
_lpy_knl.set_arg(2, w)
# Launch with size tuples (2048,) and (1,), passing g_times_l=True.
_lpy_evt = _lpy_cl.enqueue_nd_range_kernel(queue, _lpy_knl, (2048,), (1,), wait_for=wait_for, g_times_l=True)
wait_for = [_lpy_evt]
# }}}
for _tv in _global_temporaries:
_tv.release()
return _lpy_evt
# Generated invoker: uploads numpy inputs to the device, validates dtype,
# shape, strides and offset of phi, dphi and w, allocates dphi when it is not
# supplied, launches the kernel via _lpy_host_loopy_kernel and returns
# (event, (dphi,)).
# NOTE(review): leading indentation was lost when this code was pasted; the
# lines are kept exactly as quoted.
def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, phi=None, dphi=None, w=None):
if allocator is None:
allocator = _lpy_cl_tools.DeferredAllocator(queue.context)
# {{{ find integer arguments from shapes
# }}}
# {{{ find integer arguments from offsets
# }}}
# {{{ find integer arguments from strides
# }}}
# {{{ check that value args are present
# }}}
# {{{ set up array arguments
# These two flags record whether any argument arrived as a numpy array or as
# a non-numpy (device) object; they decide the out_host default further down.
_lpy_encountered_numpy = False
_lpy_encountered_dev = False
# {{{ process phi
if isinstance(phi, _lpy_np.ndarray):
# synchronous, nothing to worry about
phi = _lpy_cl_array.to_device(queue, phi, allocator=allocator)
_lpy_encountered_numpy = True
elif phi is not None:
_lpy_encountered_dev = True
if phi is None:
raise RuntimeError("input argument 'phi' must be supplied")
if True:
# NOTE(review): the comparison is against float64, but the message below
# reports a 4-component vector dtype as the expected type.
if phi.dtype != _lpy_np.float64:
raise TypeError("dtype mismatch on argument 'phi' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % phi.dtype)
if phi.shape != (2048, 41, 4):
raise TypeError("shape mismatch on argument 'phi' (got: %s, expected: %s)" % (phi.shape, (2048, 41, 4,)))
(_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = phi.shape
(_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = phi.strides
# NOTE(review): `not` binds tighter than `and`, so this evaluates as
# (not A) and B and C -- the error is raised only when the axis-0 test fails
# while the axis-1 and axis-2 tests pass. The whole conjunction was probably
# meant to be negated.
if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1312) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
_lpy_got = tuple(stride for (dim, stride) in zip(phi.shape, phi.strides) if dim > 1)
_lpy_expected = tuple(stride for (dim, stride) in zip(phi.shape, (1312, 32, 8,)) if dim > 1)
raise TypeError("strides mismatch on argument 'phi' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
if hasattr(phi, 'offset') and phi.offset:
raise ValueError("Argument 'phi' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")
# }}}
# {{{ process dphi
if isinstance(dphi, _lpy_np.ndarray):
# synchronous, nothing to worry about
dphi = _lpy_cl_array.to_device(queue, dphi, allocator=allocator)
_lpy_encountered_numpy = True
elif dphi is not None:
_lpy_encountered_dev = True
_lpy_made_by_loopy = False
# dphi was not supplied by the caller, so it is allocated here.
# NOTE(review): this is the allocation the report is about. Only two axes are
# created -- shape (2048, 41) with strides (1312, 32) -- whereas the check for
# caller-supplied arrays below expects shape (2048, 41, 4) with strides
# (1312, 32, 8). With 8-byte float64 items and an axis-1 stride of 32 bytes the
# resulting Array is not contiguous.
if dphi is None:
_lpy_shape_0 = 2048
_lpy_shape_1 = 41
_lpy_strides_0 = 1312
_lpy_strides_1 = 32
assert _lpy_strides_0 > 0, "'dphi' has negative stride in axis 0"
assert _lpy_strides_1 > 0, "'dphi' has negative stride in axis 1"
# NOTE(review): the size counts one 8-byte item at the last (axis 0, axis 1)
# position; that is 24 bytes less than a full (2048, 41, 4) float64 array.
_lpy_alloc_size = _lpy_strides_0*(_lpy_shape_0 + -1) + _lpy_strides_1*(_lpy_shape_1 + -1) + 8
dphi = _lpy_cl_array.Array(queue, (_lpy_shape_0, _lpy_shape_1), _lpy_np.float64, strides=(_lpy_strides_0, _lpy_strides_1), data=allocator(_lpy_alloc_size), allocator=allocator)
del _lpy_shape_0
del _lpy_strides_0
del _lpy_shape_1
del _lpy_strides_1
del _lpy_alloc_size
_lpy_made_by_loopy = True
# Validation is skipped for an array allocated just above.
if not _lpy_made_by_loopy:
# NOTE(review): same float64-vs-vector-dtype message mismatch as for phi.
if dphi.dtype != _lpy_np.float64:
raise TypeError("dtype mismatch on argument 'dphi' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % dphi.dtype)
if dphi.shape != (2048, 41, 4):
raise TypeError("shape mismatch on argument 'dphi' (got: %s, expected: %s)" % (dphi.shape, (2048, 41, 4,)))
(_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = dphi.shape
(_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = dphi.strides
# NOTE(review): same precedence issue as for phi -- `not` applies to the first
# parenthesised term only.
if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1312) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
_lpy_got = tuple(stride for (dim, stride) in zip(dphi.shape, dphi.strides) if dim > 1)
_lpy_expected = tuple(stride for (dim, stride) in zip(dphi.shape, (1312, 32, 8,)) if dim > 1)
raise TypeError("strides mismatch on argument 'dphi' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
if hasattr(dphi, 'offset') and dphi.offset:
raise ValueError("Argument 'dphi' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")
del _lpy_made_by_loopy
# }}}
# {{{ process w
if isinstance(w, _lpy_np.ndarray):
# synchronous, nothing to worry about
w = _lpy_cl_array.to_device(queue, w, allocator=allocator)
_lpy_encountered_numpy = True
elif w is not None:
_lpy_encountered_dev = True
if w is None:
raise RuntimeError("input argument 'w' must be supplied")
if True:
# NOTE(review): same float64-vs-vector-dtype message mismatch as for phi.
if w.dtype != _lpy_np.float64:
raise TypeError("dtype mismatch on argument 'w' (got: %s, expected: np:dtype([(('x', 's0'), '<f8'), (('y', 's1'), '<f8'), (('z', 's2'), '<f8'), (('w', 's3'), '<f8')]))" % w.dtype)
if w.shape != (2048, 39, 4):
raise TypeError("shape mismatch on argument 'w' (got: %s, expected: %s)" % (w.shape, (2048, 39, 4,)))
(_lpy_shape_0, _lpy_shape_1, _lpy_shape_2,) = w.shape
(_lpy_stride_0, _lpy_stride_1, _lpy_stride_2,) = w.strides
# NOTE(review): same precedence issue as for phi -- `not` applies to the first
# parenthesised term only.
if not (_lpy_shape_0 == 1 or _lpy_stride_0 == 1248) and (_lpy_shape_1 == 1 or _lpy_stride_1 == 32) and (_lpy_shape_2 == 1 or _lpy_stride_2 == 8):
_lpy_got = tuple(stride for (dim, stride) in zip(w.shape, w.strides) if dim > 1)
_lpy_expected = tuple(stride for (dim, stride) in zip(w.shape, (1248, 32, 8,)) if dim > 1)
raise TypeError("strides mismatch on argument 'w' (after removing unit length dims, got: %s, expected: %s)" % (_lpy_got, _lpy_expected))
if hasattr(w, 'offset') and w.offset:
raise ValueError("Argument 'w' does not allow arrays with offsets. Try passing default_offset=loopy.auto to make_kernel().")
# }}}
# }}}
# The launcher receives the base_data attribute of each array, not the Array objects.
_lpy_evt = _lpy_host_loopy_kernel(_lpy_cl_kernels, queue, phi.base_data, dphi.base_data, w.base_data, wait_for=wait_for)
# Default to returning host data only when every supplied array was a numpy array.
if out_host is None and (_lpy_encountered_numpy and not _lpy_encountered_dev):
out_host = True
if out_host:
pass
# The wrapper-allocated dphi is copied back to the host here; the report
# attributes the "must be contiguous" assertion to a get() call.
dphi = dphi.get(queue=queue)
return _lpy_evt, (dphi,)
Running this wrapper triggers an assertion error in pyopencl, raised from the `dphi.get()` call:
assert self.flags.forc, "Array in get() must be contiguous"
I will try to look into the `dphi` allocation when I get a chance, although I am not very familiar with creating pyopencl Arrays directly. As a workaround, one can simply pass a properly allocated numpy array for `dphi` to the kernel.
edit: on 382e89e9
Edited by Nick Curtis