diff --git a/pyopencl/array.py b/pyopencl/array.py
index a7a2a04c7f7a4f2492628bbac3a2cf6a0150705a..a03227f78462020a7622395bc993fb09797290e5 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2230,16 +2230,29 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
 
     chunk_size = _builtin_min(vec_count, 10)
 
+    # array of bools to specify whether the array of same index in this chunk
+    # will be filled with a single value.
+    use_fill = np.ndarray((chunk_size,), dtype=np.uint8)
+    array_lengths = np.ndarray((chunk_size,), dtype=np.int64)
+
     def make_func_for_chunk_size(chunk_size):
         knl = elementwise.get_put_kernel(
-                context,
-                a_dtype, dest_indices.dtype, vec_count=chunk_size)
+                context, a_dtype, dest_indices.dtype,
+                vec_count=chunk_size)
         return knl
 
     knl = make_func_for_chunk_size(chunk_size)
 
     for start_i in range(0, len(arrays), chunk_size):
         chunk_slice = slice(start_i, start_i+chunk_size)
+        for fill_idx, ary in enumerate(arrays[chunk_slice]):
+            # If there is only one value in the values array for this src array
+            # in the chunk then fill every index in `dest_idx` array with it.
+            use_fill[fill_idx] = 1 if ary.size == 1 else 0
+            array_lengths[fill_idx] = ary.size
+        # Copy the populated `use_fill` array to a buffer on the device.
+        use_fill_cla = to_device(queue, use_fill)
+        array_lengths_cla = to_device(queue, array_lengths)
 
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
@@ -2259,6 +2272,8 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + list(flatten(
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
+                    + [use_fill_cla.base_data, use_fill_cla.offset]
+                    + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
                 **dict(wait_for=wait_for))
 
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index f27181464d8f0d37a95cd88098a14f2acd9ebc1f..0e35df43baef4e9b3e8def49bff25e9813387a54 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -441,12 +441,20 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             ] + [
                 VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
+            ] + [
+                VectorArg(np.uint8, "use_fill", with_offset=True)
+            ] + [
+                VectorArg(np.int64, "val_ary_lengths", with_offset=True)
             ]
 
     body = (
             "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
-            + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i)
-                for i in range(vec_count)))
+            + "\n".join(
+                    "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
+                    "src{i}[i % val_ary_lengths[{i}]]);".format(i=i)
+                    for i in range(vec_count)
+                    )
+            )
 
     return get_elwise_kernel(context, args, body,
             preamble=dtype_to_c_struct(context.devices[0], dtype),
diff --git a/test/test_array.py b/test/test_array.py
index e89d71228b2a7c68dce2ae3b01c8b383fae736be..2c27e77f521cf698449864b52f4f9a4f5cfcd608 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1103,6 +1103,83 @@ def test_squeeze(ctx_factory):
     #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())
 
 
+def test_fancy_fill(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    numpy_dest = np.zeros((4,), np.int32)
+    numpy_idx = np.arange(3, dtype=np.int32)
+    numpy_src = np.arange(8, 9, dtype=np.int32)
+    numpy_dest[numpy_idx] = numpy_src
+
+    cl_dest = cl_array.zeros(queue, (4,), np.int32)
+    cl_idx = cl_array.arange(queue, 3, dtype=np.int32)
+    cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+
+def test_fancy_indexing(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    numpy_dest = np.zeros((4,), np.int32)
+    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
+    numpy_src = np.arange(8, 10, dtype=np.int32)
+    numpy_dest[numpy_idx] = numpy_src
+
+    cl_dest = cl_array.zeros(queue, (4,), np.int32)
+    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
+    cl_src = cl_array.arange(queue, 8, 10, dtype=np.int32)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+    cl_idx[1] = 3
+    cl_idx[2] = 2
+
+    numpy_idx[1] = 3
+    numpy_idx[2] = 2
+
+    numpy_dest[numpy_idx] = numpy_src
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+
+def test_multi_put(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    cl_arrays = [
+        cl_array.arange(queue, 0, 3, dtype=np.float32)
+        for i in range(1, 10)
+    ]
+    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
+    out_arrays = [
+        cl_array.zeros(queue, (10,), np.float32)
+        for i in range(9)
+    ]
+
+    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
+    # The put kernel reads src[i % length], so each 3-element source wraps
+    # around to cover all 6 destination indices; tile the reference to match.
+    for i, ary in enumerate(out_compare):
+        ary[idx.get()] = np.tile(np.arange(0, 3, dtype=np.float32), 2)
+
+    cl_array.multi_put(cl_arrays, idx, out=out_arrays)
+
+    assert all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.