From 45194649df48dc36f05b2775245d7a278209ef2b Mon Sep 17 00:00:00 2001 From: henry Date: Sat, 5 Nov 2016 11:30:20 -0700 Subject: [PATCH 1/6] * Allocate device array to hold single fill values for arrays in chunk. * Update `get_put_kernel` to check if fill value exists, and use it if so. --- pyopencl/array.py | 16 ++++++++++++++-- pyopencl/elementwise.py | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index a7a2a04c..31dbe450 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -2230,16 +2230,27 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, chunk_size = _builtin_min(vec_count, 10) + # np array to hold fill vals for each array in `arrays` chunk. + fill_vals = np.ndarray((chunk_size,), dtype=a_dtype) + # device buffer + fill_vals_cla = zeros(queue, shape=fill_vals.shape, dtype=a_dtype) + def make_func_for_chunk_size(chunk_size): knl = elementwise.get_put_kernel( - context, - a_dtype, dest_indices.dtype, vec_count=chunk_size) + context, a_dtype, dest_indices.dtype, + vec_count=chunk_size) return knl knl = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) + # load the fill_vals np array with 0 if no fill and the fill value + # if the values array has only one item. 
+ for fill_idx, ary in enumerate(arrays[chunk_slice]): + fill_vals[fill_idx] = ary.get()[0] if ary.size == 1 else 0 + #copy the populated fill_vals array to the buffer on device + fill_vals_cla.set(fill_vals) if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) @@ -2259,6 +2270,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, + list(flatten( (i.base_data, i.offset) for i in arrays[chunk_slice])) + + [fill_vals_cla.base_data, fill_vals_cla.offset] + [dest_indices.size]), **dict(wait_for=wait_for)) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index f2718146..ec0a19fb 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -441,11 +441,14 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): ] + [ VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) + ] + [ + VectorArg(dtype, "fill_with_single", with_offset=True) + for i in range(vec_count) ] body = ( "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + + "\n".join("dest%d[dest_idx] = fill_with_single[%d] or src%d[i];" % (i, i, i) for i in range(vec_count))) return get_elwise_kernel(context, args, body, -- GitLab From 688854983e8dc8ab7111ebb266deaba5cea2a976 Mon Sep 17 00:00:00 2001 From: henry Date: Thu, 1 Dec 2016 17:18:17 -0800 Subject: [PATCH 2/6] Construct and pass `use_fill` and `array_lengths` arrays to `get_put_kernel` from array multi_put. --- pyopencl/array.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pyopencl/array.py b/pyopencl/array.py index 31dbe450..a03227f7 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -2230,10 +2230,10 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, chunk_size = _builtin_min(vec_count, 10) - # np array to hold fill vals for each array in `arrays` chunk. 
- fill_vals = np.ndarray((chunk_size,), dtype=a_dtype) - # device buffer - fill_vals_cla = zeros(queue, shape=fill_vals.shape, dtype=a_dtype) + # array of bools to specify whether the array of same index in this chunk + # will be filled with a single value. + use_fill = np.ndarray((chunk_size,), dtype=np.uint8) + array_lengths = np.ndarray((chunk_size,), dtype=np.int64) def make_func_for_chunk_size(chunk_size): knl = elementwise.get_put_kernel( @@ -2245,12 +2245,14 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, for start_i in range(0, len(arrays), chunk_size): chunk_slice = slice(start_i, start_i+chunk_size) - # load the fill_vals np array with 0 if no fill and the fill value - # if the values array has only one item. for fill_idx, ary in enumerate(arrays[chunk_slice]): - fill_vals[fill_idx] = ary.get()[0] if ary.size == 1 else 0 - #copy the populated fill_vals array to the buffer on device - fill_vals_cla.set(fill_vals) + # If there is only one value in the values array for this src array + # in the chunk then fill every index in `dest_idx` array with it. + use_fill[fill_idx] = 1 if ary.size == 1 else 0 + array_lengths[fill_idx] = len(ary) + # Copy the populated `use_fill` array to a buffer on the device. 
+ use_fill_cla = to_device(queue, use_fill) + array_lengths_cla = to_device(queue, array_lengths) if start_i + chunk_size > vec_count: knl = make_func_for_chunk_size(vec_count-start_i) @@ -2270,7 +2272,8 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None, + list(flatten( (i.base_data, i.offset) for i in arrays[chunk_slice])) - + [fill_vals_cla.base_data, fill_vals_cla.offset] + + [use_fill_cla.base_data, use_fill_cla.offset] + + [array_lengths_cla.base_data, array_lengths_cla.offset] + [dest_indices.size]), **dict(wait_for=wait_for)) -- GitLab From 94b597591684c87e9102572adef1250ace5d4580 Mon Sep 17 00:00:00 2001 From: henry Date: Thu, 1 Dec 2016 17:21:08 -0800 Subject: [PATCH 3/6] Add `use_fill` and `val_ary_lengths` to `get_put_kernel` VectorArgs. Modify the kernel to fill all indices if there is only one value passed in the values array, and to use modulo mode otherwise (to match NumPy's behavior). --- pyopencl/elementwise.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index ec0a19fb..d3655783 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -442,14 +442,19 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) ] + [ - VectorArg(dtype, "fill_with_single", with_offset=True) - for i in range(vec_count) + VectorArg(np.uint8, "use_fill", with_offset=True) + ] + [ + VectorArg(np.int64, "val_ary_lengths", with_offset=True) ] body = ( "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = fill_with_single[%d] or src%d[i];" % (i, i, i) - for i in range(vec_count))) + + "\n".join( + "dest{i}[dest_idx] = (use_fill[{i}] ? 
src{i}[0] : " + "src{i}[i % val_ary_lengths[{i}]]);".format(i=i) + for i in range(vec_count) + ) + ) return get_elwise_kernel(context, args, body, preamble=dtype_to_c_struct(context.devices[0], dtype), -- GitLab From 1d5a4269bdeec3273fc7ab363fe0f4e5ca48cc85 Mon Sep 17 00:00:00 2001 From: henry Date: Thu, 1 Dec 2016 17:21:55 -0800 Subject: [PATCH 4/6] Add tests for new `multi_put` scheme and new `get_put_kernel`. --- test/test_array.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/test/test_array.py b/test/test_array.py index e89d7122..d4ea8cd7 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1103,6 +1103,74 @@ def test_squeeze(ctx_factory): #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel()) +def test_fancy_fill(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + numpy_dest = np.zeros((4,), np.int32) + numpy_idx = np.arange(3, dtype=np.int32) + numpy_src = np.arange(8, 9, dtype=np.int32) + numpy_dest[numpy_idx] = numpy_src + + cl_dest = cl_array.zeros(queue, (4,), np.int32) + cl_idx = cl_array.arange(queue, 3, dtype=np.int32) + cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32) + cl_dest[cl_idx] = cl_src + + assert np.all(numpy_dest == cl_dest.get()) + + +def test_fancy_indexing(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + numpy_dest = np.zeros((4,), np.int32) + numpy_idx = np.arange(3, 0, -1, dtype=np.int32) + numpy_src = np.arange(8, 10, dtype=np.int32) + numpy_dest[numpy_idx] = numpy_src + + cl_dest = cl_array.zeros(queue, (4,), np.int32) + cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32) + cl_src = cl_array.arange(queue, 8, 10, dtype=np.int32) + cl_dest[cl_idx] = cl_src + + assert np.all(numpy_dest == cl_dest.get()) + + cl_idx[1] = 3 + cl_idx[2] = 2 + + numpy_idx[1] = 3 + numpy_idx[2] = 2 + + numpy_dest[numpy_idx] = numpy_src + cl_dest[cl_idx] = cl_src + + assert np.all(numpy_dest == cl_dest.get()) 
+ + +def test_multi_put(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + cl_arrays = [ + cl_array.arange(queue, 0, 3, dtype=np.float32) + for i in range(1, 10) + ] + idx = cl_array.arange(queue, 0, 6, dtype=np.int32) + out_arrays = [ + cl_array.zeros(queue, (10,), np.float32) + for i in range(9) + ] + + out_compare = [np.zeros((10,), np.float32) for i in range(9)] + for i, ary in enumerate(out_compare): + ary[idx.get()] = np.arange(0, 3, dtype=np.float32) + + cl_array.multi_put(cl_arrays, idx, out=out_arrays) + + assert np.all(np.all(out_compare[i] == cl_arrays[i].get()) for i in range(10)) + + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. -- GitLab From 774467abfdca6cb8d5fbff552fecdd5cdf3a4055 Mon Sep 17 00:00:00 2001 From: henry Date: Thu, 1 Dec 2016 20:16:11 -0800 Subject: [PATCH 5/6] Check out_compare vs. out_arrays in multi_put test --- test/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_array.py b/test/test_array.py index d4ea8cd7..a65699f0 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1168,7 +1168,7 @@ def test_multi_put(ctx_factory): cl_array.multi_put(cl_arrays, idx, out=out_arrays) - assert np.all(np.all(out_compare[i] == cl_arrays[i].get()) for i in range(10)) + assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) if __name__ == "__main__": -- GitLab From e7a55e018a71f53915ced19cf0d5a87e555104a6 Mon Sep 17 00:00:00 2001 From: henry Date: Tue, 6 Dec 2016 21:22:15 -0800 Subject: [PATCH 6/6] Flake8 long kernel string. XFAIL multi put tests. 
--- pyopencl/elementwise.py | 2 +- test/test_array.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index d3655783..0e35df43 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -453,7 +453,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : " "src{i}[i % val_ary_lengths[{i}]]);".format(i=i) for i in range(vec_count) - ) + ) ) return get_elwise_kernel(context, args, body, diff --git a/test/test_array.py b/test/test_array.py index a65699f0..2c27e77f 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1104,6 +1104,8 @@ def test_squeeze(ctx_factory): def test_fancy_fill(ctx_factory): + if _PYPY: + pytest.xfail("numpypy: multi value setting is not supported") context = ctx_factory() queue = cl.CommandQueue(context) @@ -1121,6 +1123,8 @@ def test_fancy_fill(ctx_factory): def test_fancy_indexing(ctx_factory): + if _PYPY: + pytest.xfail("numpypy: multi value setting is not supported") context = ctx_factory() queue = cl.CommandQueue(context) @@ -1149,6 +1153,9 @@ def test_fancy_indexing(ctx_factory): def test_multi_put(ctx_factory): + if _PYPY: + pytest.xfail("numpypy: multi value setting is not supported") + context = ctx_factory() queue = cl.CommandQueue(context) -- GitLab