From 45194649df48dc36f05b2775245d7a278209ef2b Mon Sep 17 00:00:00 2001
From: henry <henry@rtbiq.com>
Date: Sat, 5 Nov 2016 11:30:20 -0700
Subject: [PATCH 1/6] * Allocate device array to hold single fill values for
 arrays in chunk. * Update `get_put_kernel` to check if fill value exists, and
 use it if so.

---
 pyopencl/array.py       | 16 ++++++++++++++--
 pyopencl/elementwise.py |  5 ++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index a7a2a04c..31dbe450 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2230,16 +2230,27 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
 
     chunk_size = _builtin_min(vec_count, 10)
 
+    # np array to hold fill vals for each array in `arrays` chunk.
+    fill_vals = np.ndarray((chunk_size,), dtype=a_dtype)
+    # device buffer
+    fill_vals_cla = zeros(queue, shape=fill_vals.shape, dtype=a_dtype)
+
     def make_func_for_chunk_size(chunk_size):
         knl = elementwise.get_put_kernel(
-                context,
-                a_dtype, dest_indices.dtype, vec_count=chunk_size)
+                context, a_dtype, dest_indices.dtype,
+                vec_count=chunk_size)
         return knl
 
     knl = make_func_for_chunk_size(chunk_size)
 
     for start_i in range(0, len(arrays), chunk_size):
         chunk_slice = slice(start_i, start_i+chunk_size)
+        # load the fill_vals np array with 0 if no fill and the fill value
+        # if the values array has only one item.
+        for fill_idx, ary in enumerate(arrays[chunk_slice]):
+            fill_vals[fill_idx] = ary.get()[0] if ary.size == 1 else 0
+        #copy the populated fill_vals array to the buffer on device
+        fill_vals_cla.set(fill_vals)
 
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
@@ -2259,6 +2270,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + list(flatten(
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
+                    + [fill_vals_cla.base_data, fill_vals_cla.offset]
                     + [dest_indices.size]),
                 **dict(wait_for=wait_for))
 
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index f2718146..ec0a19fb 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -441,11 +441,14 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             ] + [
                 VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
+            ] + [
+                VectorArg(dtype, "fill_with_single", with_offset=True)
+                for i in range(vec_count)
             ]
 
     body = (
             "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
-            + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i)
+            + "\n".join("dest%d[dest_idx] = fill_with_single[%d] or src%d[i];" % (i, i, i)
                 for i in range(vec_count)))
 
     return get_elwise_kernel(context, args, body,
-- 
GitLab


From 688854983e8dc8ab7111ebb266deaba5cea2a976 Mon Sep 17 00:00:00 2001
From: henry <hrofuller@gmail.com>
Date: Thu, 1 Dec 2016 17:18:17 -0800
Subject: [PATCH 2/6] Construct and pass `use_fill` and `array_lengths` arrays
 to `get_put_kernel` from array multi_put.

---
 pyopencl/array.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pyopencl/array.py b/pyopencl/array.py
index 31dbe450..a03227f7 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2230,10 +2230,10 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
 
     chunk_size = _builtin_min(vec_count, 10)
 
-    # np array to hold fill vals for each array in `arrays` chunk.
-    fill_vals = np.ndarray((chunk_size,), dtype=a_dtype)
-    # device buffer
-    fill_vals_cla = zeros(queue, shape=fill_vals.shape, dtype=a_dtype)
+    # array of bools to specify whether the array of same index in this chunk
+    # will be filled with a single value.
+    use_fill = np.ndarray((chunk_size,), dtype=np.uint8)
+    array_lengths = np.ndarray((chunk_size,), dtype=np.int64)
 
     def make_func_for_chunk_size(chunk_size):
         knl = elementwise.get_put_kernel(
@@ -2245,12 +2245,14 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
 
     for start_i in range(0, len(arrays), chunk_size):
         chunk_slice = slice(start_i, start_i+chunk_size)
-        # load the fill_vals np array with 0 if no fill and the fill value
-        # if the values array has only one item.
         for fill_idx, ary in enumerate(arrays[chunk_slice]):
-            fill_vals[fill_idx] = ary.get()[0] if ary.size == 1 else 0
-        #copy the populated fill_vals array to the buffer on device
-        fill_vals_cla.set(fill_vals)
+            # If there is only one value in the values array for this src array
+            # in the chunk then fill every index in `dest_idx` array with it.
+            use_fill[fill_idx] = 1 if ary.size == 1 else 0
+            array_lengths[fill_idx] = len(ary)
+        # Copy the populated `use_fill` array to a buffer on the device.
+        use_fill_cla = to_device(queue, use_fill)
+        array_lengths_cla = to_device(queue, array_lengths)
 
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
@@ -2270,7 +2272,8 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + list(flatten(
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
-                    + [fill_vals_cla.base_data, fill_vals_cla.offset]
+                    + [use_fill_cla.base_data, use_fill_cla.offset]
+                    + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
                 **dict(wait_for=wait_for))
 
-- 
GitLab


From 94b597591684c87e9102572adef1250ace5d4580 Mon Sep 17 00:00:00 2001
From: henry <hrofuller@gmail.com>
Date: Thu, 1 Dec 2016 17:21:08 -0800
Subject: [PATCH 3/6] Add `use_fill` and `val_ary_lengths` to `get_put_kernel`
 VectorArgs.

Modify the kernel to fill all indices if there is only one value passed
in the values array, and to use modulo mode otherwise (to match numpy's
behavior).
---
 pyopencl/elementwise.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index ec0a19fb..d3655783 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -442,14 +442,19 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
                 VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
             ] + [
-                VectorArg(dtype, "fill_with_single", with_offset=True)
-                for i in range(vec_count)
+                VectorArg(np.uint8, "use_fill", with_offset=True)
+            ] + [
+                VectorArg(np.int64, "val_ary_lengths", with_offset=True)
             ]
 
     body = (
             "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
-            + "\n".join("dest%d[dest_idx] = fill_with_single[%d] or src%d[i];" % (i, i, i)
-                for i in range(vec_count)))
+            + "\n".join(
+                    "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
+                    "src{i}[i % val_ary_lengths[{i}]]);".format(i=i)
+                    for i in range(vec_count)
+                )
+            )
 
     return get_elwise_kernel(context, args, body,
             preamble=dtype_to_c_struct(context.devices[0], dtype),
-- 
GitLab


From 1d5a4269bdeec3273fc7ab363fe0f4e5ca48cc85 Mon Sep 17 00:00:00 2001
From: henry <hrofuller@gmail.com>
Date: Thu, 1 Dec 2016 17:21:55 -0800
Subject: [PATCH 4/6] Add tests for new `multi_put` scheme and new
 `get_put_kernel`.

---
 test/test_array.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/test/test_array.py b/test/test_array.py
index e89d7122..d4ea8cd7 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1103,6 +1103,74 @@ def test_squeeze(ctx_factory):
     #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())
 
 
+def test_fancy_fill(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    numpy_dest = np.zeros((4,), np.int32)
+    numpy_idx = np.arange(3, dtype=np.int32)
+    numpy_src = np.arange(8, 9, dtype=np.int32)
+    numpy_dest[numpy_idx] = numpy_src
+
+    cl_dest = cl_array.zeros(queue, (4,), np.int32)
+    cl_idx = cl_array.arange(queue, 3, dtype=np.int32)
+    cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+
+def test_fancy_indexing(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    numpy_dest = np.zeros((4,), np.int32)
+    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
+    numpy_src = np.arange(8, 10, dtype=np.int32)
+    numpy_dest[numpy_idx] = numpy_src
+
+    cl_dest = cl_array.zeros(queue, (4,), np.int32)
+    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
+    cl_src = cl_array.arange(queue, 8, 10, dtype=np.int32)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+    cl_idx[1] = 3
+    cl_idx[2] = 2
+
+    numpy_idx[1] = 3
+    numpy_idx[2] = 2
+
+    numpy_dest[numpy_idx] = numpy_src
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+
+def test_multi_put(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    cl_arrays = [
+        cl_array.arange(queue, 0, 3, dtype=np.float32)
+        for i in range(1, 10)
+    ]
+    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
+    out_arrays = [
+        cl_array.zeros(queue, (10,), np.float32)
+        for i in range(9)
+    ]
+
+    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
+    for i, ary in enumerate(out_compare):
+        ary[idx.get()] = np.arange(0, 3, dtype=np.float32)
+
+    cl_array.multi_put(cl_arrays, idx, out=out_arrays)
+
+    assert np.all(np.all(out_compare[i] == cl_arrays[i].get()) for i in range(10))
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.
-- 
GitLab


From 774467abfdca6cb8d5fbff552fecdd5cdf3a4055 Mon Sep 17 00:00:00 2001
From: henry <hrofuller@gmail.com>
Date: Thu, 1 Dec 2016 20:16:11 -0800
Subject: [PATCH 5/6] Check out_compare vs. out_arrays in multi_put test

---
 test/test_array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_array.py b/test/test_array.py
index d4ea8cd7..a65699f0 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1168,7 +1168,7 @@ def test_multi_put(ctx_factory):
 
     cl_array.multi_put(cl_arrays, idx, out=out_arrays)
 
-    assert np.all(np.all(out_compare[i] == cl_arrays[i].get()) for i in range(10))
+    assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
 
 
 if __name__ == "__main__":
-- 
GitLab


From e7a55e018a71f53915ced19cf0d5a87e555104a6 Mon Sep 17 00:00:00 2001
From: henry <hrofuller@gmail.com>
Date: Tue, 6 Dec 2016 21:22:15 -0800
Subject: [PATCH 6/6] Flake8 long kernel string. XFAIL multi put tests.

---
 pyopencl/elementwise.py | 2 +-
 test/test_array.py      | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index d3655783..0e35df43 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -453,7 +453,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
                     "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
                     "src{i}[i % val_ary_lengths[{i}]]);".format(i=i)
                     for i in range(vec_count)
-                )
+                    )
             )
 
     return get_elwise_kernel(context, args, body,
diff --git a/test/test_array.py b/test/test_array.py
index a65699f0..2c27e77f 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1104,6 +1104,8 @@ def test_squeeze(ctx_factory):
 
 
 def test_fancy_fill(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
@@ -1121,6 +1123,8 @@ def test_fancy_fill(ctx_factory):
 
 
 def test_fancy_indexing(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
@@ -1149,6 +1153,9 @@ def test_fancy_indexing(ctx_factory):
 
 
 def test_multi_put(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-- 
GitLab