diff --git a/pyopencl/array.py b/pyopencl/array.py
index a7a2a04c7f7a4f2492628bbac3a2cf6a0150705a..a03227f78462020a7622395bc993fb09797290e5 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -2230,16 +2230,29 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
 
     chunk_size = _builtin_min(vec_count, 10)
 
+    # Per-chunk host flags: use_fill[k] is 1 when the k-th source array of the
+    # chunk has a single element, which then fills every destination index.
+    use_fill = np.ndarray((chunk_size,), dtype=np.uint8)
+    array_lengths = np.ndarray((chunk_size,), dtype=np.int64)
+
     def make_func_for_chunk_size(chunk_size):
         knl = elementwise.get_put_kernel(
-                context,
-                a_dtype, dest_indices.dtype, vec_count=chunk_size)
+                context, a_dtype, dest_indices.dtype,
+                vec_count=chunk_size)
         return knl
 
     knl = make_func_for_chunk_size(chunk_size)
 
     for start_i in range(0, len(arrays), chunk_size):
         chunk_slice = slice(start_i, start_i+chunk_size)
+        for fill_idx, ary in enumerate(arrays[chunk_slice]):
+            # If there is only one value in the values array for this src array
+            # in the chunk then fill every index in `dest_idx` array with it.
+            use_fill[fill_idx] = 1 if ary.size == 1 else 0
+            array_lengths[fill_idx] = len(ary)
+        # Copy the populated `use_fill` and `array_lengths` arrays to the device.
+        use_fill_cla = to_device(queue, use_fill)
+        array_lengths_cla = to_device(queue, array_lengths)
 
         if start_i + chunk_size > vec_count:
             knl = make_func_for_chunk_size(vec_count-start_i)
@@ -2259,6 +2272,8 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
                     + list(flatten(
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
+                    + [use_fill_cla.base_data, use_fill_cla.offset]
+                    + [array_lengths_cla.base_data, array_lengths_cla.offset]
                     + [dest_indices.size]),
                 **dict(wait_for=wait_for))
 
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index f27181464d8f0d37a95cd88098a14f2acd9ebc1f..0e35df43baef4e9b3e8def49bff25e9813387a54 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -441,12 +441,20 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             ] + [
                 VectorArg(dtype, "src%d" % i, with_offset=True)
                 for i in range(vec_count)
+            ] + [
+                VectorArg(np.uint8, "use_fill", with_offset=True)
+            ] + [
+                VectorArg(np.int64, "val_ary_lengths", with_offset=True)
             ]
 
     body = (
             "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
-            + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i)
-                for i in range(vec_count)))
+            + "\n".join(
+                    "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
+                    "src{i}[i % val_ary_lengths[{i}]]);".format(i=i)
+                    for i in range(vec_count)
+                    )
+            )
 
     return get_elwise_kernel(context, args, body,
             preamble=dtype_to_c_struct(context.devices[0], dtype),
diff --git a/test/test_array.py b/test/test_array.py
index e89d71228b2a7c68dce2ae3b01c8b383fae736be..2c27e77f521cf698449864b52f4f9a4f5cfcd608 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1103,6 +1103,81 @@ def test_squeeze(ctx_factory):
     #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())
 
 
+def test_fancy_fill(ctx_factory):
+    """Assign a one-element source through an index array; compare with NumPy."""
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+    queue = cl.CommandQueue(ctx_factory())
+
+    # Host reference: NumPy broadcasts the single value to every index.
+    expected = np.zeros((4,), np.int32)
+    expected[np.arange(3, dtype=np.int32)] = np.arange(8, 9, dtype=np.int32)
+
+    # The same assignment carried out on device arrays.
+    dev_dest = cl_array.zeros(queue, (4,), np.int32)
+    dev_idx = cl_array.arange(queue, 3, dtype=np.int32)
+    dev_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
+    dev_dest[dev_idx] = dev_src
+
+    assert np.all(expected == dev_dest.get())
+
+
+def test_fancy_indexing(ctx_factory):
+    """Check device fancy-index assignment against np.put on the host."""
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+    queue = cl.CommandQueue(ctx_factory())
+
+    # The source has fewer elements than the index array. np.put repeats it as
+    # needed; `numpy_dest[numpy_idx] = numpy_src` raises a shape mismatch.
+    numpy_dest = np.zeros((4,), np.int32)
+    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
+    numpy_src = np.arange(8, 10, dtype=np.int32)
+    np.put(numpy_dest, numpy_idx, numpy_src)
+
+    cl_dest = cl_array.zeros(queue, (4,), np.int32)
+    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
+    cl_src = cl_array.arange(queue, 8, 10, dtype=np.int32)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+    # Permute the indices but keep them distinct: the order of device writes to
+    # a repeated destination index is not guaranteed, so duplicates are flaky.
+    cl_idx[1] = numpy_idx[1] = 1
+    cl_idx[2] = numpy_idx[2] = 2
+    np.put(numpy_dest, numpy_idx, numpy_src)
+    cl_dest[cl_idx] = cl_src
+
+    assert np.all(numpy_dest == cl_dest.get())
+
+
+def test_multi_put(ctx_factory):
+    """Check multi_put against np.put with sources shorter than the index array."""
+    if _PYPY:
+        pytest.xfail("numpypy: multi value setting is not supported")
+    queue = cl.CommandQueue(ctx_factory())
+
+    cl_arrays = [
+        cl_array.arange(queue, 0, 3, dtype=np.float32)
+        for i in range(1, 10)
+    ]
+    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
+    out_arrays = [
+        cl_array.zeros(queue, (10,), np.float32)
+        for i in range(9)
+    ]
+
+    # np.put repeats the 3 values over the 6 indices; `ary[idx] = values` raises.
+    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
+    for ary in out_compare:
+        np.put(ary, idx.get(), np.arange(0, 3, dtype=np.float32))
+
+    cl_array.multi_put(cl_arrays, idx, out=out_arrays)
+    # Builtin all(): np.all() on a generator is truthy whatever it would yield.
+    assert all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the
     # tests.