diff --git a/doc/source/array.rst b/doc/source/array.rst
index eddf91ca43931637d1b94bc33cec61667b0048d5..34efb1cff9485fe8bce7c9f9e7112f3e5f75c697 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -529,11 +529,18 @@ Quasirandom numbers are more expensive to generate.
 
         .. versionadded:: 2012.2
 
-    .. method:: fill_poisson(data, lambda_value, stream=None)
+    .. method:: fill_poisson(data, lambda_value=None, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
@@ -621,8 +628,15 @@ Quasirandom numbers are more expensive to generate.
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
@@ -735,8 +749,15 @@ Quasirandom numbers are more expensive to generate.
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
@@ -826,8 +847,15 @@ Quasirandom numbers are more expensive to generate.
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
@@ -914,8 +942,15 @@ Quasirandom numbers are more expensive to generate.
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
@@ -1005,8 +1040,15 @@ Quasirandom numbers are more expensive to generate.
     .. method:: fill_poisson(data, lambda_value, stream=None)
 
         Fills in :class:`GPUArray` *data* with Poisson distributed
-        pseudorandom values with lambda *lambda_value*. *data* must
-        be of type 32-bit unsigned int.
+        pseudorandom values.
+
+        If *lambda_value* is not None, it is used as lambda,
+        and *data* must be of type 32-bit unsigned int.
+
+        If *lambda_value* is None, the lambda value is read
+        from each *data* array element (similarly to numpy.random.poisson),
+        and the array is overwritten by the pseudorandom values.
+        *data* must be of type 32-bit unsigned int, 32 or 64-bit float.
 
         CUDA 5.0 and above.
 
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 78d660e7a2a2cd5e892957d1628a7d79e2e868d4..a5d3d38d0a241522251999037eef4d0f86a73ee2 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -296,6 +296,17 @@ __global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, double lambda, cons
 }
 """
 
+gen_poisson_inplace_template = """
+__global__ void %(name)s(%(state_type)s *s, %(inout_type)s *d, const int n)
+{
+  const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
+  const int delta = blockDim.x*gridDim.x;
+  for (int idx = tidx; idx < n; idx += delta)
+    d[idx] = (%(inout_type)s)(curand_poisson%(suffix)s(&s[tidx], double(d[idx])));
+}
+"""
+
+
 random_source = """
 // Uses C++ features (templates); do not surround with extern C
 #include <curand_kernel.h>
@@ -373,6 +384,12 @@ class _RandomNumberGeneratorBase(object):
         ("poisson_int", "unsigned int", ""),
         ]
 
+    gen_poisson_inplace_info = [
+        ("poisson_inplace_float", "float", ""),
+        ("poisson_inplace_double", "double", ""),
+        ("poisson_inplace_int", "unsigned int", ""),
+        ]
+
     def __init__(self, state_type, vector_type, generator_bits,
         additional_source, scramble_type=None):
         if get_curand_version() < (3, 2, 0):
@@ -409,6 +426,10 @@ class _RandomNumberGeneratorBase(object):
                     (name, out_type, suffix)
                     for name, out_type, suffix in self.gen_poisson_info
                     if do_generate(out_type)]
+            my_poisson_inplace_generators = [
+                    (name, inout_type, suffix)
+                    for name, inout_type, suffix in self.gen_poisson_inplace_info
+                    if do_generate(inout_type)]
 
         generator_sources = [
                 gen_template % {
@@ -429,6 +450,11 @@ class _RandomNumberGeneratorBase(object):
                         "name": name, "out_type": out_type, "suffix": suffix,
                         "state_type": state_type, }
                     for name, out_type, suffix in my_poisson_generators])
+            generator_sources.extend([
+                    gen_poisson_inplace_template % {
+                        "name": name, "inout_type": inout_type, "suffix": suffix,
+                        "state_type": state_type, }
+                    for name, inout_type, suffix in my_poisson_inplace_generators])
 
         source = (random_source + additional_source) % {
             "state_type": state_type,
@@ -457,6 +483,10 @@ class _RandomNumberGeneratorBase(object):
                 gen_func = module.get_function(name)
                 gen_func.prepare("PPdi")
                 self.generators[name] = gen_func
+            for name, inout_type, suffix  in my_poisson_inplace_generators:
+                gen_func = module.get_function(name)
+                gen_func.prepare("PPi")
+                self.generators[name] = gen_func
 
         self.generator_bits = generator_bits
         self._prepare_skipahead()
@@ -566,17 +596,32 @@ class _RandomNumberGeneratorBase(object):
             return result
 
     if get_curand_version() >= (5, 0, 0):
-        def fill_poisson(self, data, lambda_value, stream=None):
-            if data.dtype == np.uint32:
-                func_name = "poisson_int"
+        def fill_poisson(self, data, lambda_value=None, stream=None):
+            if lambda_value is None:
+                if data.dtype == np.float32:
+                    func_name = "poisson_inplace_float"
+                elif data.dtype == np.float64:
+                    func_name = "poisson_inplace_double"
+                elif data.dtype == np.uint32:
+                    func_name = "poisson_inplace_int"
+                else:
+                    raise NotImplementedError
             else:
-                raise NotImplementedError
+                if data.dtype == np.uint32:
+                    func_name = "poisson_int"
+                else:
+                    raise NotImplementedError
 
             func = self.generators[func_name]
 
-            func.prepared_async_call(
+            if lambda_value is None:
+                func.prepared_async_call(
                     (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
-                    self.state, data.gpudata, lambda_value, data.size)
+                    self.state, data.gpudata, data.size)
+            else:
+                func.prepared_async_call(
+                        (self.block_count, 1), (self.generators_per_block, 1, 1), stream,
+                        self.state, data.gpudata, lambda_value, data.size)
 
         def gen_poisson(self, shape, dtype, lambda_value, stream=None):
             result = array.empty(shape, dtype)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 98ce1813dc53f537941e0c00564e385745a2f1fd..b8f2d43e3addf57a723db48fdf52624a4eee209a 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -298,6 +298,16 @@ class TestGPUArray:
             gen.gen_uniform(10000, np.uint32)
             if get_curand_version() >= (5, 0, 0):
                 gen.gen_poisson(10000, np.uint32, 13.0)
+                for dtype in dtypes + [np.uint32]:
+                    a = gpuarray.empty(1000000, dtype=dtype)
+                    v = 10
+                    a.fill(v)
+                    gen.fill_poisson(a)
+                    tmp = (a.get() == (v-1)).sum() / a.size
+                    # Commented out for CI on the off chance it'd fail
+                    # # Check Poisson statistics (need 1e6 values)
+                    # # Compare with scipy.stats.poisson.pmf(v - 1, v)
+                    # assert np.isclose(0.12511, tmp, atol=0.002)
 
     @mark_cuda_test
     def test_array_gt(self):