diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 258bae5cbc64090c5d70abb61dc39cfed7142218..607e9f9c798c030b36aeb6d5dbcd62dad580cfd0 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -666,6 +666,45 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
     )
 
 
+@context_dependent_memoize
+def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array):
+    """
+    Returns the kernel for the operation: ``z = x ** y``
+    """
+    if np.float64 in [dtype_x, dtype_y]:
+        func = "pow"
+    else:
+        func = "powf"
+
+    if not is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x, y[i])" % func
+
+    elif is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x[i], y[i])" % func
+
+    elif is_base_array and not is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s y"
+        func = "%s(x[i], y)" % func
+
+    else:
+        raise AssertionError
+
+    return get_elwise_kernel(
+        ("%(tp_z)s *z, " + x_ctype + ", " + y_ctype)
+        % {
+            "tp_x": dtype_to_ctype(dtype_x),
+            "tp_y": dtype_to_ctype(dtype_y),
+            "tp_z": dtype_to_ctype(dtype_z),
+        },
+        "z[i] = %s" % func,
+        name="pow_method")
+
+
 @context_dependent_memoize
 def get_fmod_kernel():
     return get_elwise_kernel(
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a0bf84c4d42f93ba2215c895f7e3746a2d9b177a..5e7a560714a1a5769f311dfdded5416e771b53be 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -839,6 +839,27 @@ class GPUArray:
         """
         return self._pow(other, new=False)
 
+    def __rpow__(self, other):
+        common_dtype = _get_common_dtype(self, other)
+        result = self._new_like_me(common_dtype)
+
+        if not np.isscalar(other):
+            # Base is a gpuarray => do not cast.
+            base = other
+        else:
+            base = common_dtype.type(other)
+
+        func = elementwise.get_rpow_kernel(
+            base.dtype, self.dtype, result.dtype,
+            is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self))
+        # Evaluates z = x ** y
+        func.prepared_async_call(self._grid, self._block, None,
+                result.gpudata,                               # z
+                base if np.isscalar(base) else base.gpudata,  # x
+                self.gpudata,                                 # y
+                self.mem_size)
+        return result
+
     def reverse(self, stream=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458..5234d8a21b066620059622311e9cfc46811a31fe 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -40,6 +40,21 @@ class TestGPUArray:
         a_gpu = a_gpu.get()
         assert (np.abs(a ** 2 - a_gpu) < 1e-3).all()
 
+    @mark_cuda_test
+    def test_rpow_array(self):
+        scalar = np.random.rand()
+        a = abs(np.random.rand(10))
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (scalar ** a_gpu).get()
+        np.testing.assert_allclose(scalar ** a, result)
+
+        result = (a_gpu ** a_gpu).get()
+        np.testing.assert_allclose(a ** a, result)
+
+        result = (a_gpu ** scalar).get()
+        np.testing.assert_allclose(a ** scalar, result)
+
     @mark_cuda_test
     def test_numpy_integer_shape(self):
         gpuarray.empty(np.int32(17), np.float32)
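
For reference, a minimal usage sketch of the reversed-power path added above (illustrative only, not part of the patch; it assumes a CUDA-capable device and uses pycuda.autoinit to set up the context):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
import pycuda.gpuarray as gpuarray

a = np.random.rand(10)      # float64 exponents
a_gpu = gpuarray.to_gpu(a)

# float.__pow__ returns NotImplemented for a GPUArray operand, so
# Python falls back to GPUArray.__rpow__, which launches the kernel
# built by get_rpow_kernel above.
z = (2.0 ** a_gpu).get()
np.testing.assert_allclose(2.0 ** a, z)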