diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 258bae5cbc64090c5d70abb61dc39cfed7142218..cbe9295397d3e7eb5f12e80cae63ee72aa225e04 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -648,22 +648,43 @@ def get_pow_kernel(dtype):
 
 
 @context_dependent_memoize
-def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
+def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array):
+    """
+    Returns the kernel for the operation: ``z = x ** y``
+    """
     if np.float64 in [dtype_x, dtype_y]:
         func = "pow"
     else:
         func = "powf"
 
+    if not is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x,y[i])" % func
+
+    elif is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x[i],y[i])" % func
+
+    elif is_base_array and not is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s y"
+        func = "%s(x[i],y)" % func
+
+    else:
+        raise AssertionError
+
     return get_elwise_kernel(
-        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        ("%(tp_z)s *z, " + x_ctype + ", " + y_ctype)
         % {
             "tp_x": dtype_to_ctype(dtype_x),
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = %s(x[i], y[i])" % func,
-        "pow_method",
-    )
+        "z[i] = %s" % func,
+        name="pow_method",
+    )
 
 
 @context_dependent_memoize
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a0bf84c4d42f93ba2215c895f7e3746a2d9b177a..adb0ea539f8f4c89f05ab59795250ff56ba5fca1 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -780,16 +780,18 @@ class GPUArray:
             result = self
 
         func = elementwise.get_pow_array_kernel(
-            self.dtype, other.dtype, result.dtype
+            self.dtype, other.dtype, result.dtype,
+            is_base_array=True,
+            is_exp_array=True
         )
 
         func.prepared_async_call(
             self._grid,
             self._block,
             None,
+            result.gpudata,
             self.gpudata,
             other.gpudata,
-            result.gpudata,
             self.mem_size,
         )
 
@@ -839,6 +841,27 @@ class GPUArray:
         """
        return self._pow(other, new=False)
 
+    def __rpow__(self, other):
+        common_dtype = _get_common_dtype(self, other)
+        result = self._new_like_me(common_dtype)
+
+        if not np.isscalar(other):
+            # Base is a gpuarray => do not cast.
+            base = other
+        else:
+            base = common_dtype.type(other)
+
+        func = elementwise.get_pow_array_kernel(
+            base.dtype, self.dtype, result.dtype,
+            is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self))
+        # Evaluates z = x ** y
+        func.prepared_async_call(self._grid, self._block, None,
+                result.gpudata,                                # z
+                base if np.isscalar(base) else base.gpudata,   # x
+                self.gpudata,                                  # y
+                self.mem_size)
+        return result
+
     def reverse(self, stream=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
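For context, here is a minimal host-side sketch of the new scalar-base code path (a hedged example, not part of the patch: it assumes a CUDA context created via pycuda.autoinit, and the variable names are illustrative):

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray

    # The base is a host-side scalar and the exponent lives on the GPU, so
    # __rpow__ selects the (is_base_array=False, is_exp_array=True) branch,
    # whose generated kernel body is "z[i] = powf(x, y[i])".
    exponents = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    result = (2.0 ** exponents).get()
    print(result)  # [ 1.  2.  4.  8. 16.]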
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458..12e7704e673e509d2d334401264f9032340cb2eb 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -16,17 +16,19 @@ class TestGPUArray:
     @mark_cuda_test
     def test_pow_array(self):
         a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
+        b = np.array([6, 7, 8, 9, 10]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b_gpu = gpuarray.to_gpu(b)
 
-        result = pow(a_gpu, a_gpu).get()
-        assert (np.abs(a ** a - result) < 1e-3).all()
+        result = pow(a_gpu, b_gpu).get()
+        np.testing.assert_allclose(a ** b, result)
 
-        result = (a_gpu ** a_gpu).get()
-        assert (np.abs(pow(a, a) - result) < 1e-3).all()
+        result = (a_gpu ** b_gpu).get()
+        np.testing.assert_allclose(pow(a, b), result)
 
-        a_gpu **= a_gpu
+        a_gpu **= b_gpu
         a_gpu = a_gpu.get()
-        assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(pow(a, b), a_gpu)
 
     @mark_cuda_test
     def test_pow_number(self):
@@ -40,6 +42,21 @@ class TestGPUArray:
         a_gpu = a_gpu.get()
         assert (np.abs(a ** 2 - a_gpu) < 1e-3).all()
 
+    @mark_cuda_test
+    def test_rpow_array(self):
+        scalar = np.random.rand()
+        a = abs(np.random.rand(10))
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (scalar ** a_gpu).get()
+        np.testing.assert_allclose(scalar ** a, result)
+
+        result = (a_gpu ** a_gpu).get()
+        np.testing.assert_allclose(a ** a, result)
+
+        result = (a_gpu ** scalar).get()
+        np.testing.assert_allclose(a ** scalar, result)
+
     @mark_cuda_test
     def test_numpy_integer_shape(self):
         gpuarray.empty(np.int32(17), np.float32)
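Since get_pow_array_kernel is now parameterized on the operand kinds, it can also be driven directly; a hedged sketch (again assuming pycuda.autoinit; note that _grid and _block are private GPUArray attributes, used here only because the patch itself launches kernels through them):

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.gpuarray as gpuarray
    from pycuda import elementwise

    x = gpuarray.to_gpu(np.array([1.0, 2.0, 3.0], dtype=np.float32))
    y = gpuarray.to_gpu(np.array([2.0, 3.0, 4.0], dtype=np.float32))
    z = gpuarray.empty_like(x)

    # Both operands are arrays, so the generated body is
    # "z[i] = powf(x[i], y[i])"; the argument order (z, x, y) matches the
    # signature string assembled in get_pow_array_kernel.
    func = elementwise.get_pow_array_kernel(
        x.dtype, y.dtype, z.dtype, is_base_array=True, is_exp_array=True)
    func.prepared_async_call(x._grid, x._block, None,
                             z.gpudata, x.gpudata, y.gpudata, x.mem_size)
    print(z.get())  # [ 1.  8. 81.]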