diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 258bae5cbc64090c5d70abb61dc39cfed7142218..4e8601f02940075cf17cf873dbf0883225ddf1b9 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -631,38 +631,44 @@ def get_arange_kernel(dtype):
 @context_dependent_memoize
-def get_pow_kernel(dtype):
-    if dtype == np.float32:
+def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array):
+    """
+    Returns the kernel for the operation: ``z = x ** y``
+    """
+    if dtype_z == np.float32:
         func = "powf"
     else:
+        # FIXME: Casting args to double-precision not
+        # ideal for all cases (ex. int args)
         func = "pow"
 
-    return get_elwise_kernel(
-        "%(tp)s value, %(tp)s *y, %(tp)s *z"
-        % {
-            "tp": dtype_to_ctype(dtype),
-        },
-        "z[i] = %s(y[i], value)" % func,
-        "pow_method",
-    )
+    if not is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x,y[i])" % func
+    elif is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x[i],y[i])" % func
+
+    elif is_base_array and not is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s y"
+        func = "%s(x[i],y)" % func
 
-@context_dependent_memoize
-def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
-    if np.float64 in [dtype_x, dtype_y]:
-        func = "pow"
     else:
-        func = "powf"
+        raise AssertionError
 
     return get_elwise_kernel(
-        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z")
         % {
             "tp_x": dtype_to_ctype(dtype_x),
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = %s(x[i], y[i])" % func,
-        "pow_method",
+        "z[i] = %s" % func,
+        name="pow_method"
     )
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a0bf84c4d42f93ba2215c895f7e3746a2d9b177a..36a39dc3d16e6fae90cf5ec5aabb269457a234d3 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -764,59 +764,41 @@ class GPUArray:
         Do the pow operator.
         with new, the user can choose between ipow or just pow
         """
+        common_dtype = _get_common_dtype(self, other)
+        if new:
+            result = self._new_like_me(common_dtype)
+        else:
+            result = self
 
-        if isinstance(other, GPUArray):
-            if not self.flags.forc or not other.flags.forc:
-                raise RuntimeError(
-                    "only contiguous arrays may "
-                    "be used as arguments to this operation"
-                )
-
-            assert self.shape == other.shape
-
-            if new:
-                result = self._new_like_me(_get_common_dtype(self, other))
-            else:
-                result = self
+        # {{{ sanity checks
 
-            func = elementwise.get_pow_array_kernel(
-                self.dtype, other.dtype, result.dtype
-            )
+        if (not self.flags.forc) or (isinstance(other, GPUArray)
+                and not other.flags.forc):
+            raise RuntimeError("only contiguous arrays may "
+                    "be used as arguments to this operation")
+        assert not isinstance(other, GPUArray) or other.shape == self.shape
 
-            func.prepared_async_call(
-                self._grid,
-                self._block,
-                None,
-                self.gpudata,
-                other.gpudata,
-                result.gpudata,
-                self.mem_size,
-            )
+        # }}}
 
-            return result
-        else:
-            if not self.flags.forc:
-                raise RuntimeError(
-                    "only contiguous arrays may "
-                    "be used as arguments to this operation"
-                )
+        func = elementwise.get_pow_array_kernel(
+            self.dtype,
+            common_dtype if np.isscalar(other) else other.dtype,
+            result.dtype,
+            not np.isscalar(self),
+            not np.isscalar(other)
+        )
 
-            if new:
-                result = self._new_like_me()
-            else:
-                result = self
-            func = elementwise.get_pow_kernel(self.dtype)
-            func.prepared_async_call(
-                self._grid,
-                self._block,
-                None,
-                other,
-                self.gpudata,
-                result.gpudata,
-                self.mem_size,
-            )
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            None,
+            self.gpudata,
+            other if np.isscalar(other) else other.gpudata,
+            result.gpudata,
+            self.mem_size,
+        )
 
-            return result
+        return result
 
     def __pow__(self, other):
         """pow function::
@@ -839,6 +821,27 @@ class GPUArray:
         """
         return self._pow(other, new=False)
 
+    def __rpow__(self, other):
+        common_dtype = _get_common_dtype(self, other)
+        result = self._new_like_me(common_dtype)
+
+        if not np.isscalar(other):
+            # Base is a gpuarray => do not cast.
+            base = other
+        else:
+            base = common_dtype.type(other)
+
+        func = elementwise.get_pow_array_kernel(
+            base.dtype, self.dtype, result.dtype,
+            is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self))
+        # Evaluates z = x ** y
+        func.prepared_async_call(self._grid, self._block, None,
+                base if np.isscalar(base) else base.gpudata,  # x
+                self.gpudata,  # y
+                result.gpudata,  # z
+                self.mem_size)
+        return result
+
     def reverse(self, stream=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458..73ec3ade3f99e8830882b6271aadd70945e4b4f3 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -17,16 +17,18 @@ class TestGPUArray:
     def test_pow_array(self):
         a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b = np.array([1, 2, 3, 4, 5]).astype(np.float64)
+        b_gpu = gpuarray.to_gpu(b)
 
-        result = pow(a_gpu, a_gpu).get()
-        assert (np.abs(a ** a - result) < 1e-3).all()
+        result = pow(a_gpu, b_gpu).get()
+        np.testing.assert_allclose(a ** b, result, rtol=1e-6)
 
-        result = (a_gpu ** a_gpu).get()
-        assert (np.abs(pow(a, a) - result) < 1e-3).all()
+        result = (a_gpu ** b_gpu).get()
+        np.testing.assert_allclose(pow(a, b), result, rtol=1e-6)
 
-        a_gpu **= a_gpu
+        a_gpu **= b_gpu
         a_gpu = a_gpu.get()
-        assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(pow(a, b), a_gpu, rtol=1e-6)
 
     @mark_cuda_test
     def test_pow_number(self):
@@ -34,11 +36,26 @@ class TestGPUArray:
         a_gpu = gpuarray.to_gpu(a)
 
         result = pow(a_gpu, 2).get()
-        assert (np.abs(a ** 2 - result) < 1e-3).all()
+        np.testing.assert_allclose(a ** 2, result, rtol=1e-6)
 
         a_gpu **= 2
         a_gpu = a_gpu.get()
-        assert (np.abs(a ** 2 - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(a ** 2, a_gpu, rtol=1e-6)
+
+    @mark_cuda_test
+    def test_rpow_array(self):
+        scalar = np.random.rand()
+        a = abs(np.random.rand(10))
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (scalar ** a_gpu).get()
+        np.testing.assert_allclose(scalar ** a, result)
+
+        result = (a_gpu ** a_gpu).get()
+        np.testing.assert_allclose(a ** a, result)
+
+        result = (a_gpu ** scalar).get()
+        np.testing.assert_allclose(a ** scalar, result)
 
     @mark_cuda_test
     def test_numpy_integer_shape(self):
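For context, a minimal usage sketch of the behavior this patch enables (illustrative only, not part of the diff); it assumes a PyCUDA build with this change applied and that pycuda.autoinit can create a CUDA context:

# Usage sketch: exercises the pow/rpow paths added above.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

a = np.random.rand(5).astype(np.float32)
b = np.random.rand(5).astype(np.float64)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

print((a_gpu ** b_gpu).get())  # array ** array with mixed dtypes -> common (float64) result
print((a_gpu ** 3.0).get())    # array ** scalar, scalar exponent passed to the kernel
print((2.0 ** a_gpu).get())    # scalar ** array, dispatched through the new __rpow__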