diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 258bae5cbc64090c5d70abb61dc39cfed7142218..cbe9295397d3e7eb5f12e80cae63ee72aa225e04 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -648,22 +648,43 @@ def get_pow_kernel(dtype):
 
 
 @context_dependent_memoize
-def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
+def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array):
+    """
+    Returns the kernel for the operation: ``z = x ** y``
+    """
     if np.float64 in [dtype_x, dtype_y]:
         func = "pow"
     else:
         func = "powf"
 
+    if not is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x,y[i])" % func
+
+    elif is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x[i],y[i])" % func
+
+    elif is_base_array and not is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s y"
+        func = "%s(x[i],y)" % func
+
+    else:
+        raise AssertionError
+
     return get_elwise_kernel(
-        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        ("%(tp_z)s *z, " + x_ctype + ", " + y_ctype)
         % {
             "tp_x": dtype_to_ctype(dtype_x),
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = %s(x[i], y[i])" % func,
-        "pow_method",
-    )
+        "z[i] = %s" % func,
+        name="pow_method",
+    )
 
 
 @context_dependent_memoize
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a0bf84c4d42f93ba2215c895f7e3746a2d9b177a..adb0ea539f8f4c89f05ab59795250ff56ba5fca1 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -780,16 +780,18 @@ class GPUArray:
             result = self
 
         func = elementwise.get_pow_array_kernel(
-            self.dtype, other.dtype, result.dtype
+            self.dtype, other.dtype, result.dtype,
+            is_base_array=True,
+            is_exp_array=True
         )
 
         func.prepared_async_call(
             self._grid,
             self._block,
             None,
+            result.gpudata,
             self.gpudata,
             other.gpudata,
-            result.gpudata,
             self.mem_size,
         )
 
@@ -839,6 +841,27 @@ class GPUArray:
         """
        return self._pow(other, new=False)
 
+    def __rpow__(self, other):
+        common_dtype = _get_common_dtype(self, other)
+        result = self._new_like_me(common_dtype)
+
+        if not np.isscalar(other):
+            # Base is a gpuarray => do not cast.
+            base = other
+        else:
+            base = common_dtype.type(other)
+
+        func = elementwise.get_pow_array_kernel(
+            base.dtype, self.dtype, result.dtype,
+            is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self))
+        # Evaluates z = x ** y
+        func.prepared_async_call(self._grid, self._block, None,
+                result.gpudata,                                # z
+                base if np.isscalar(base) else base.gpudata,   # x
+                self.gpudata,                                  # y
+                self.mem_size)
+        return result
+
     def reverse(self, stream=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
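For context, here is a minimal host-side sketch of the new scalar-base code path (a hedged example, not part of the patch: it assumes a CUDA context created via pycuda.autoinit, and the variable names are illustrative):

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray

    # The base is a host-side scalar and the exponent lives on the GPU, so
    # __rpow__ selects the (is_base_array=False, is_exp_array=True) branch,
    # whose generated kernel body is "z[i] = powf(x, y[i])".
    exponents = gpuarray.to_gpu(np.arange(5, dtype=np.float32))
    result = (2.0 ** exponents).get()
    print(result)  # [ 1.  2.  4.  8. 16.]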
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458..12e7704e673e509d2d334401264f9032340cb2eb 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -16,17 +16,19 @@ class TestGPUArray:
     @mark_cuda_test
     def test_pow_array(self):
         a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
+        b = np.array([6, 7, 8, 9, 10]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b_gpu = gpuarray.to_gpu(b)
 
-        result = pow(a_gpu, a_gpu).get()
-        assert (np.abs(a ** a - result) < 1e-3).all()
+        result = pow(a_gpu, b_gpu).get()
+        np.testing.assert_allclose(a ** b, result)
 
-        result = (a_gpu ** a_gpu).get()
-        assert (np.abs(pow(a, a) - result) < 1e-3).all()
+        result = (a_gpu ** b_gpu).get()
+        np.testing.assert_allclose(pow(a, b), result)
 
-        a_gpu **= a_gpu
+        a_gpu **= b_gpu
         a_gpu = a_gpu.get()
-        assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(pow(a, b), a_gpu)
 
     @mark_cuda_test
     def test_pow_number(self):
@@ -40,6 +42,21 @@ class TestGPUArray:
         a_gpu = a_gpu.get()
         assert (np.abs(a ** 2 - a_gpu) < 1e-3).all()
 
+    @mark_cuda_test
+    def test_rpow_array(self):
+        scalar = np.random.rand()
+        a = abs(np.random.rand(10))
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (scalar ** a_gpu).get()
+        np.testing.assert_allclose(scalar ** a, result)
+
+        result = (a_gpu ** a_gpu).get()
+        np.testing.assert_allclose(a ** a, result)
+
+        result = (a_gpu ** scalar).get()
+        np.testing.assert_allclose(a ** scalar, result)
+
     @mark_cuda_test
     def test_numpy_integer_shape(self):
         gpuarray.empty(np.int32(17), np.float32)
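Since get_pow_array_kernel is now parameterized on the operand kinds, it can also be driven directly; a hedged sketch (again assuming pycuda.autoinit; note that _grid and _block are private GPUArray attributes, used here only because the patch itself launches kernels through them):

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.gpuarray as gpuarray
    from pycuda import elementwise

    x = gpuarray.to_gpu(np.array([1.0, 2.0, 3.0], dtype=np.float32))
    y = gpuarray.to_gpu(np.array([2.0, 3.0, 4.0], dtype=np.float32))
    z = gpuarray.empty_like(x)

    # Both operands are arrays, so the generated body is
    # "z[i] = powf(x[i], y[i])"; the argument order (z, x, y) matches the
    # signature string assembled in get_pow_array_kernel.
    func = elementwise.get_pow_array_kernel(
        x.dtype, y.dtype, z.dtype, is_base_array=True, is_exp_array=True)
    func.prepared_async_call(x._grid, x._block, None,
                             z.gpudata, x.gpudata, y.gpudata, x.mem_size)
    print(z.get())  # [ 1.  8. 81.]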