diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 258bae5cbc64090c5d70abb61dc39cfed7142218..4e8601f02940075cf17cf873dbf0883225ddf1b9 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -631,38 +631,44 @@ def get_arange_kernel(dtype):
 @context_dependent_memoize
-def get_pow_kernel(dtype):
-    if dtype == np.float32:
+def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array):
+    """
+    Returns the kernel for the operation: ``z = x ** y``
+    """
+    if dtype_z == np.float32:
         func = "powf"
     else:
+        # FIXME: Casting args to double-precision not
+        # ideal for all cases (ex. int args)
         func = "pow"
 
-    return get_elwise_kernel(
-        "%(tp)s value, %(tp)s *y, %(tp)s *z"
-        % {
-            "tp": dtype_to_ctype(dtype),
-        },
-        "z[i] = %s(y[i], value)" % func,
-        "pow_method",
-    )
+    if not is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x,y[i])" % func
+    elif is_base_array and is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s *y"
+        func = "%s(x[i],y[i])" % func
+
+    elif is_base_array and not is_exp_array:
+        x_ctype = "%(tp_x)s *x"
+        y_ctype = "%(tp_y)s y"
+        func = "%s(x[i],y)" % func
 
-@context_dependent_memoize
-def get_pow_array_kernel(dtype_x, dtype_y, dtype_z):
-    if np.float64 in [dtype_x, dtype_y]:
-        func = "pow"
     else:
-        func = "powf"
+        raise AssertionError
 
     return get_elwise_kernel(
-        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z")
         % {
             "tp_x": dtype_to_ctype(dtype_x),
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = %s(x[i], y[i])" % func,
-        "pow_method",
+        "z[i] = %s" % func,
+        name="pow_method"
     )
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a0bf84c4d42f93ba2215c895f7e3746a2d9b177a..36a39dc3d16e6fae90cf5ec5aabb269457a234d3 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -764,59 +764,41 @@ class GPUArray:
         Do the pow operator.
         with new, the user can choose between ipow or just pow
         """
+        common_dtype = _get_common_dtype(self, other)
+        if new:
+            result = self._new_like_me(common_dtype)
+        else:
+            result = self
 
-        if isinstance(other, GPUArray):
-            if not self.flags.forc or not other.flags.forc:
-                raise RuntimeError(
-                    "only contiguous arrays may "
-                    "be used as arguments to this operation"
-                )
-
-            assert self.shape == other.shape
-
-            if new:
-                result = self._new_like_me(_get_common_dtype(self, other))
-            else:
-                result = self
+        # {{{ sanity checks
 
-            func = elementwise.get_pow_array_kernel(
-                self.dtype, other.dtype, result.dtype
-            )
+        if (not self.flags.forc) or (isinstance(other, GPUArray)
+                and not other.flags.forc):
+            raise RuntimeError("only contiguous arrays may "
+                    "be used as arguments to this operation")
+        assert not isinstance(other, GPUArray) or other.shape == self.shape
 
-            func.prepared_async_call(
-                self._grid,
-                self._block,
-                None,
-                self.gpudata,
-                other.gpudata,
-                result.gpudata,
-                self.mem_size,
-            )
+        # }}}
 
-            return result
-        else:
-            if not self.flags.forc:
-                raise RuntimeError(
-                    "only contiguous arrays may "
-                    "be used as arguments to this operation"
-                )
+        func = elementwise.get_pow_array_kernel(
+            self.dtype,
+            common_dtype if np.isscalar(other) else other.dtype,
+            result.dtype,
+            not np.isscalar(self),
+            not np.isscalar(other)
+        )
 
-            if new:
-                result = self._new_like_me()
-            else:
-                result = self
-            func = elementwise.get_pow_kernel(self.dtype)
-            func.prepared_async_call(
-                self._grid,
-                self._block,
-                None,
-                other,
-                self.gpudata,
-                result.gpudata,
-                self.mem_size,
-            )
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            None,
+            self.gpudata,
+            other if np.isscalar(other) else other.gpudata,
+            result.gpudata,
+            self.mem_size,
+        )
 
-            return result
+        return result
 
     def __pow__(self, other):
         """pow function::
@@ -839,6 +821,27 @@ class GPUArray:
         """
         return self._pow(other, new=False)
 
+    def __rpow__(self, other):
+        common_dtype = _get_common_dtype(self, other)
+        result = self._new_like_me(common_dtype)
+
+        if not np.isscalar(other):
+            # Base is a gpuarray => do not cast.
+            base = other
+        else:
+            base = common_dtype.type(other)
+
+        func = elementwise.get_pow_array_kernel(
+            base.dtype, self.dtype, result.dtype,
+            is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self))
+        # Evaluates z = x ** y
+        func.prepared_async_call(self._grid, self._block, None,
+                base if np.isscalar(base) else base.gpudata,  # x
+                self.gpudata,  # y
+                result.gpudata,  # z
+                self.mem_size)
+        return result
+
     def reverse(self, stream=None):
         """Return this array in reversed order. The array is treated
         as one-dimensional.
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index dbf4b2f7f2ebee4fbec4a810b2ff3f8437d99458..73ec3ade3f99e8830882b6271aadd70945e4b4f3 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -17,16 +17,18 @@ class TestGPUArray:
     def test_pow_array(self):
         a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b = np.array([1, 2, 3, 4, 5]).astype(np.float64)
+        b_gpu = gpuarray.to_gpu(b)
 
-        result = pow(a_gpu, a_gpu).get()
-        assert (np.abs(a ** a - result) < 1e-3).all()
+        result = pow(a_gpu, b_gpu).get()
+        np.testing.assert_allclose(a ** b, result, rtol=1e-6)
 
-        result = (a_gpu ** a_gpu).get()
-        assert (np.abs(pow(a, a) - result) < 1e-3).all()
+        result = (a_gpu ** b_gpu).get()
+        np.testing.assert_allclose(pow(a, b), result, rtol=1e-6)
 
-        a_gpu **= a_gpu
+        a_gpu **= b_gpu
         a_gpu = a_gpu.get()
-        assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(pow(a, b), a_gpu, rtol=1e-6)
 
     @mark_cuda_test
     def test_pow_number(self):
@@ -34,11 +36,26 @@ class TestGPUArray:
         a_gpu = gpuarray.to_gpu(a)
 
         result = pow(a_gpu, 2).get()
-        assert (np.abs(a ** 2 - result) < 1e-3).all()
+        np.testing.assert_allclose(a ** 2, result, rtol=1e-6)
 
         a_gpu **= 2
         a_gpu = a_gpu.get()
-        assert (np.abs(a ** 2 - a_gpu) < 1e-3).all()
+        np.testing.assert_allclose(a ** 2, a_gpu, rtol=1e-6)
+
+    @mark_cuda_test
+    def test_rpow_array(self):
+        scalar = np.random.rand()
+        a = abs(np.random.rand(10))
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (scalar ** a_gpu).get()
+        np.testing.assert_allclose(scalar ** a, result)
+
+        result = (a_gpu ** a_gpu).get()
+        np.testing.assert_allclose(a ** a, result)
+
+        result = (a_gpu ** scalar).get()
+        np.testing.assert_allclose(a ** scalar, result)
 
     @mark_cuda_test
     def test_numpy_integer_shape(self):
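For context, a minimal usage sketch of the behavior this patch enables (illustrative only, not part of the diff); it assumes a PyCUDA build with this change applied and that pycuda.autoinit can create a CUDA context:

# Usage sketch: exercises the pow/rpow paths added above.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

a = np.random.rand(5).astype(np.float32)
b = np.random.rand(5).astype(np.float64)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

print((a_gpu ** b_gpu).get())  # array ** array with mixed dtypes -> common (float64) result
print((a_gpu ** 3.0).get())    # array ** scalar, scalar exponent passed to the kernel
print((2.0 ** a_gpu).get())    # scalar ** array, dispatched through the new __rpow__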