diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 629c440608ac7c48a3026d3e1c635aa3c5564dce..7d633011ce727f549b5a7ce929a0f5594de2ad3a 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -43,7 +43,6 @@ def get_elwise_module(
     after_loop="",
 ):
     from pycuda.compiler import SourceModule
-
     return SourceModule(
         """
         #include <pycuda-complex.hpp>
@@ -464,7 +463,20 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z):


 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
+                      x_is_scalar=False, y_is_scalar=False):
+    """
+    Returns a kernel corresponding to ``z = ax + by``.
+
+    :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
+    :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
+    """
+    out_t = dtype_to_ctype(dtype_z)
+    x = "x[0]" if x_is_scalar else "x[i]"
+    ax = f"a*(({out_t}) {x})"
+    y = "y[0]" if y_is_scalar else "y[i]"
+    by = f"b*(({out_t}) {y})"
+    result = f"{ax} + {by}"
     return get_elwise_kernel(
         "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
         % {
@@ -472,7 +484,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = a*x[i] + b*y[i]",
+        f"z[i] = {result}",
         "axpbyz",
     )

@@ -488,7 +500,17 @@ def get_axpbz_kernel(dtype_x, dtype_z):


 @context_dependent_memoize
-def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
+def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
+                         x_is_scalar=False, y_is_scalar=False):
+    """
+    Returns a kernel corresponding to ``z = x (operator) y``.
+
+    :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
+    :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
+ """ + x = "x[0]" if x_is_scalar else "x[i]" + y = "y[0]" if y_is_scalar else "y[i]" + result = f"{x} {operator} {y}" return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { @@ -496,7 +518,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, + f"z[i] = {result}", "multiply", ) @@ -760,8 +782,8 @@ def get_scalar_op_kernel(dtype_x, dtype_a, dtype_y, operator): "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), - "tp_a": dtype_to_ctype(dtype_a), "tp_y": dtype_to_ctype(dtype_y), + "tp_a": dtype_to_ctype(dtype_a), }, "y[i] = x[i] %s a" % operator, "scalarop_kernel", diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e4bd10afbae6ecfb4f4c8a6ecf36d86ba5237236..316715d82d6daa89c65515614e246ca917753612 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -25,6 +25,18 @@ def _get_common_dtype(obj1, obj2): return _get_common_dtype_base(obj1, obj2, has_double_support()) +def _get_broadcasted_binary_op_result(obj1, obj2, + dtype_getter=_get_common_dtype): + + if obj1.shape == obj2.shape: + return obj1._new_like_me(dtype_getter(obj1, obj2)) + elif obj1.shape == (): + return obj2._new_like_me(dtype_getter(obj1, obj2)) + elif obj2.shape == (): + return obj1._new_like_me(dtype_getter(obj1, obj2)) + else: + raise NotImplementedError("Broadcasting binary operator with shapes:" + f" {obj1.shape}, {obj2.shape}.") # {{{ vector types @@ -141,20 +153,22 @@ def _make_binary_op(operator): raise RuntimeError( "only contiguous arrays may " "be used as arguments to this operation" ) - - if isinstance(other, GPUArray): - assert self.shape == other.shape - + if isinstance(other, GPUArray) and (self, GPUArray): if not other.flags.forc: raise RuntimeError( "only contiguous arrays may " "be used as arguments to this operation" ) - result = self._new_like_me() + result = _get_broadcasted_binary_op_result(self, other) func = elementwise.get_binary_op_kernel( - self.dtype, other.dtype, result.dtype, operator - ) + self.dtype, + other.dtype, + result.dtype, + operator, + x_is_scalar=(self.shape == ()), + y_is_scalar=(other.shape == ())) + func.prepared_async_call( self._grid, self._block, @@ -166,7 +180,8 @@ def _make_binary_op(operator): ) return result - else: # scalar operator + elif isinstance(self, GPUArray): # scalar operator + assert np.isscalar(other) result = self._new_like_me() func = elementwise.get_scalar_op_kernel(self.dtype, np.dtype(type(other)), @@ -181,6 +196,8 @@ def _make_binary_op(operator): self.mem_size, ) return result + else: + return AssertionError return func @@ -400,38 +417,41 @@ class GPUArray: def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" - assert self.shape == other.shape if not self.flags.forc or not other.flags.forc: raise RuntimeError( "only contiguous arrays may " "be used as arguments to this operation" ) - - func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) - + assert ((self.shape == other.shape == out.shape) + or ((self.shape == ()) and other.shape == out.shape) + or ((other.shape == ()) and self.shape == out.shape)) + func = elementwise.get_axpbyz_kernel( + self.dtype, other.dtype, out.dtype, + x_is_scalar=(self.shape == ()), + y_is_scalar=(other.shape == ())) if add_timer is not None: add_timer( 3 * self.size, func.prepared_timed_call( - self._grid, + out._grid, selffac, - 
-                    self.gpudata,
+                    out.gpudata,
                    otherfac,
                    other.gpudata,
                    out.gpudata,
-                    self.mem_size,
+                    out.mem_size,
                ),
            )
        else:
            func.prepared_async_call(
-                self._grid,
-                self._block,
+                out._grid,
+                out._block,
                stream,
                selffac,
                self.gpudata,
                otherfac,
                other.gpudata,
                out.gpudata,
-                self.mem_size,
+                out.mem_size,
            )

        return out

@@ -463,16 +483,26 @@ class GPUArray:
            raise RuntimeError(
                "only contiguous arrays may "
                "be used as arguments to this operation"
            )
+        assert ((self.shape == other.shape == out.shape)
+                or ((self.shape == ()) and other.shape == out.shape)
+                or ((other.shape == ()) and self.shape == out.shape))
+
+        func = elementwise.get_binary_op_kernel(
+            self.dtype,
+            other.dtype,
+            out.dtype,
+            "*",
+            x_is_scalar=(self.shape == ()),
+            y_is_scalar=(other.shape == ()))
-        func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*")
        func.prepared_async_call(
-            self._grid,
-            self._block,
+            out._grid,
+            out._block,
            stream,
            self.gpudata,
            other.gpudata,
            out.gpudata,
-            self.mem_size,
+            out.mem_size,
        )

        return out

@@ -509,17 +539,25 @@ class GPUArray:
                "only contiguous arrays may "
                "be used as arguments to this operation"
            )
-        assert self.shape == other.shape
+        assert ((self.shape == other.shape == out.shape)
+                or ((self.shape == ()) and other.shape == out.shape)
+                or ((other.shape == ()) and self.shape == out.shape))

-        func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/")
+        func = elementwise.get_binary_op_kernel(
+            self.dtype,
+            other.dtype,
+            out.dtype,
+            "/",
+            x_is_scalar=(self.shape == ()),
+            y_is_scalar=(other.shape == ()))
        func.prepared_async_call(
-            self._grid,
-            self._block,
+            out._grid,
+            out._block,
            stream,
            self.gpudata,
            other.gpudata,
            out.gpudata,
-            self.mem_size,
+            out.mem_size,
        )

        return out

@@ -546,31 +584,35 @@ class GPUArray:

        if isinstance(other, GPUArray):
            # add another vector
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
            return self._axpbyz(1, other, 1, result)
-        else:
+
+        elif np.isscalar(other):
            # add a scalar
            if other == 0:
                return self.copy()
            else:
                result = self._new_like_me(_get_common_dtype(self, other))
                return self._axpbz(1, other, result)
-
+        else:
+            return NotImplemented
    __radd__ = __add__

    def __sub__(self, other):
        """Substract an array from an array or a scalar from an array."""

        if isinstance(other, GPUArray):
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
            return self._axpbyz(1, other, -1, result)
-        else:
+        elif np.isscalar(other):
            if other == 0:
                return self.copy()
            else:
                # create a new array for the result
                result = self._new_like_me(_get_common_dtype(self, other))
                return self._axpbz(1, -other, result)
+        else:
+            return NotImplemented

    def __rsub__(self, other):
        """Substracts an array by a scalar or an array::

@@ -602,11 +644,13 @@ class GPUArray:

    def __mul__(self, other):
        if isinstance(other, GPUArray):
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
            return self._elwise_multiply(other, result)
-        else:
+        elif np.isscalar(other):
            result = self._new_like_me(_get_common_dtype(self, other))
            return self._axpbz(other, 0, result)
+        else:
+            return NotImplemented

    def __rmul__(self, scalar):
        result = self._new_like_me(_get_common_dtype(self, scalar))

@@ -624,16 +668,17 @@ class GPUArray:
           x = self / n
        """
        if isinstance(other, GPUArray):
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
            return self._div(other, result)
-        else:
+        elif np.isscalar(other):
            if other == 1:
                return self.copy()
            else:
                # create a new array for the result
                result = self._new_like_me(_get_common_dtype(self, other))
                return self._axpbz(1 / other, 0, result)
-
+        else:
+            return NotImplemented
    __truediv__ = __div__

    def __rdiv__(self, other):
        """Substracts an array by a scalar or an array::
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index d4e03f7d154021cfdeb5adf327c0469185d554e2..881ac6e99dce782bbb3a99970dc0ac46ac3b96d4 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -117,13 +117,24 @@ class TestGPUArray:
        """Test the multiplication of two arrays."""

        a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+        b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
+        c = np.array(2)

        a_gpu = gpuarray.to_gpu(a)
-        b_gpu = gpuarray.to_gpu(a)
+        b_gpu = gpuarray.to_gpu(b)
+        c_gpu = gpuarray.to_gpu(c)

-        a_squared = (b_gpu * a_gpu).get()
+        a_mul_b = (a_gpu * b_gpu).get()
+        assert (a * b == a_mul_b).all()

-        assert (a * a == a_squared).all()
+        b_mul_a = (b_gpu * a_gpu).get()
+        assert (b * a == b_mul_a).all()
+
+        a_mul_c = (a_gpu * c_gpu).get()
+        assert (a * c == a_mul_c).all()
+
+        b_mul_c = (b_gpu * c_gpu).get()
+        assert (b * c == b_mul_c).all()

    def test_unit_multiply_array(self):

@@ -138,9 +149,19 @@ class TestGPUArray:
        a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
        a_gpu = gpuarray.to_gpu(a)
+        b = np.array(1).astype(np.float32)
+        b_gpu = gpuarray.to_gpu(b)

        a_added = (a_gpu + a_gpu).get()
+        a_added_scalar = (a_gpu + 1).get()
+        scalar_added_a = (1 + a_gpu).get()
+        a_gpu_pl_b_gpu = (a_gpu + b_gpu).get()
+        b_gpu_pl_a_gpu = (b_gpu + a_gpu).get()

        assert (a + a == a_added).all()
+        assert (a + 1 == a_added_scalar).all()
+        assert (1 + a == scalar_added_a).all()
+        assert (a + b == a_gpu_pl_b_gpu).all()
+        assert (b + a == b_gpu_pl_a_gpu).all()

    def test_iaddition_array(self):
        """Test the inplace addition of two arrays."""

@@ -176,9 +197,11 @@ class TestGPUArray:
        # test data
        a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
        b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
+        c = np.array(1).astype(np.float32)

        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
+        c_gpu = gpuarray.to_gpu(c)

        result = (a_gpu - b_gpu).get()
        assert (a - b == result).all()
@@ -186,6 +209,12 @@ class TestGPUArray:
        result = (b_gpu - a_gpu).get()
        assert (b - a == result).all()

+        result = (a_gpu - c_gpu).get()
+        assert (a - c == result).all()
+
+        result = (c_gpu - a_gpu).get()
+        assert (c - a == result).all()
+
    def test_substract_scalar(self):
        """Test the subtraction of an array and a scalar."""

@@ -219,9 +248,11 @@ class TestGPUArray:
        # test data
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
+        c = np.array(2)

        a_gpu = gpuarray.to_gpu(a)
        b_gpu = gpuarray.to_gpu(b)
+        c_gpu = gpuarray.to_gpu(c)

        a_divide = (a_gpu / b_gpu).get()
        assert (np.abs(a / b - a_divide) < 1e-3).all()
@@ -229,6 +260,12 @@ class TestGPUArray:
        a_divide = (b_gpu / a_gpu).get()
        assert (np.abs(b / a - a_divide) < 1e-3).all()

+        a_divide = (a_gpu / c_gpu).get()
+        assert (np.abs(a / c - a_divide) < 1e-3).all()
+
+        a_divide = (c_gpu / a_gpu).get()
+        assert (np.abs(c / a - a_divide) < 1e-3).all()
+
    def test_random(self):
        from pycuda.curandom import rand as curand
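
Not part of the patch: a minimal usage sketch of the behavior these changes enable, assuming a CUDA-capable device and a context created via `pycuda.autoinit`; the array names `a` and `s` are illustrative only and mirror the new tests above.

```python
# Usage sketch: shape-() GPUArrays now broadcast against full arrays in +, -, *, /.
import numpy as np

import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.gpuarray as gpuarray

a = gpuarray.to_gpu(np.arange(10, dtype=np.float32))   # ordinary 1-d array
s = gpuarray.to_gpu(np.array(2, dtype=np.float32))     # shape-() "scalar" GPUArray

# With this patch these operations no longer require matching shapes:
assert np.allclose((a * s).get(), a.get() * 2)
assert np.allclose((s + a).get(), 2 + a.get())
assert np.allclose((a / s).get(), a.get() / 2)
```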