From 07f9c5fdb86158f24e2bd88fb8baaa97159d9616 Mon Sep 17 00:00:00 2001
From: Mit Kotak
Date: Sat, 25 Jun 2022 17:38:09 -0500
Subject: [PATCH] Implemented Broadcasting in addition array

---
 pycuda/elementwise.py | 18 +++++++++++++++---
 pycuda/gpuarray.py    | 43 ++++++++++++++++++++++++++++++-------------
 test/test_gpuarray.py | 10 ++++++++++
 3 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 4e8601f0..3b929dd6 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -43,7 +43,6 @@ def get_elwise_module(
     after_loop="",
 ):
     from pycuda.compiler import SourceModule
-
     return SourceModule(
         """
         #include <pycuda-complex.hpp>
@@ -464,7 +463,20 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z):
 
 
 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
+                      x_is_scalar=False, y_is_scalar=False):
+    """
+    Returns a kernel corresponding to ``z = ax + by``.
+
+    :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
+    :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
+    """
+    out_t = dtype_to_ctype(dtype_z)
+    x = "x[0]" if x_is_scalar else "x[i]"
+    ax = f"a*(({out_t}) {x})"
+    y = "y[0]" if y_is_scalar else "y[i]"
+    by = f"b*(({out_t}) {y})"
+    result = f"{ax} + {by}"
     return get_elwise_kernel(
         "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
         % {
@@ -472,7 +484,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = a*x[i] + b*y[i]",
+        f"z[i] = {result}",
         "axpbyz",
     )
 
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index a3dbc100..03612a04 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -25,6 +25,18 @@ def _get_common_dtype(obj1, obj2):
     return _get_common_dtype_base(obj1, obj2, has_double_support())
 
 
+def _get_broadcasted_binary_op_result(obj1, obj2,
+        dtype_getter=_get_common_dtype):
+
+    if obj1.shape == obj2.shape:
+        return obj1._new_like_me(dtype_getter(obj1, obj2))
+    elif obj1.shape == ():
+        return obj2._new_like_me(dtype_getter(obj1, obj2))
+    elif obj2.shape == ():
+        return obj1._new_like_me(dtype_getter(obj1, obj2))
+    else:
+        raise NotImplementedError("Broadcasting binary operator with shapes:"
+                                  f" {obj1.shape}, {obj2.shape}.")
 # {{{ vector types
 
 
@@ -391,38 +403,41 @@ class GPUArray:
     def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None):
         """Compute ``out = selffac * self + otherfac*other``, where `other`
         is a vector.."""
-        assert self.shape == other.shape
         if not self.flags.forc or not other.flags.forc:
             raise RuntimeError(
                 "only contiguous arrays may "
                 "be used as arguments to this operation"
             )
-
-        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
-
+        assert ((self.shape == other.shape == out.shape)
+                or ((self.shape == ()) and other.shape == out.shape)
+                or ((other.shape == ()) and self.shape == out.shape))
+        func = elementwise.get_axpbyz_kernel(
+                self.dtype, other.dtype, out.dtype,
+                x_is_scalar=(self.shape == ()),
+                y_is_scalar=(other.shape == ()))
         if add_timer is not None:
             add_timer(
                 3 * self.size,
                 func.prepared_timed_call(
-                    self._grid,
+                    out._grid,
                     selffac,
-                    self.gpudata,
+                    out.gpudata,
                     otherfac,
                     other.gpudata,
                     out.gpudata,
-                    self.mem_size,
+                    out.mem_size,
                 ),
             )
         else:
             func.prepared_async_call(
-                self._grid,
-                self._block,
+                out._grid,
+                out._block,
                 stream,
                 selffac,
                 self.gpudata,
                 otherfac,
                 other.gpudata,
                 out.gpudata,
-                self.mem_size,
+                out.mem_size,
             )
         return out
@@ -537,16 +552,18 @@ class GPUArray:
 
         if isinstance(other, GPUArray):
             # add another vector
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
             return self._axpbyz(1, other, 1, result)
-        else:
+
+        elif np.isscalar(other):
             # add a scalar
             if other == 0:
                 return self.copy()
             else:
                 result = self._new_like_me(_get_common_dtype(self, other))
                 return self._axpbz(1, other, result)
-
+        else:
+            return NotImplemented
     __radd__ = __add__
 
     def __sub__(self, other):
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 73ec3ade..ec42b59f 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -133,9 +133,19 @@ class TestGPUArray:
 
         a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b = np.array(1).astype(np.float32)
+        b_gpu = gpuarray.to_gpu(b)
         a_added = (a_gpu + a_gpu).get()
+        a_added_scalar = (a_gpu + 1).get()
+        scalar_added_a = (1 + a_gpu).get()
+        a_gpu_pl_b_gpu = (a_gpu + b_gpu).get()
+        b_gpu_pl_a_gpu = (b_gpu + a_gpu).get()
 
         assert (a + a == a_added).all()
+        assert (a + 1 == a_added_scalar).all()
+        assert (1 + a == scalar_added_a).all()
+        assert (a + b == a_gpu_pl_b_gpu).all()
+        assert (b + a == b_gpu_pl_a_gpu).all()
 
     @mark_cuda_test
     def test_iaddition_array(self):
-- 
GitLab
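
Not part of the patch itself: a minimal usage sketch of the broadcasting behavior this change enables, assuming a CUDA-capable device and pycuda.autoinit for context creation. Variable names here are illustrative only; the behavior shown (scalar/0-d gpuarray broadcast against a vector, NotImplementedError for other shape mismatches) follows directly from the hunks above.

import numpy as np

import pycuda.autoinit  # noqa: F401 -- creates the CUDA context on import
import pycuda.gpuarray as gpuarray

a = np.arange(10, dtype=np.float32)
a_gpu = gpuarray.to_gpu(a)

# A 0-d ("scalar") gpuarray with shape (); the patch broadcasts it against a_gpu.
b_gpu = gpuarray.to_gpu(np.array(7, dtype=np.float32))

# shape (10,) + shape () -> shape (10,), computed by the axpbyz kernel with
# y_is_scalar=True so the kernel reads y[0] instead of y[i]
c_gpu = a_gpu + b_gpu
assert np.allclose(c_gpu.get(), a + 7)

# Shapes that differ and are both non-scalar remain unsupported and raise.
try:
    a_gpu + gpuarray.to_gpu(np.ones((2, 5), dtype=np.float32))
except NotImplementedError:
    pass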