diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 7d633011ce727f549b5a7ce929a0f5594de2ad3a..a688a8db9e0669d5a5d5a60da2b41e7349b2b7d2 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -499,18 +499,64 @@ def get_axpbz_kernel(dtype_x, dtype_z):
     )
 
 
+@context_dependent_memoize
+def get_xfloordivyz_kernel(dtype_x, dtype_z):
+    r"""
+    Returns a kernel corresponding to the operation
+    :math:`\lfloor \frac{\text{self}}{\text{other}} \rfloor`. Replicates
+    the functionality of :func:`numpy.floor_divide`.
+    """
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z"
+        % {"tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z)},
+        # Floor division for integral operands on top of C's truncating "/":
+        # when the operand signs differ, bias the numerator by (|y| - 1) so
+        # that truncation toward zero lands on the floored quotient,
+        # e.g. x[i] = -7, y = 2: (-7 - 1) / 2 = -4 == floor(-7 / 2).
+        "z[i] = (int) (( ((x[i]<0) != (y<0)) ? x[i] - (y + (y < 0) - (y>=0)) : x[i]) / y)",
+        "xfloordivyz_kernel",
+    )
+
+
 @context_dependent_memoize
 def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
-        x_is_scalar=False, y_is_scalar=False):
+        x_is_scalar, y_is_scalar):
     """
     Returns a kernel corresponding to ``z = x (operator) y``.
 
     :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
     :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
     """
+    out_t = dtype_to_ctype(dtype_z)
+    x = "x[0]" if x_is_scalar else "x[i]"
+    x_cast = f"({out_t}) {x}"
+    y = "y[0]" if y_is_scalar else "y[i]"
+    y_cast = f"({out_t}) {y}"
+    result = f"{x_cast} {operator} {y_cast}"
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        % {
+            "tp_x": dtype_to_ctype(dtype_x),
+            "tp_y": dtype_to_ctype(dtype_y),
+            "tp_z": dtype_to_ctype(dtype_z),
+        },
+        f"z[i] = {result}",
+        "binaryop",
+    )
+
+
+@context_dependent_memoize
+def get_binary_op_kernel_intcast(dtype_x, dtype_y, dtype_z, operator,
+        x_is_scalar, y_is_scalar):
+    """
+    Returns a kernel corresponding to ``z = x (operator) y``, with the
+    result truncated to an integer after rounding toward negative
+    infinity (floor-division semantics).
+
+    :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
+    :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
+    """
+    out_t = dtype_to_ctype(dtype_z)
     x = "x[0]" if x_is_scalar else "x[i]"
+    x_type_cast = f"({out_t}) {x}"
     y = "y[0]" if y_is_scalar else "y[i]"
-    result = f"{x} {operator} {y}"
+    y_type_cast = f"({out_t}) {y}"
+    # same sign-adjustment trick as in get_xfloordivyz_kernel above
+    x_cast = f"((({x_type_cast}<0) != ({y_type_cast}<0)) ? {x_type_cast} - ({y_type_cast} + ({y_type_cast} < 0) - ({y_type_cast}>=0)) : {x_type_cast})"
+    result = f"(int) ({x_cast} {operator} {y_type_cast})"
     return get_elwise_kernel(
         "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
         % {
@@ -519,7 +565,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
             "tp_z": dtype_to_ctype(dtype_z),
         },
         f"z[i] = {result}",
-        "multiply",
+        "binaryop_intcast",
     )
 
 
@@ -536,6 +582,19 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z):
     )
 
 
+@context_dependent_memoize
+def get_rfloordivide_elwise_kernel(dtype_x, dtype_z):
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z"
+        % {
+            "tp_x": dtype_to_ctype(dtype_x),
+            "tp_z": dtype_to_ctype(dtype_z),
+        },
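+        # same floor-division rewrite as in get_xfloordivyz_kernel above,
+        # with the roles swapped: computes y // x[i] for a scalar numerator y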
+        "z[i] = (int) (( ((y<0) != (x[i]<0)) ? y - (x[i] + (x[i] < 0) - (x[i]>=0)) : y) / (x[i]))",
+        "floordivide_r",
+    )
+
+
 @context_dependent_memoize
 def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 100d21cc7a75325d93295bdc8a89d0bf3e439b01..bed31889b31545b83bd66caf6c313b3fa731918a 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -25,9 +25,18 @@ def _get_common_dtype(obj1, obj2):
     return _get_common_dtype_base(obj1, obj2, has_double_support())
 
 
+def _truediv_result_type(obj1, obj2):
+    dtype = _get_common_dtype(obj1, obj2)
+    # See: test_true_divide in numpy/core/tests/test_ufunc.py
+    # pylint: disable=no-member
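+    # e.g. np.true_divide(np.int32(6), np.int32(4)) returns 1.5 as float64,
+    # so integer inputs must promote to a floating-point result dtype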
+    if dtype.kind in "iu":
+        return np.dtype(np.float64)
+    else:
+        return dtype
+
+
 def _get_broadcasted_binary_op_result(obj1, obj2,
         dtype_getter=_get_common_dtype):
-
     if obj1.shape == obj2.shape:
         return obj1._new_like_me(dtype_getter(obj1, obj2))
     elif obj1.shape == ():
@@ -37,6 +46,7 @@
     else:
         raise NotImplementedError("Broadcasting binary operator with shapes:"
                                   f" {obj1.shape}, {obj2.shape}.")
+
 
 # {{{ vector types
 
@@ -478,6 +488,27 @@ class GPUArray:
 
         return out
 
+    def _xfloordivyz(self, other, out, stream=None):
+        """Compute ``out = self // other``, where *other* is a scalar."""
+
+        if not self.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+
+        func = elementwise.get_xfloordivyz_kernel(self.dtype, out.dtype)
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            stream,
+            self.gpudata,
+            other,
+            out.gpudata,
+            self.mem_size,
+        )
+
+        return out
+
     def _elwise_multiply(self, other, out, stream=None):
         if not self.flags.forc:
             raise RuntimeError(
@@ -531,6 +562,29 @@ class GPUArray:
 
         return out
 
+    def _rfloordiv_scalar(self, other, out, stream=None):
+        """Floor-divides a scalar by an array::
+
+        y = n // self
+        """
+
+        if not self.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+        func = elementwise.get_rfloordivide_elwise_kernel(self.dtype, out.dtype)
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            stream,
+            self.gpudata,
+            other,
+            out.gpudata,
+            self.mem_size,
+        )
+
+        return out
+
     def _div(self, other, out, stream=None):
         """Divides an array by another array."""
 
@@ -550,6 +604,7 @@ class GPUArray:
             "/",
             x_is_scalar=(self.shape == ()),
             y_is_scalar=(other.shape == ()))
+
         func.prepared_async_call(
             out._grid,
             out._block,
@@ -562,6 +617,35 @@ class GPUArray:
 
         return out
 
+    def _floordiv(self, other, out, stream=None):
+        """Floor-divides an array by another array."""
+
+        if not self.flags.forc or not other.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+
+        assert ((self.shape == other.shape == out.shape)
+                or ((self.shape == ()) and other.shape == out.shape)
+                or ((other.shape == ()) and self.shape == out.shape))
+
+        func = elementwise.get_binary_op_kernel_intcast(self.dtype, other.dtype,
+                out.dtype, "/",
+                x_is_scalar=(self.shape == ()),
+                y_is_scalar=(other.shape == ()))
+
+        # launch over the (possibly broadcast) output, since self may be 0-d
+        func.prepared_async_call(
+            out._grid,
+            out._block,
+            stream,
+            self.gpudata,
+            other.gpudata,
+            out.gpudata,
+            out.mem_size,
+        )
+
+        return out
+
     def _new_like_me(self, dtype=None, order="C"):
         strides = None
         if dtype is None:
@@ -668,30 +752,55 @@ class GPUArray:
         x = self / n
         """
         if isinstance(other, GPUArray):
-            result = _get_broadcasted_binary_op_result(self, other)
+            result = _get_broadcasted_binary_op_result(
+                self, other, _truediv_result_type)
             return self._div(other, result)
         elif np.isscalar(other):
             if other == 1:
                 return self.copy()
             else:
                 # create a new array for the result
-                result = self._new_like_me(_get_common_dtype(self, other))
+                result = self._new_like_me(_truediv_result_type(self, other))
                 return self._axpbz(1 / other, 0, result)
         else:
             return NotImplemented
 
     __truediv__ = __div__
 
+    def __floordiv__(self, other):
+        """Floor-divides an array by an array or a scalar::
+
+        x = self // n
+        """
+        if isinstance(other, GPUArray):
+            result = _get_broadcasted_binary_op_result(self, other)
+            return self._floordiv(other, result)
+        else:
+            if other == 1 and self.dtype.kind in "iu":
+                # integer division by 1 is the identity; for floats,
+                # x // 1 still floors, so fall through to the kernel
+                return self.copy()
+            else:
+                # create a new array for the result
+                result = self._new_like_me(_get_common_dtype(self, other))
+                return self._xfloordivyz(other, result)
+
     def __rdiv__(self, other):
         """Divides an array by a scalar or an array::
 
         x = n / self
         """
         # create a new array for the result
-        result = self._new_like_me(_get_common_dtype(self, other))
+        result = self._new_like_me(_truediv_result_type(self, other))
         return self._rdiv_scalar(other, result)
 
     __rtruediv__ = __rdiv__
 
+    def __rfloordiv__(self, other):
+        """Floor-divides a scalar by an array::
+
+        x = n // self
+        """
+        # create a new array for the result
+        result = self._new_like_me(_get_common_dtype(self, other))
+        return self._rfloordiv_scalar(other, result)
+
     def __idiv__(self, other):
         """Divides an array by an array or a scalar::
 
         x /= n
@@ -707,6 +816,19 @@ class GPUArray:
 
     __itruediv__ = __idiv__
 
+    def __ifloordiv__(self, other):
+        """Floor-divides an array in place by an array or a scalar::
+
+        x //= n
+        """
+        if isinstance(other, GPUArray):
+            return self._floordiv(other, self)
+        else:
+            if other == 1 and self.dtype.kind in "iu":
+                return self
+            else:
+                return self._xfloordivyz(other, self)
+
     def fill(self, value, stream=None):
         """fills the array with the specified value"""
         if not self.flags.forc:
@@ -2191,7 +2313,9 @@ def _logical_op(x1, x2, out, allocator, operator):
     assert out.shape == x1.shape and out.dtype == np.bool_
 
     func = elementwise.get_binary_op_kernel(
-        x1.dtype, x2.dtype, out.dtype, operator
+        x1.dtype, x2.dtype, out.dtype, operator,
+        x_is_scalar=(x1.shape == ()),
+        y_is_scalar=(x2.shape == ())
     )
 
     func.prepared_async_call(out._grid, out._block, None,
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 7091fee9481c52f4109ea1085a0d2cb0fa9765dc..110292e8855c669fcab76282ed54817fabb4b14e 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -11,6 +11,7 @@ import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
 from pycuda.compiler import SourceModule
 import pytest
+import itertools
 
 
 @pytest.fixture(autouse=True)
@@ -242,29 +243,72 @@ class TestGPUArray:
         result = (2 / a_gpu).get()
         assert (2 / a == result).all()
 
-    def test_divide_array(self):
+    @pytest.mark.parametrize("dtype", [np.int32, np.float64])
+    def test_divide_array(self, dtype):
+        """Test elementwise division of arrays, including broadcasting
+        against 0-d arrays."""
+
+        arrays = [
+            np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype),
+            np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype),
+            np.array(2).astype(dtype)
+        ]
+
+        for a, b in itertools.product(arrays, arrays):
+            a_gpu = gpuarray.to_gpu(a)
+            b_gpu = gpuarray.to_gpu(b)
+            a_divide_b = (a_gpu / b_gpu).get()
+            np.testing.assert_allclose(a / b, a_divide_b, rtol=1e-5)
+
+    def test_floordivide_scalar(self):
+        """Test floor division of an array by a scalar and vice versa."""
+
+        a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (a_gpu // 2).get()
+        assert (a // 2 == result).all()
+
+        result = (2 // a_gpu).get()
+        assert (2 // a == result).all()
+
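+        # negative operands exercise rounding toward negative infinity,
+        # where floor division differs from truncating C division
+        # (e.g. 1 // -2 == -1, while C evaluates 1 / -2 to 0)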
""" + + arrays = [ + np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype), + np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype), + np.array(2).astype(dtype) + ] + + for a, b in itertools.product(arrays, arrays): + + a_gpu = gpuarray.to_gpu(a) + b_gpu = gpuarray.to_gpu(b) + a_divide_b = (a_gpu / b_gpu).get() + np.testing.assert_allclose(a / b, a_divide_b, rtol=1e-5) + + def test_floordivide_scalar(self): + """Test the division of an array and a scalar.""" + + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + a_gpu = gpuarray.to_gpu(a) + + result = (a_gpu // 2).get() + assert (a // 2 == result).all() + + result = (2 // a_gpu).get() + assert (2 // a == result).all() + + result = (a_gpu // (-2)).get() + assert (a // (-2) == result).all() + + result = ((-2) // a_gpu).get() + assert ((-2) // a == result).all() + + @pytest.mark.parametrize("dtype", [np.int32, np.float]) + def test_floordivide_array(self, dtype): """Test the division of an array and a scalar. """ # test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) - c = np.array(2) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - c_gpu = gpuarray.to_gpu(c) - a_divide = (a_gpu / b_gpu).get() - assert (np.abs(a / b - a_divide) < 1e-3).all() + result = (a_gpu // b_gpu).get() + np.testing.assert_allclose(a // b, result) + + result = (b_gpu // a_gpu).get() + np.testing.assert_allclose(b // a, result) + + result = (-a_gpu // b_gpu).get() + np.testing.assert_allclose(-a // b, result) - a_divide = (b_gpu / a_gpu).get() - assert (np.abs(b / a - a_divide) < 1e-3).all() + result = (-b_gpu // a_gpu).get() + np.testing.assert_allclose(-b // a, result) - a_divide = (a_gpu / c_gpu).get() - assert (np.abs(a / c - a_divide) < 1e-3).all() + # borrowed from https://github.com/inducer/pycuda/issues/360 + a = np.zeros(10, "int32") + 1 + b = np.zeros(10, "int32") + 2 - a_divide = (c_gpu / a_gpu).get() - assert (np.abs(c / a - a_divide) < 1e-3).all() + a_gpu = gpuarray.zeros(10, "int32") + 1 + b_gpu = gpuarray.zeros(10, "int32") + 2 + result = (a_gpu // b_gpu).get() + assert (a // b == result).all() def test_random(self): from pycuda.curandom import rand as curand @@ -595,7 +639,6 @@ class TestGPUArray: np.testing.assert_array_equal(any_ary_gpu, any_ary) assert any_ary_gpu.dtype == any_ary.dtype - import itertools for _array in list(itertools.product([0, 1], [0, 1], [0, 1])): array = np.array(_array, dtype) array_gpu = gpuarray.to_gpu(array) @@ -620,7 +663,6 @@ class TestGPUArray: np.testing.assert_array_equal(all_ary_gpu, all_ary) assert all_ary_gpu.dtype == all_ary.dtype - import itertools for _array in list(itertools.product([0, 1], [0, 1], [0, 1])): array = np.array(_array, dtype) array_gpu = gpuarray.to_gpu(array)