diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 7d633011ce727f549b5a7ce929a0f5594de2ad3a..a688a8db9e0669d5a5d5a60da2b41e7349b2b7d2 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -499,18 +499,64 @@ def get_axpbz_kernel(dtype_x, dtype_z):
     )
 
 
+@context_dependent_memoize
+def get_xfloordivyz_kernel(dtype_x, dtype_z):
+    r"""
+    Returns a kernel corresponding to the operation
+    :math:`\lfloor \frac{\text{self}}{\text{other}} \rfloor`. Replicates
+    the functionality of :func:`numpy.floor_divide`.
+    """
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z"
+        % {"tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z)},
+        # Floor division for integral operands on top of C's truncating "/":
+        # when the operand signs differ, bias the numerator by (|y| - 1) so
+        # that truncation toward zero lands on the floored quotient,
+        # e.g. x[i] = -7, y = 2: (-7 - 1) / 2 = -4 == floor(-7 / 2).
+        "z[i] = (int) (( ((x[i]<0) != (y<0)) ? x[i] - (y + (y < 0) - (y>=0)) : x[i]) / y)",
+        "xfloordivyz_kernel",
+    )
+
+
 @context_dependent_memoize
 def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
-        x_is_scalar=False, y_is_scalar=False):
+        x_is_scalar, y_is_scalar):
     """
     Returns a kernel corresponding to ``z = x (operator) y``.
 
     :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
     :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
     """
+    out_t = dtype_to_ctype(dtype_z)
+    x = "x[0]" if x_is_scalar else "x[i]"
+    x_cast = f"({out_t}) {x}"
+    y = "y[0]" if y_is_scalar else "y[i]"
+    y_cast = f"({out_t}) {y}"
+    result = f"{x_cast} {operator} {y_cast}"
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
+        % {
+            "tp_x": dtype_to_ctype(dtype_x),
+            "tp_y": dtype_to_ctype(dtype_y),
+            "tp_z": dtype_to_ctype(dtype_z),
+        },
+        f"z[i] = {result}",
+        "binaryop",
+    )
+
+
+@context_dependent_memoize
+def get_binary_op_kernel_intcast(dtype_x, dtype_y, dtype_z, operator,
+        x_is_scalar, y_is_scalar):
+    """
+    Returns a kernel corresponding to ``z = x (operator) y``, with the
+    result truncated to an integer after rounding toward negative
+    infinity (floor-division semantics).
+
+    :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
+    :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
+    """
+    out_t = dtype_to_ctype(dtype_z)
     x = "x[0]" if x_is_scalar else "x[i]"
+    x_type_cast = f"({out_t}) {x}"
     y = "y[0]" if y_is_scalar else "y[i]"
-    result = f"{x} {operator} {y}"
+    y_type_cast = f"({out_t}) {y}"
+    # same sign-adjustment trick as in get_xfloordivyz_kernel above
+    x_cast = f"((({x_type_cast}<0) != ({y_type_cast}<0)) ? {x_type_cast} - ({y_type_cast} + ({y_type_cast} < 0) - ({y_type_cast}>=0)) : {x_type_cast})"
+    result = f"(int) ({x_cast} {operator} {y_type_cast})"
     return get_elwise_kernel(
         "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
         % {
@@ -519,7 +565,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
             "tp_z": dtype_to_ctype(dtype_z),
         },
         f"z[i] = {result}",
-        "multiply",
+        "binaryop_intcast",
     )
 
 
@@ -536,6 +582,19 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z):
     )
 
 
+@context_dependent_memoize
+def get_rfloordivide_elwise_kernel(dtype_x, dtype_z):
+    return get_elwise_kernel(
+        "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z"
+        % {
+            "tp_x": dtype_to_ctype(dtype_x),
+            "tp_z": dtype_to_ctype(dtype_z),
+        },
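+        # same floor-division rewrite as in get_xfloordivyz_kernel above,
+        # with the roles swapped: computes y // x[i] for a scalar numerator y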
+        "z[i] = (int) (( ((y<0) != (x[i]<0)) ? y - (x[i] + (x[i] < 0) - (x[i]>=0)) : y) / (x[i]))",
+        "floordivide_r",
+    )
+
+
 @context_dependent_memoize
 def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 100d21cc7a75325d93295bdc8a89d0bf3e439b01..bed31889b31545b83bd66caf6c313b3fa731918a 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -25,9 +25,18 @@ def _get_common_dtype(obj1, obj2):
     return _get_common_dtype_base(obj1, obj2, has_double_support())
 
 
+def _truediv_result_type(obj1, obj2):
+    dtype = _get_common_dtype(obj1, obj2)
+    # See: test_true_divide in numpy/core/tests/test_ufunc.py
+    # pylint: disable=no-member
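+    # e.g. np.true_divide(np.int32(6), np.int32(4)) returns 1.5 as float64,
+    # so integer inputs must promote to a floating-point result dtype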
+    if dtype.kind in "iu":
+        return np.dtype(np.float64)
+    else:
+        return dtype
+
+
 def _get_broadcasted_binary_op_result(obj1, obj2,
         dtype_getter=_get_common_dtype):
-
     if obj1.shape == obj2.shape:
         return obj1._new_like_me(dtype_getter(obj1, obj2))
     elif obj1.shape == ():
@@ -37,6 +46,7 @@
     else:
         raise NotImplementedError("Broadcasting binary operator with shapes:"
                                   f" {obj1.shape}, {obj2.shape}.")
+
 
 # {{{ vector types
 
@@ -478,6 +488,27 @@ class GPUArray:
 
         return out
 
+    def _xfloordivyz(self, other, out, stream=None):
+        """Compute ``out = self // other``, where *other* is a scalar."""
+
+        if not self.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+
+        func = elementwise.get_xfloordivyz_kernel(self.dtype, out.dtype)
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            stream,
+            self.gpudata,
+            other,
+            out.gpudata,
+            self.mem_size,
+        )
+
+        return out
+
     def _elwise_multiply(self, other, out, stream=None):
         if not self.flags.forc:
             raise RuntimeError(
@@ -531,6 +562,29 @@ class GPUArray:
 
         return out
 
+    def _rfloordiv_scalar(self, other, out, stream=None):
+        """Floor-divides a scalar by an array::
+
+        y = n // self
+        """
+
+        if not self.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+        func = elementwise.get_rfloordivide_elwise_kernel(self.dtype, out.dtype)
+        func.prepared_async_call(
+            self._grid,
+            self._block,
+            stream,
+            self.gpudata,
+            other,
+            out.gpudata,
+            self.mem_size,
+        )
+
+        return out
+
     def _div(self, other, out, stream=None):
         """Divides an array by another array."""
 
@@ -550,6 +604,7 @@ class GPUArray:
             "/",
             x_is_scalar=(self.shape == ()),
             y_is_scalar=(other.shape == ()))
+
         func.prepared_async_call(
             out._grid,
             out._block,
@@ -562,6 +617,35 @@ class GPUArray:
 
         return out
 
+    def _floordiv(self, other, out, stream=None):
+        """Floor-divides an array by another array."""
+
+        if not self.flags.forc or not other.flags.forc:
+            raise RuntimeError(
+                "only contiguous arrays may " "be used as arguments to this operation"
+            )
+
+        assert ((self.shape == other.shape == out.shape)
+                or ((self.shape == ()) and other.shape == out.shape)
+                or ((other.shape == ()) and self.shape == out.shape))
+
+        func = elementwise.get_binary_op_kernel_intcast(self.dtype, other.dtype,
+                out.dtype, "/",
+                x_is_scalar=(self.shape == ()),
+                y_is_scalar=(other.shape == ()))
+
+        # launch over the (possibly broadcast) output, since self may be 0-d
+        func.prepared_async_call(
+            out._grid,
+            out._block,
+            stream,
+            self.gpudata,
+            other.gpudata,
+            out.gpudata,
+            out.mem_size,
+        )
+
+        return out
+
     def _new_like_me(self, dtype=None, order="C"):
         strides = None
         if dtype is None:
@@ -668,30 +752,55 @@ class GPUArray:
         x = self / n
         """
         if isinstance(other, GPUArray):
-            result = _get_broadcasted_binary_op_result(self, other)
+            result = _get_broadcasted_binary_op_result(
+                self, other, _truediv_result_type)
             return self._div(other, result)
         elif np.isscalar(other):
             if other == 1:
                 return self.copy()
             else:
                 # create a new array for the result
-                result = self._new_like_me(_get_common_dtype(self, other))
+                result = self._new_like_me(_truediv_result_type(self, other))
                 return self._axpbz(1 / other, 0, result)
         else:
             return NotImplemented
 
     __truediv__ = __div__
 
+    def __floordiv__(self, other):
+        """Floor-divides an array by an array or a scalar::
+
+        x = self // n
+        """
+        if isinstance(other, GPUArray):
+            result = _get_broadcasted_binary_op_result(self, other)
+            return self._floordiv(other, result)
+        else:
+            if other == 1 and self.dtype.kind in "iu":
+                # integer division by 1 is the identity; for floats,
+                # x // 1 still floors, so fall through to the kernel
+                return self.copy()
+            else:
+                # create a new array for the result
+                result = self._new_like_me(_get_common_dtype(self, other))
+                return self._xfloordivyz(other, result)
+
     def __rdiv__(self, other):
         """Divides an array by a scalar or an array::
 
         x = n / self
         """
         # create a new array for the result
-        result = self._new_like_me(_get_common_dtype(self, other))
+        result = self._new_like_me(_truediv_result_type(self, other))
         return self._rdiv_scalar(other, result)
 
     __rtruediv__ = __rdiv__
 
+    def __rfloordiv__(self, other):
+        """Floor-divides a scalar by an array::
+
+        x = n // self
+        """
+        # create a new array for the result
+        result = self._new_like_me(_get_common_dtype(self, other))
+        return self._rfloordiv_scalar(other, result)
+
     def __idiv__(self, other):
         """Divides an array by an array or a scalar::
 
         x /= n
@@ -707,6 +816,19 @@ class GPUArray:
 
     __itruediv__ = __idiv__
 
+    def __ifloordiv__(self, other):
+        """Floor-divides an array in place by an array or a scalar::
+
+        x //= n
+        """
+        if isinstance(other, GPUArray):
+            return self._floordiv(other, self)
+        else:
+            if other == 1 and self.dtype.kind in "iu":
+                return self
+            else:
+                return self._xfloordivyz(other, self)
+
     def fill(self, value, stream=None):
         """fills the array with the specified value"""
         if not self.flags.forc:
@@ -2191,7 +2313,9 @@ def _logical_op(x1, x2, out, allocator, operator):
     assert out.shape == x1.shape and out.dtype == np.bool_
 
     func = elementwise.get_binary_op_kernel(
-        x1.dtype, x2.dtype, out.dtype, operator
+        x1.dtype, x2.dtype, out.dtype, operator,
+        x_is_scalar=(x1.shape == ()),
+        y_is_scalar=(x2.shape == ())
     )
 
     func.prepared_async_call(out._grid, out._block, None,
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 7091fee9481c52f4109ea1085a0d2cb0fa9765dc..110292e8855c669fcab76282ed54817fabb4b14e 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -11,6 +11,7 @@ import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
 from pycuda.compiler import SourceModule
 import pytest
+import itertools
 
 
 @pytest.fixture(autouse=True)
@@ -242,29 +243,72 @@ class TestGPUArray:
         result = (2 / a_gpu).get()
         assert (2 / a == result).all()
 
-    def test_divide_array(self):
+    @pytest.mark.parametrize("dtype", [np.int32, np.float64])
+    def test_divide_array(self, dtype):
+        """Test elementwise division of arrays, including broadcasting
+        against 0-d arrays."""
+
+        arrays = [
+            np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype),
+            np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype),
+            np.array(2).astype(dtype)
+        ]
+
+        for a, b in itertools.product(arrays, arrays):
+            a_gpu = gpuarray.to_gpu(a)
+            b_gpu = gpuarray.to_gpu(b)
+            a_divide_b = (a_gpu / b_gpu).get()
+            np.testing.assert_allclose(a / b, a_divide_b, rtol=1e-5)
+
+    def test_floordivide_scalar(self):
+        """Test floor division of an array by a scalar and vice versa."""
+
+        a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+        a_gpu = gpuarray.to_gpu(a)
+
+        result = (a_gpu // 2).get()
+        assert (a // 2 == result).all()
+
+        result = (2 // a_gpu).get()
+        assert (2 // a == result).all()
+
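+        # negative operands exercise rounding toward negative infinity,
+        # where floor division differs from truncating C division
+        # (e.g. 1 // -2 == -1, while C evaluates 1 / -2 to 0)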
""" + + arrays = [ + np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype), + np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype), + np.array(2).astype(dtype) + ] + + for a, b in itertools.product(arrays, arrays): + + a_gpu = gpuarray.to_gpu(a) + b_gpu = gpuarray.to_gpu(b) + a_divide_b = (a_gpu / b_gpu).get() + np.testing.assert_allclose(a / b, a_divide_b, rtol=1e-5) + + def test_floordivide_scalar(self): + """Test the division of an array and a scalar.""" + + a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) + a_gpu = gpuarray.to_gpu(a) + + result = (a_gpu // 2).get() + assert (a // 2 == result).all() + + result = (2 // a_gpu).get() + assert (2 // a == result).all() + + result = (a_gpu // (-2)).get() + assert (a // (-2) == result).all() + + result = ((-2) // a_gpu).get() + assert ((-2) // a == result).all() + + @pytest.mark.parametrize("dtype", [np.int32, np.float]) + def test_floordivide_array(self, dtype): """Test the division of an array and a scalar. """ # test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) - c = np.array(2) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - c_gpu = gpuarray.to_gpu(c) - a_divide = (a_gpu / b_gpu).get() - assert (np.abs(a / b - a_divide) < 1e-3).all() + result = (a_gpu // b_gpu).get() + np.testing.assert_allclose(a // b, result) + + result = (b_gpu // a_gpu).get() + np.testing.assert_allclose(b // a, result) + + result = (-a_gpu // b_gpu).get() + np.testing.assert_allclose(-a // b, result) - a_divide = (b_gpu / a_gpu).get() - assert (np.abs(b / a - a_divide) < 1e-3).all() + result = (-b_gpu // a_gpu).get() + np.testing.assert_allclose(-b // a, result) - a_divide = (a_gpu / c_gpu).get() - assert (np.abs(a / c - a_divide) < 1e-3).all() + # borrowed from https://github.com/inducer/pycuda/issues/360 + a = np.zeros(10, "int32") + 1 + b = np.zeros(10, "int32") + 2 - a_divide = (c_gpu / a_gpu).get() - assert (np.abs(c / a - a_divide) < 1e-3).all() + a_gpu = gpuarray.zeros(10, "int32") + 1 + b_gpu = gpuarray.zeros(10, "int32") + 2 + result = (a_gpu // b_gpu).get() + assert (a // b == result).all() def test_random(self): from pycuda.curandom import rand as curand @@ -595,7 +639,6 @@ class TestGPUArray: np.testing.assert_array_equal(any_ary_gpu, any_ary) assert any_ary_gpu.dtype == any_ary.dtype - import itertools for _array in list(itertools.product([0, 1], [0, 1], [0, 1])): array = np.array(_array, dtype) array_gpu = gpuarray.to_gpu(array) @@ -620,7 +663,6 @@ class TestGPUArray: np.testing.assert_array_equal(all_ary_gpu, all_ary) assert all_ary_gpu.dtype == all_ary.dtype - import itertools for _array in list(itertools.product([0, 1], [0, 1], [0, 1])): array = np.array(_array, dtype) array_gpu = gpuarray.to_gpu(array)