diff --git a/pyopencl/array.py b/pyopencl/array.py
index d3d353d1bca6b70bc6e42127f8ab0da770ded2f4..b06dd70679651fe14fcc1c195c154cee67e3cc3c 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -50,6 +50,25 @@ def _get_common_dtype(obj1, obj2, queue):
                                   has_double_support(queue.device))
 
 
+def _get_truedivide_dtype(obj1, obj2, queue):
+    # the dtype of the division result obj1 / obj2
+
+    allow_double = has_double_support(queue.device)
+
+    x1 = obj1 if np.isscalar(obj1) else np.ones(1, obj1.dtype)
+    x2 = obj2 if np.isscalar(obj2) else np.ones(1, obj2.dtype)
+
+    result = (x1/x2).dtype
+
+    if not allow_double:
+        if result == np.float64:
+            result = np.dtype(np.float32)
+        elif result == np.complex128:
+            result = np.dtype(np.complex64)
+
+    return result
+
+
 # Work around PyPy not currently supporting the object dtype.
 # (Yes, it doesn't even support checking!)
 # (as of May 27, 2014 on PyPy 2.3)
@@ -1080,20 +1099,20 @@ class Array:
     def __div__(self, other):
         """Divides an array by an array or a scalar, i.e. ``self / other``.
         """
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
         if isinstance(other, Array):
-            result = self._new_like_me(
-                    _get_common_dtype(self, other, self.queue))
+            result = self._new_like_me(common_dtype)
             result.add_event(self._div(result, self, other))
         else:
             if other == 1:
                 return self.copy()
             else:
                 # create a new array for the result
-                common_dtype = _get_common_dtype(self, other, self.queue)
                 result = self._new_like_me(common_dtype)
                 result.add_event(
                         self._axpbz(result,
-                            common_dtype.type(1/other), self, self.dtype.type(0)))
+                                    np.true_divide(common_dtype.type(1), other),
+                                    self, self.dtype.type(0)))
 
         return result
 
@@ -1102,14 +1121,13 @@ class Array:
     def __rdiv__(self, other):
         """Divides an array by a scalar or an array, i.e. ``other / self``.
         """
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
 
         if isinstance(other, Array):
-            result = self._new_like_me(
-                    _get_common_dtype(self, other, self.queue))
+            result = self._new_like_me(common_dtype)
             result.add_event(other._div(result, self))
         else:
             # create a new array for the result
-            common_dtype = _get_common_dtype(self, other, self.queue)
             result = self._new_like_me(common_dtype)
             result.add_event(
                     self._rdiv_scalar(result, self, common_dtype.type(other)))
@@ -1118,6 +1136,26 @@ class Array:
 
     __rtruediv__ = __rdiv__
 
+    def __itruediv__(self, other):
+        # raise an error if the result cannot be cast to self
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
+        if not np.can_cast(common_dtype, self.dtype.type):
+            raise TypeError("Cannot cast {!r} to {!r}"
+                            .format(self.dtype, common_dtype))
+
+        if isinstance(other, Array):
+            self.add_event(
+                self._div(self, self, other))
+        else:
+            if other == 1:
+                return self
+            else:
+                self.add_event(
+                    self._axpbz(self, common_dtype.type(np.true_divide(1, other)),
+                                self, self.dtype.type(0)))
+
+        return self
+
     def __and__(self, other):
         common_dtype = _get_common_dtype(self, other, self.queue)
 
diff --git a/test/test_array.py b/test/test_array.py
index 2cbef16c0d8bbd168ccb9a371f1904cb8cfc022a..39f8fd74e572c49e19d1614d1c352fb625f5553b 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -471,14 +471,29 @@ def test_divide_scalar(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
-    a_gpu = cl_array.to_device(queue, a)
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
+
+    from itertools import product
 
-    result = (a_gpu / 2).get()
-    assert (a / 2 == result).all()
+    for dtype_a, dtype_s in product(dtypes, repeat=2):
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        s = dtype_s(40)
+        a_gpu = cl_array.to_device(queue, a)
 
-    result = (2 / a_gpu).get()
-    assert (np.abs(2 / a - result) < 1e-5).all()
+        b = a / s
+        b_gpu = a_gpu / s
+        assert (np.abs(b_gpu.get() - b) < 1e-3).all()
+        assert b_gpu.dtype is b.dtype
+
+        c = s / a
+        c_gpu = s / a_gpu
+        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
+        assert c_gpu.dtype is c.dtype
 
 
 def test_divide_array(ctx_factory):
@@ -487,18 +502,100 @@ def test_divide_array(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    #test data
-    a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
-    b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
+    dtypes = (np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
 
-    a_gpu = cl_array.to_device(queue, a)
-    b_gpu = cl_array.to_device(queue, b)
+    from itertools import product
+
+    for dtype_a, dtype_b in product(dtypes, repeat=2):
+
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)
+
+        a_gpu = cl_array.to_device(queue, a)
+        b_gpu = cl_array.to_device(queue, b)
+        c = a / b
+        c_gpu = (a_gpu / b_gpu)
+        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
+        assert c_gpu.dtype is c.dtype
 
-    a_divide = (a_gpu / b_gpu).get()
-    assert (np.abs(a / b - a_divide) < 1e-3).all()
+        d = b / a
+        d_gpu = (b_gpu / a_gpu)
+        assert (np.abs(d_gpu.get() - d) < 1e-3).all()
+        assert d_gpu.dtype is d.dtype
+
+
+def test_divide_inplace_scalar(ctx_factory):
+    """Test inplace division of arrays and a scalar."""
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
+
+    from itertools import product
+
+    for dtype_a, dtype_s in product(dtypes, repeat=2):
+
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        s = dtype_s(40)
+        a_gpu = cl_array.to_device(queue, a)
+
+        # ensure the same behavior as inplace numpy.ndarray division
+        try:
+            a /= s
+        except TypeError:
+            with np.testing.assert_raises(TypeError):
+                a_gpu /= s
+        else:
+            a_gpu /= s
+            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
+            assert a_gpu.dtype is a.dtype
+
+
+def test_divide_inplace_array(ctx_factory):
+    """Test inplace division of arrays."""
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
+
+    from itertools import product
 
-    a_divide = (b_gpu / a_gpu).get()
-    assert (np.abs(b / a - a_divide) < 1e-3).all()
+    for dtype_a, dtype_b in product(dtypes, repeat=2):
+        print(dtype_a, dtype_b)
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)
+
+        a_gpu = cl_array.to_device(queue, a)
+        b_gpu = cl_array.to_device(queue, b)
+
+        # ensure the same behavior as inplace numpy.ndarray division
+        try:
+            a_gpu /= b_gpu
+        except TypeError:
+            # pass for now, as numpy casts differently for in-place and out-place
+            # true_divide
+            pass
+            # with np.testing.assert_raises(TypeError):
+            #     a /= b
+        else:
+            a /= b
+            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
+            assert a_gpu.dtype is a.dtype
 
 
 def test_bitwise(ctx_factory):