diff --git a/pyopencl/array.py b/pyopencl/array.py
index 0da86e57033695ea8606e3c69f5c364665d644f2..13bb24f36e455ba62aabdfb7feb4e2dea3320c91 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1065,7 +1065,7 @@ class Array(object):
     def __rdiv__(self, other):
         """Divides an array by a scalar or an array, i.e. ``other / self``.
         """
-
+        
         if isinstance(other, Array):
             result = self._new_like_me(
                     _get_common_dtype(self, other, self.queue))
@@ -1083,16 +1083,31 @@ class Array(object):
 
 
     def __itruediv__(self, other):
+        """Divides this array in place by a scalar or an array,
+        i.e. ``self /= other``.
+
+        Raises :exc:`TypeError` if the common dtype of *self* and *other*
+        is not the dtype of *self*. Returns *self*.
+        """
+        common_dtype = _get_common_dtype(self, other, self.queue)
+        # Compare dtypes by value: equal dtypes need not be the same object.
+        if common_dtype != self.dtype:
+            raise TypeError(
+                "Cannot cast division output from {!r} to {!r}".format(
+                    common_dtype, self.dtype))
+
         if isinstance(other, Array):
             self.add_event(
-                    self._div(self, self, other))
+                self._div(self, self, other))
         else:
             if other == 1:
-                return self.copy()
+                # Dividing by one changes nothing; an in-place operator
+                # must still hand back *self* rather than a copy.
+                return self
             else:
-                common_dtype = _get_common_dtype(self, other, self.queue)
+                # cast 1/other to float32, as float64 might not be available...
                 self.add_event(
-                    self._axpbz(self, common_dtype.type(1/other), self, self.dtype.type(0)))
+                    self._axpbz(self, np.float32(1/other), self, common_dtype.type(0)))
 
         return self
         
diff --git a/test/test_array.py b/test/test_array.py
index ff0861aed5a7fa5ef74a2e41f8b1f1d854a78492..34497363a2b4b66f95a01a22164879ff0ff0237c 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -507,13 +507,13 @@ def test_divide_inplace_scalar(ctx_factory):
 
     for dtype in (np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.float32):
         #test data
-        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
-        s = 3.14159
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype)
+        s = 40
 
         a_gpu = cl_array.to_device(queue, a)
         a_gpu /= s
         a_divide = a_gpu.get()
-        assert (np.abs(a / s - a_divide) < 1e-3).all()
+        assert (np.abs((a / s).astype(dtype) - a_divide) < 1e-3).all()
 
 def test_divide_inplace_array(ctx_factory):
     """Test inplace division of arrays."""
@@ -523,8 +523,8 @@ def test_divide_inplace_array(ctx_factory):
 
     for dtype in (np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.float32):
         #test data
-        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
-        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype)
+        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype)
 
         a_gpu = cl_array.to_device(queue, a)
         b_gpu = cl_array.to_device(queue, b)