diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d41702c9002b4d649501134dc7a47d8f4a26a90d..e7520a8c116bc17839194160c97b4bafd44126a4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -55,6 +55,7 @@ jobs:
             run: |
                 CONDA_ENVIRONMENT=.test-conda-env-py3.yml
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                ./configure.py --cl-use-shipped-ext
                 . ./build-and-test-py-project-within-miniconda.sh
 
     pytest_mac:
@@ -84,6 +85,7 @@ jobs:
                 CONDA_ENVIRONMENT=.test-conda-env-py3.yml
                 curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh
                 . ci-support.sh
+                ./configure.py --cl-use-shipped-ext
                 build_py_project_in_conda_env
                 build_docs
 
diff --git a/doc/make_constants.py b/doc/make_constants.py
index cca0b7ce459c7dd091c98ea1bf64e6a51835286f..803a52d04a4a5c9d86d95f11bea7a100a9dd12ad 100644
--- a/doc/make_constants.py
+++ b/doc/make_constants.py
@@ -203,6 +203,10 @@ const_ext_lookup = {
             "PIPE_SUPPORT": cl_30,
             },
 
+        cl.device_topology_type_amd: {
+            "PCIE": amd_devattr,
+            },
+
         cl.mem_object_type: {
             "IMAGE2D_ARRAY": cl_12,
             "IMAGE1D": cl_12,
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 354b942bf90071c4b0a79310fd3724245cf24266..1fa14a7bba43b5d29b6a7e8c04178da40e24627e 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -51,13 +51,14 @@ import sys
 
 _PYPY = "__pypy__" in sys.builtin_module_names
 
-from pyopencl._cl import (  # noqa
+from pyopencl._cl import (  # noqa: F401
         get_cl_header_version,
         program_kind,
         status_code,
         platform_info,
         device_type,
         device_info,
+        device_topology_type_amd,
         device_fp_config,
         device_mem_cache_type,
         device_local_mem_type,
@@ -153,9 +154,13 @@ from pyopencl._cl import (  # noqa
 
         Image,
         Sampler,
-        DeviceTopologyAmd,
         )
 
+try:
+    from pyopencl._cl import DeviceTopologyAmd  # noqa: F401
+except ImportError:
+    pass
+
 if not _PYPY:
     # FIXME: Add back to default set when pypy support catches up
     from pyopencl._cl import (  # noqa
@@ -1121,10 +1126,8 @@ def _add_functionality():
         """
         svmallocation_old_init(self, ctx, size, alignment, flags)
 
-        read_write = (
-                flags & mem_flags.WRITE_ONLY != 0
-                or flags & mem_flags.READ_WRITE != 0)
-
+        # mem_flags.READ_ONLY applies to kernels, not the host
+        read_write = True
         _interface["data"] = (
                 int(self._ptr_as_int()), not read_write)
 
diff --git a/pyopencl/array.py b/pyopencl/array.py
index d3d353d1bca6b70bc6e42127f8ab0da770ded2f4..bf58c965c69d69820a405de7a90eb7cd1e74bc87 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -50,6 +50,25 @@ def _get_common_dtype(obj1, obj2, queue):
                                   has_double_support(queue.device))
 
 
+def _get_truedivide_dtype(obj1, obj2, queue):
+    # the dtype of the division result obj1 / obj2
+
+    allow_double = has_double_support(queue.device)
+
+    x1 = obj1 if np.isscalar(obj1) else np.ones(1, obj1.dtype)
+    x2 = obj2 if np.isscalar(obj2) else np.ones(1, obj2.dtype)
+
+    result = (x1/x2).dtype
+
+    if not allow_double:
+        if result == np.float64:
+            result = np.dtype(np.float32)
+        elif result == np.complex128:
+            result = np.dtype(np.complex64)
+
+    return result
+
+
 # Work around PyPy not currently supporting the object dtype.
 # (Yes, it doesn't even support checking!)
 # (as of May 27, 2014 on PyPy 2.3)
@@ -63,6 +82,10 @@ except Exception:
         return False
 
 
+class InconsistentOpenCLQueueWarning(UserWarning):
+    pass
+
+
 class VecLookupWarner:
     def __getattr__(self, name):
         from warnings import warn
@@ -125,7 +148,11 @@ def elwise_kernel_runner(kernel_getter):
 
     def kernel_runner(*args, **kwargs):
         repr_ary = args[0]
-        queue = kwargs.pop("queue", None) or repr_ary.queue
+        queue = kwargs.pop("queue", None)
+        implicit_queue = queue is None
+        if implicit_queue:
+            queue = repr_ary.queue
+
         wait_for = kwargs.pop("wait_for", None)
 
         # wait_for must be a copy, because we modify it in-place below
@@ -152,6 +179,16 @@ def elwise_kernel_runner(kernel_getter):
                 actual_args.append(arg.base_data)
                 actual_args.append(arg.offset)
                 wait_for.extend(arg.events)
+
+                if (implicit_queue
+                        and arg.queue is not None
+                        and arg.queue != queue):
+                    from warnings import warn
+
+                    warn("Implicit queue in elementwise operation does not match "
+                            "queue of a provided argument. This will become an "
+                            "error in 2021.",
+                            type=InconsistentOpenCLQueueWarning)
             else:
                 actual_args.append(arg)
         actual_args.append(repr_ary.size)
@@ -989,7 +1026,7 @@ class Array:
             result.add_event(
                     self._axpbyz(result,
                         self.dtype.type(1), self,
-                        other.dtype.type(-1), other))
+                        result.dtype.type(-1), other))
 
             return result
         else:
@@ -1012,7 +1049,7 @@ class Array:
         # other must be a scalar
         result = self._new_like_me(common_dtype)
         result.add_event(
-                self._axpbz(result, self.dtype.type(-1), self,
+                self._axpbz(result, result.dtype.type(-1), self,
                     common_dtype.type(other)))
         return result
 
@@ -1080,20 +1117,20 @@ class Array:
     def __div__(self, other):
         """Divides an array by an array or a scalar, i.e. ``self / other``.
         """
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
         if isinstance(other, Array):
-            result = self._new_like_me(
-                    _get_common_dtype(self, other, self.queue))
+            result = self._new_like_me(common_dtype)
             result.add_event(self._div(result, self, other))
         else:
             if other == 1:
                 return self.copy()
             else:
                 # create a new array for the result
-                common_dtype = _get_common_dtype(self, other, self.queue)
                 result = self._new_like_me(common_dtype)
                 result.add_event(
                         self._axpbz(result,
-                            common_dtype.type(1/other), self, self.dtype.type(0)))
+                                    np.true_divide(common_dtype.type(1), other),
+                                    self, self.dtype.type(0)))
 
         return result
 
@@ -1102,14 +1139,13 @@ class Array:
     def __rdiv__(self, other):
         """Divides an array by a scalar or an array, i.e. ``other / self``.
         """
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
 
         if isinstance(other, Array):
-            result = self._new_like_me(
-                    _get_common_dtype(self, other, self.queue))
+            result = self._new_like_me(common_dtype)
             result.add_event(other._div(result, self))
         else:
             # create a new array for the result
-            common_dtype = _get_common_dtype(self, other, self.queue)
             result = self._new_like_me(common_dtype)
             result.add_event(
                     self._rdiv_scalar(result, self, common_dtype.type(other)))
@@ -1118,6 +1154,26 @@ class Array:
 
     __rtruediv__ = __rdiv__
 
+    def __itruediv__(self, other):
+        # raise an error if the result cannot be cast to self
+        common_dtype = _get_truedivide_dtype(self, other, self.queue)
+        if not np.can_cast(common_dtype, self.dtype.type):
+            raise TypeError("Cannot cast {!r} to {!r}"
+                            .format(self.dtype, common_dtype))
+
+        if isinstance(other, Array):
+            self.add_event(
+                self._div(self, self, other))
+        else:
+            if other == 1:
+                return self
+            else:
+                self.add_event(
+                    self._axpbz(self, common_dtype.type(np.true_divide(1, other)),
+                                self, self.dtype.type(0)))
+
+        return self
+
     def __and__(self, other):
         common_dtype = _get_common_dtype(self, other, self.queue)
 
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 357aa2bbf17477713905d040376ec199a518f877..df364eda3c883d378c1e9d25136d8f59f5763f9d 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -503,36 +503,36 @@ def real_dtype(dtype):
 
 @context_dependent_memoize
 def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
-    ax = "a*x[i]"
-    by = "b*y[i]"
+    result_t = dtype_to_ctype(dtype_z)
 
     x_is_complex = dtype_x.kind == "c"
     y_is_complex = dtype_y.kind == "c"
 
-    if x_is_complex:
-        ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x)
-
-    if y_is_complex:
-        by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y)
+    if dtype_z.kind == "c":
+        # a and b will always be complex here.
+        z_ct = complex_dtype_to_name(dtype_z)
 
-    if x_is_complex and not y_is_complex:
-        by = "{}_fromreal({})".format(complex_dtype_to_name(dtype_x), by)
+        if x_is_complex:
+            ax = f"{z_ct}_mul(a, {z_ct}_cast(x[i]))"
+        else:
+            ax = f"{z_ct}_mulr(a, x[i])"
 
-    if not x_is_complex and y_is_complex:
-        ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_y), ax)
+        if y_is_complex:
+            by = f"{z_ct}_mul(b, {z_ct}_cast(y[i]))"
+        else:
+            by = f"{z_ct}_mulr(b, y[i])"
 
-    if x_is_complex or y_is_complex:
-        result = (
-                "{root}_add({root}_cast({ax}), {root}_cast({by}))"
-                .format(
-                    ax=ax,
-                    by=by,
-                    root=complex_dtype_to_name(dtype_z)))
+        result = f"{z_ct}_add({ax}, {by})"
     else:
+        # real-only
+
+        ax = f"a*(({result_t}) x[i])"
+        by = f"b*(({result_t}) y[i])"
+
         result = f"{ax} + {by}"
 
     return get_elwise_kernel(context,
-            "{tp_z} *z, {tp_x} a, {tp_x} *x, {tp_y} b, {tp_y} *y".format(
+            "{tp_z} *z, {tp_z} a, {tp_x} *x, {tp_z} b, {tp_y} *y".format(
                 tp_x=dtype_to_ctype(dtype_x),
                 tp_y=dtype_to_ctype(dtype_y),
                 tp_z=dtype_to_ctype(dtype_z),
diff --git a/pyopencl/version.py b/pyopencl/version.py
index 0668cdc6768acd4654d85574dc4fd69788891456..4d2090dc6f60cab9113cb2798d7bf57991284252 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2020, 2, 1)
+VERSION = (2020, 2, 2)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..f2a2f6894081711b89214e24c18a5104f99db607
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers=
+        bitonic: tests involving bitonic sort
diff --git a/src/clinfo_ext.h b/src/clinfo_ext.h
index 43b7b6082fda28ad433f26c5d9a5e2e743e24940..dc4d6a8e6b9dd42b58b1fdf5dafeadd5fb2a4671 100644
--- a/src/clinfo_ext.h
+++ b/src/clinfo_ext.h
@@ -71,6 +71,10 @@
 #define CL_DEVICE_GFXIP_MAJOR_AMD			0x404A
 #define CL_DEVICE_GFXIP_MINOR_AMD			0x404B
 #define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD		0x404C
+#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD         0x4030
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD               0x4031
+#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD    0x4033
+#define CL_DEVICE_PCIE_ID_AMD                           0x4034
 
 #ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
 #define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD		1
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 1c7482d78b1bc68b3dedc2add7367bf277e5111b..42ee4c11cdea48dfc11043eeb09a68f631d20ada 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -364,7 +364,7 @@
 
 
 
-#define PYOPENCL_GET_INTEGRAL_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \
+#define PYOPENCL_GET_TYPED_INFO(WHAT, FIRST_ARG, SECOND_ARG, TYPE) \
   { \
     TYPE param_value; \
     PYOPENCL_CALL_GUARDED(clGet##WHAT##Info, \
@@ -669,7 +669,7 @@ namespace pyopencl
       py::object get_info(cl_device_info param_name) const
       {
 #define DEV_GET_INT_INF(TYPE) \
-        PYOPENCL_GET_INTEGRAL_INFO(Device, m_device, param_name, TYPE);
+        PYOPENCL_GET_TYPED_INFO(Device, m_device, param_name, TYPE);
 
         switch (param_name)
         {
@@ -836,15 +836,15 @@ namespace pyopencl
 // {{{ AMD dev attrs cl_amd_device_attribute_query
 //
 // types of AMD dev attrs divined from
-// https://www.khronos.org/registry/cl/api/1.2/cl.hpp
+// https://github.com/KhronosGroup/OpenCL-CLHPP/blob/3b03738fef487378b188d21cc5f2bae276aa8721/include/CL/opencl.hpp#L1471-L1500
 #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
           case CL_DEVICE_PROFILING_TIMER_OFFSET_AMD: DEV_GET_INT_INF(cl_ulong);
 #endif
-/* FIXME
 #ifdef CL_DEVICE_TOPOLOGY_AMD
           case CL_DEVICE_TOPOLOGY_AMD:
+            PYOPENCL_GET_TYPED_INFO(
+                Device, m_device, param_name, cl_device_topology_amd);
 #endif
-*/
 #ifdef CL_DEVICE_BOARD_NAME_AMD
           case CL_DEVICE_BOARD_NAME_AMD: ;
             PYOPENCL_GET_STR_INFO(Device, m_device, param_name);
@@ -875,6 +875,17 @@ namespace pyopencl
 #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
           case CL_DEVICE_LOCAL_MEM_BANKS_AMD: DEV_GET_INT_INF(cl_uint);
 #endif
+// FIXME: MISSING:
+//
+// CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
+// CL_DEVICE_GFXIP_MAJOR_AMD
+// CL_DEVICE_GFXIP_MINOR_AMD
+// CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
+// CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD
+// CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD
+// CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD
+// CL_DEVICE_PCIE_ID_AMD
+
 // }}}
 
 #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
@@ -1037,7 +1048,7 @@ namespace pyopencl
         switch (param_name)
         {
           case CL_CONTEXT_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(
+            PYOPENCL_GET_TYPED_INFO(
                 Context, m_context, param_name, cl_uint);
 
           case CL_CONTEXT_DEVICES:
@@ -1101,7 +1112,7 @@ namespace pyopencl
 
 #if PYOPENCL_CL_VERSION >= 0x1010
           case CL_CONTEXT_NUM_DEVICES:
-            PYOPENCL_GET_INTEGRAL_INFO(
+            PYOPENCL_GET_TYPED_INFO(
                 Context, m_context, param_name, cl_uint);
 #endif
 
@@ -1438,10 +1449,10 @@ namespace pyopencl
             PYOPENCL_GET_OPAQUE_INFO(CommandQueue, m_queue, param_name,
                 cl_device_id, device);
           case CL_QUEUE_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+            PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name,
                 cl_uint);
           case CL_QUEUE_PROPERTIES:
-            PYOPENCL_GET_INTEGRAL_INFO(CommandQueue, m_queue, param_name,
+            PYOPENCL_GET_TYPED_INFO(CommandQueue, m_queue, param_name,
                 cl_command_queue_properties);
 
           default:
@@ -1517,13 +1528,13 @@ namespace pyopencl
             PYOPENCL_GET_OPAQUE_INFO(Event, m_event, param_name,
                 cl_command_queue, command_queue);
           case CL_EVENT_COMMAND_TYPE:
-            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                 cl_command_type);
           case CL_EVENT_COMMAND_EXECUTION_STATUS:
-            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                 cl_int);
           case CL_EVENT_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(Event, m_event, param_name,
+            PYOPENCL_GET_TYPED_INFO(Event, m_event, param_name,
                 cl_uint);
 #if PYOPENCL_CL_VERSION >= 0x1010
           case CL_EVENT_CONTEXT:
@@ -1547,7 +1558,7 @@ namespace pyopencl
 #if PYOPENCL_CL_VERSION >= 0x2000
           case CL_PROFILING_COMMAND_COMPLETE:
 #endif
-            PYOPENCL_GET_INTEGRAL_INFO(EventProfiling, m_event, param_name,
+            PYOPENCL_GET_TYPED_INFO(EventProfiling, m_event, param_name,
                 cl_ulong);
           default:
             throw error("Event.get_profiling_info", CL_INVALID_VALUE);
@@ -2462,7 +2473,7 @@ namespace pyopencl
         switch (param_name)
         {
           case CL_IMAGE_FORMAT:
-            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name,
+            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name,
                 cl_image_format);
           case CL_IMAGE_ELEMENT_SIZE:
           case CL_IMAGE_ROW_PITCH:
@@ -2473,7 +2484,7 @@ namespace pyopencl
 #if PYOPENCL_CL_VERSION >= 0x1020
           case CL_IMAGE_ARRAY_SIZE:
 #endif
-            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, size_t);
+            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, size_t);
 
 #if PYOPENCL_CL_VERSION >= 0x1020
           case CL_IMAGE_BUFFER:
@@ -2492,7 +2503,7 @@ namespace pyopencl
 
           case CL_IMAGE_NUM_MIP_LEVELS:
           case CL_IMAGE_NUM_SAMPLES:
-            PYOPENCL_GET_INTEGRAL_INFO(Image, data(), param_name, cl_uint);
+            PYOPENCL_GET_TYPED_INFO(Image, data(), param_name, cl_uint);
 #endif
 
           default:
@@ -3566,19 +3577,19 @@ namespace pyopencl
         switch (param_name)
         {
           case CL_SAMPLER_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                 cl_uint);
           case CL_SAMPLER_CONTEXT:
             PYOPENCL_GET_OPAQUE_INFO(Sampler, m_sampler, param_name,
                 cl_context, context);
           case CL_SAMPLER_ADDRESSING_MODE:
-            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                 cl_addressing_mode);
           case CL_SAMPLER_FILTER_MODE:
-            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                 cl_filter_mode);
           case CL_SAMPLER_NORMALIZED_COORDS:
-            PYOPENCL_GET_INTEGRAL_INFO(Sampler, m_sampler, param_name,
+            PYOPENCL_GET_TYPED_INFO(Sampler, m_sampler, param_name,
                 cl_bool);
 
           default:
@@ -3631,13 +3642,13 @@ namespace pyopencl
         switch (param_name)
         {
           case CL_PROGRAM_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                 cl_uint);
           case CL_PROGRAM_CONTEXT:
             PYOPENCL_GET_OPAQUE_INFO(Program, m_program, param_name,
                 cl_context, context);
           case CL_PROGRAM_NUM_DEVICES:
-            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                 cl_uint);
           case CL_PROGRAM_DEVICES:
             {
@@ -3703,7 +3714,7 @@ namespace pyopencl
             // }}}
 #if PYOPENCL_CL_VERSION >= 0x1020
           case CL_PROGRAM_NUM_KERNELS:
-            PYOPENCL_GET_INTEGRAL_INFO(Program, m_program, param_name,
+            PYOPENCL_GET_TYPED_INFO(Program, m_program, param_name,
                 size_t);
           case CL_PROGRAM_KERNEL_NAMES:
             PYOPENCL_GET_STR_INFO(Program, m_program, param_name);
@@ -3722,7 +3733,7 @@ namespace pyopencl
         {
 #define PYOPENCL_FIRST_ARG m_program, dev.data() // hackety hack
           case CL_PROGRAM_BUILD_STATUS:
-            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_build_status);
           case CL_PROGRAM_BUILD_OPTIONS:
@@ -3731,13 +3742,13 @@ namespace pyopencl
                 PYOPENCL_FIRST_ARG, param_name);
 #if PYOPENCL_CL_VERSION >= 0x1020
           case CL_PROGRAM_BINARY_TYPE:
-            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_program_binary_type);
 #endif
 #if PYOPENCL_CL_VERSION >= 0x2000
           case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
-            PYOPENCL_GET_INTEGRAL_INFO(ProgramBuild,
+            PYOPENCL_GET_TYPED_INFO(ProgramBuild,
                 PYOPENCL_FIRST_ARG, param_name,
                 size_t);
 #endif
@@ -4189,7 +4200,7 @@ namespace pyopencl
             PYOPENCL_GET_STR_INFO(Kernel, m_kernel, param_name);
           case CL_KERNEL_NUM_ARGS:
           case CL_KERNEL_REFERENCE_COUNT:
-            PYOPENCL_GET_INTEGRAL_INFO(Kernel, m_kernel, param_name,
+            PYOPENCL_GET_TYPED_INFO(Kernel, m_kernel, param_name,
                 cl_uint);
           case CL_KERNEL_CONTEXT:
             PYOPENCL_GET_OPAQUE_INFO(Kernel, m_kernel, param_name,
@@ -4215,7 +4226,7 @@ namespace pyopencl
         {
 #define PYOPENCL_FIRST_ARG m_kernel, dev.data() // hackety hack
           case CL_KERNEL_WORK_GROUP_SIZE:
-            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                 PYOPENCL_FIRST_ARG, param_name,
                 size_t);
           case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
@@ -4230,13 +4241,13 @@ namespace pyopencl
 #if PYOPENCL_CL_VERSION >= 0x1010
           case CL_KERNEL_PRIVATE_MEM_SIZE:
 #endif
-            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_ulong);
 
 #if PYOPENCL_CL_VERSION >= 0x1010
           case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-            PYOPENCL_GET_INTEGRAL_INFO(KernelWorkGroup,
+            PYOPENCL_GET_TYPED_INFO(KernelWorkGroup,
                 PYOPENCL_FIRST_ARG, param_name,
                 size_t);
 #endif
@@ -4256,12 +4267,12 @@ namespace pyopencl
         {
 #define PYOPENCL_FIRST_ARG m_kernel, arg_index // hackety hack
           case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
-            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+            PYOPENCL_GET_TYPED_INFO(KernelArg,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_kernel_arg_address_qualifier);
 
           case CL_KERNEL_ARG_ACCESS_QUALIFIER:
-            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+            PYOPENCL_GET_TYPED_INFO(KernelArg,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_kernel_arg_access_qualifier);
 
@@ -4270,7 +4281,7 @@ namespace pyopencl
             PYOPENCL_GET_STR_INFO(KernelArg, PYOPENCL_FIRST_ARG, param_name);
 
           case CL_KERNEL_ARG_TYPE_QUALIFIER:
-            PYOPENCL_GET_INTEGRAL_INFO(KernelArg,
+            PYOPENCL_GET_TYPED_INFO(KernelArg,
                 PYOPENCL_FIRST_ARG, param_name,
                 cl_kernel_arg_type_qualifier);
 #undef PYOPENCL_FIRST_ARG
@@ -4476,9 +4487,9 @@ namespace pyopencl
         switch (param_name)
         {
           case CL_GL_TEXTURE_TARGET:
-            PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLenum);
+            PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLenum);
           case CL_GL_MIPMAP_LEVEL:
-            PYOPENCL_GET_INTEGRAL_INFO(GLTexture, data(), param_name, GLint);
+            PYOPENCL_GET_TYPED_INFO(GLTexture, data(), param_name, GLint);
 
           default:
             throw error("MemoryObject.get_gl_texture_info", CL_INVALID_VALUE);
@@ -4729,22 +4740,22 @@ namespace pyopencl
     switch (param_name)
     {
       case CL_MEM_TYPE:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             cl_mem_object_type);
       case CL_MEM_FLAGS:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             cl_mem_flags);
       case CL_MEM_SIZE:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             size_t);
       case CL_MEM_HOST_PTR:
         throw pyopencl::error("MemoryObject.get_info", CL_INVALID_VALUE,
             "Use MemoryObject.get_host_array to get host pointer.");
       case CL_MEM_MAP_COUNT:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             cl_uint);
       case CL_MEM_REFERENCE_COUNT:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             cl_uint);
       case CL_MEM_CONTEXT:
         PYOPENCL_GET_OPAQUE_INFO(MemObject, data(), param_name,
@@ -4765,7 +4776,7 @@ namespace pyopencl
           return create_mem_object_wrapper(param_value);
         }
       case CL_MEM_OFFSET:
-        PYOPENCL_GET_INTEGRAL_INFO(MemObject, data(), param_name,
+        PYOPENCL_GET_TYPED_INFO(MemObject, data(), param_name,
             size_t);
 #endif
 
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index cbd1f9a40f85ee0c71be5adfe38bc8cd1cd20e50..175b5aa5932a16ac0528bb7656778f179b490f6f 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -552,41 +552,6 @@ void pyopencl_expose_part_2(py::module &m)
 #endif
   // }}}
 
-  // {{{ CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
-
-  {
-    typedef cl_device_topology_amd cls;
-    py::class_<cls>(m, "DeviceTopologyAmd")
-      .def(py::init(
-            [](cl_char bus, cl_char device, cl_char function)
-            {
-              cl_device_topology_amd result;
-              result.pcie.bus = bus;
-              result.pcie.device = device;
-              result.pcie.function = function;
-              return result;
-            }),
-          py::arg("bus")=0,
-          py::arg("device")=0,
-          py::arg("function")=0)
-
-      .def_property("type",
-          [](cls &t) { return t.pcie.type; },
-          [](cls &t, cl_uint val) { t.pcie.type = val; })
-
-      .def_property("bus",
-          [](cls &t) { return t.pcie.bus; },
-          [](cls &t, cl_char val) { t.pcie.bus = val; })
-      .def_property("device",
-          [](cls &t) { return t.pcie.device; },
-          [](cls &t, cl_char val) { t.pcie.device = val; })
-      .def_property("function",
-          [](cls &t) { return t.pcie.function; },
-          [](cls &t, cl_char val) { t.pcie.function = val; })
-      ;
-  }
-
-  // }}}
 }
 
 
diff --git a/src/wrap_constants.cpp b/src/wrap_constants.cpp
index 6fc5658fa24894acd182328825e49f61dce3b26c..5f3e4324e9bd84039d7f5a4e668f2cbf02ee9510 100644
--- a/src/wrap_constants.cpp
+++ b/src/wrap_constants.cpp
@@ -40,6 +40,7 @@ namespace
   class platform_info { };
   class device_type { };
   class device_info { };
+  class device_topology_type_amd { };
   class device_fp_config { };
   class device_mem_cache_type { };
   class device_local_mem_type { };
@@ -370,6 +371,7 @@ void pyopencl_expose_constants(py::module &m)
     ADD_ATTR(DEVICE_, PCI_SLOT_ID_NV);
 #endif
 #endif
+
 // {{{ cl_amd_device_attribute_query
 #ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
     ADD_ATTR(DEVICE_, PROFILING_TIMER_OFFSET_AMD);
@@ -410,7 +412,6 @@ void pyopencl_expose_constants(py::module &m)
 #ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
     ADD_ATTR(DEVICE_, LOCAL_MEM_BANKS_AMD);
 #endif
-// }}}
 #ifdef CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD
     ADD_ATTR(DEVICE_, THREAD_TRACE_SUPPORTED_AMD);
 #endif
@@ -423,6 +424,19 @@ void pyopencl_expose_constants(py::module &m)
 #ifdef CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD
     ADD_ATTR(DEVICE_, AVAILABLE_ASYNC_QUEUES_AMD);
 #endif
+#ifdef CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD
+    ADD_ATTR(DEVICE_, PREFERRED_WORK_GROUP_SIZE_AMD);
+#endif
+#ifdef CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD
+    ADD_ATTR(DEVICE_, MAX_WORK_GROUP_SIZE_AMD);
+#endif
+#ifdef CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD
+    ADD_ATTR(DEVICE_, PREFERRED_CONSTANT_BUFFER_SIZE_AMD);
+#endif
+#ifdef CL_DEVICE_PCIE_ID_AMD
+    ADD_ATTR(DEVICE_, PCIE_ID_AMD);
+#endif
+// }}}
 
 #ifdef CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT
     ADD_ATTR(DEVICE_, MAX_ATOMIC_COUNTERS_EXT);
@@ -515,6 +529,13 @@ void pyopencl_expose_constants(py::module &m)
 #endif
   }
 
+  {
+    py::class_<device_topology_type_amd> cls(m, "device_topology_type_amd");
+#ifdef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
+    cls.attr("PCIE") = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
+#endif
+  }
+
   {
     py::class_<device_fp_config> cls(m, "device_fp_config");
     ADD_ATTR(FP_, DENORM);
@@ -1105,6 +1126,45 @@ void pyopencl_expose_constants(py::module &m)
 #endif
 
   // }}}
+
+  // {{{ CL_DEVICE_TOPOLOGY_AMD
+
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+  {
+    typedef cl_device_topology_amd cls;
+    py::class_<cls>(m, "DeviceTopologyAmd")
+      .def(py::init(
+            [](cl_char bus, cl_char device, cl_char function)
+            {
+              cl_device_topology_amd result;
+              result.pcie.bus = bus;
+              result.pcie.device = device;
+              result.pcie.function = function;
+              return result;
+            }),
+          py::arg("bus")=0,
+          py::arg("device")=0,
+          py::arg("function")=0)
+
+      .def_property("type",
+          [](cls &t) { return t.pcie.type; },
+          [](cls &t, cl_uint val) { t.pcie.type = val; })
+
+      .def_property("bus",
+          [](cls &t) { return t.pcie.bus; },
+          [](cls &t, cl_char val) { t.pcie.bus = val; })
+      .def_property("device",
+          [](cls &t) { return t.pcie.device; },
+          [](cls &t, cl_char val) { t.pcie.device = val; })
+      .def_property("function",
+          [](cls &t) { return t.pcie.function; },
+          [](cls &t, cl_char val) { t.pcie.function = val; })
+      ;
+  }
+#endif
+
+  // }}}
+
 }
 
 
diff --git a/test/test_array.py b/test/test_array.py
index 2cbef16c0d8bbd168ccb9a371f1904cb8cfc022a..d17772375a64f9d236568bef72005637e95d181a 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -426,12 +426,20 @@ def test_addition_scalar(ctx_factory):
     assert (7 + a == a_added).all()
 
 
-def test_substract_array(ctx_factory):
+@pytest.mark.parametrize(("dtype_a", "dtype_b"),
+        [
+            (np.float32, np.float32),
+            (np.float32, np.int32),
+            (np.int32, np.int32),
+            (np.int64, np.int32),
+            (np.int64, np.uint32),
+            ])
+def test_subtract_array(ctx_factory, dtype_a, dtype_b):
     """Test the substraction of two arrays."""
     #test data
-    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(dtype_a)
     b = np.array([10, 20, 30, 40, 50,
-                  60, 70, 80, 90, 100]).astype(np.float32)
+                  60, 70, 80, 90, 100]).astype(dtype_b)
 
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -471,14 +479,29 @@ def test_divide_scalar(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
-    a_gpu = cl_array.to_device(queue, a)
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
 
-    result = (a_gpu / 2).get()
-    assert (a / 2 == result).all()
+    from itertools import product
 
-    result = (2 / a_gpu).get()
-    assert (np.abs(2 / a - result) < 1e-5).all()
+    for dtype_a, dtype_s in product(dtypes, repeat=2):
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        s = dtype_s(40)
+        a_gpu = cl_array.to_device(queue, a)
+
+        b = a / s
+        b_gpu = a_gpu / s
+        assert (np.abs(b_gpu.get() - b) < 1e-3).all()
+        assert b_gpu.dtype is b.dtype
+
+        c = s / a
+        c_gpu = s / a_gpu
+        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
+        assert c_gpu.dtype is c.dtype
 
 
 def test_divide_array(ctx_factory):
@@ -487,18 +510,100 @@ def test_divide_array(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    #test data
-    a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
-    b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
+    dtypes = (np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
 
-    a_gpu = cl_array.to_device(queue, a)
-    b_gpu = cl_array.to_device(queue, b)
+    from itertools import product
+
+    for dtype_a, dtype_b in product(dtypes, repeat=2):
+
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)
+
+        a_gpu = cl_array.to_device(queue, a)
+        b_gpu = cl_array.to_device(queue, b)
+        c = a / b
+        c_gpu = (a_gpu / b_gpu)
+        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
+        assert c_gpu.dtype is c.dtype
+
+        d = b / a
+        d_gpu = (b_gpu / a_gpu)
+        assert (np.abs(d_gpu.get() - d) < 1e-3).all()
+        assert d_gpu.dtype is d.dtype
+
+
+def test_divide_inplace_scalar(ctx_factory):
+    """Test inplace division of arrays and a scalar."""
 
-    a_divide = (a_gpu / b_gpu).get()
-    assert (np.abs(a / b - a_divide) < 1e-3).all()
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
+
+    from itertools import product
+
+    for dtype_a, dtype_s in product(dtypes, repeat=2):
+
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        s = dtype_s(40)
+        a_gpu = cl_array.to_device(queue, a)
+
+        # ensure the same behavior as inplace numpy.ndarray division
+        try:
+            a /= s
+        except TypeError:
+            with np.testing.assert_raises(TypeError):
+                a_gpu /= s
+        else:
+            a_gpu /= s
+            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
+            assert a_gpu.dtype is a.dtype
+
+
+def test_divide_inplace_array(ctx_factory):
+    """Test inplace division of arrays."""
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtypes = (np.uint8, np.uint16, np.uint32,
+                  np.int8, np.int16, np.int32,
+                  np.float32, np.complex64)
+    from pyopencl.characterize import has_double_support
+    if has_double_support(queue.device):
+        dtypes = dtypes + (np.float64, np.complex128)
+
+    from itertools import product
 
-    a_divide = (b_gpu / a_gpu).get()
-    assert (np.abs(b / a - a_divide) < 1e-3).all()
+    for dtype_a, dtype_b in product(dtypes, repeat=2):
+        print(dtype_a, dtype_b)
+        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
+        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)
+
+        a_gpu = cl_array.to_device(queue, a)
+        b_gpu = cl_array.to_device(queue, b)
+
+        # ensure the same behavior as inplace numpy.ndarray division
+        try:
+            a_gpu /= b_gpu
+        except TypeError:
+            # pass for now, as numpy casts differently for in-place and out-place
+            # true_divide
+            pass
+            # with np.testing.assert_raises(TypeError):
+            #     a /= b
+        else:
+            a /= b
+            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
+            assert a_gpu.dtype is a.dtype
 
 
 def test_bitwise(ctx_factory):
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index 45e2d7476bf2939bcdee66c487c412667183e831..8aad416e75b9e3e346946cb2b5c5b40836c88ee4 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -1030,6 +1030,30 @@ def test_coarse_grain_svm(ctx_factory):
         cl.enqueue_copy(queue, new_ary, svm_ary)
         assert np.array_equal(orig_ary*2, new_ary)
 
+    # {{{ https://github.com/inducer/pyopencl/issues/372
+
+    buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32)
+    out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32)
+
+    svm_buf_arr = cl.SVM(buf_arr)
+    svm_out_arr = cl.SVM(out_arr)
+    with svm_buf_arr.map_rw(queue) as ary:
+        ary.fill(17)
+
+    prg_ro = cl.Program(ctx, r"""
+        __kernel void twice_ro(__global int *out_g, __global int *in_g)
+        {
+          out_g[get_global_id(0)] = 2*in_g[get_global_id(0)];
+        }
+        """).build()
+
+    prg_ro.twice_ro(queue, buf_arr.shape, None, svm_out_arr, svm_buf_arr)
+
+    with svm_out_arr.map_ro(queue) as ary:
+        print(ary)
+
+    # }}}
+
 
 def test_fine_grain_svm(ctx_factory):
     import sys