diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 48db98503e76d4144e08395df1bfd90f2a8490f2..3ee1a5ccb67aa2e931ebaa5834d09b02e71971ea 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,6 +25,7 @@ Python 3.5 Intel CPU:
   - intel-cl-cpu
   except:
   - tags
+
 Python 3.5 AMD CPU:
   script:
   - export PY_EXE=python3.5
@@ -37,6 +38,7 @@ Python 3.5 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+
 Python 2.6 AMD CPU:
   script:
   - export PY_EXE=python2.6
@@ -89,6 +91,19 @@ Python 3.5 AMD GPU:
   except:
   - tags
 
+Python 3.6 POCL:
+  script:
+  - export PY_EXE=python3.6
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="numpy mako"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3.6
+  - pocl
+  except:
+  - tags
+
 Python 3.5 POCL CL 1.1:
   script:
   - export PY_EXE=python3.5
@@ -121,6 +136,7 @@ Python 2.7 Apple:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=app:cpu
   - export EXTRA_INSTALL="numpy mako"
+  - export PKG_CONFIG_PATH=/usr/local/opt/libffi/lib/pkgconfig
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -134,6 +150,7 @@ PyPy POCL:
   - export PY_EXE=pypy
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
+  - export NO_DOCTESTS=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 3ad9c53edc6c81bc7b6f0b804930f8cf60a73721..954fabe4b803647694dae7f3737b487bc2bd0aa0 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -225,7 +225,7 @@ Simple / Legacy Interface
 
     .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None)
 
-.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
+.. class:: InclusiveScanKernel(ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
 
     Works like :class:`ExclusiveScanKernel`.
 
diff --git a/doc/array.rst b/doc/array.rst
index 6f35c0461023480bd299d5f828ad91d36214a5c0..8c83e6663f007560a744953e0439a875981e56ea 100644
--- a/doc/array.rst
+++ b/doc/array.rst
@@ -101,7 +101,7 @@ functions defined on them such as `cfloat_mul(a, b)` or `cdouble_log(z)`.
 Elementwise kernels automatically include the header if your kernel has
 complex input or output.
 See the `source file
-<https://github.com/pyopencl/pyopencl/blob/master/src/cl/pyopencl-complex.h>`_
+<https://github.com/pyopencl/pyopencl/blob/master/pyopencl/cl/pyopencl-complex.h>`_
 for a precise list of what's available.
 
 If you need double precision support, please::
diff --git a/doc/howto.rst b/doc/howto.rst
index 5ea67f7df91ae40a44e4a4b8fe78957512aa8b4e..92244c43a69ef61ba3b33a581bc4a696ed9e7705 100644
--- a/doc/howto.rst
+++ b/doc/howto.rst
@@ -65,10 +65,10 @@ the device:
     >>> ary_host["field2"].fill(1000)
     >>> ary_host[13]["field2"] = 12
     >>> print(ary_host)
-    [(217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0)
-     (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0)
-     (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 12.0) (217, 1000.0)
-     (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0)]
+    [(217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.)
+     (217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.)
+     (217,  1000.) (217,  1000.) (217,  1000.) (217,    12.) (217,  1000.)
+     (217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.) (217,  1000.)]
 
     >>> ary = cl.array.to_device(queue, ary_host)
 
@@ -85,10 +85,10 @@ We can then operate on the array with our own kernels:
 
     >>> evt = prg.set_to_1(queue, ary.shape, None, ary.data)
     >>> print(ary)
-    [(1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0)
-     (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0)
-     (1, 1000.0) (1, 12.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0)
-     (1, 1000.0) (1, 1000.0)]
+    [(1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.)
+     (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.)
+     (1,  1000.) (1,    12.) (1,  1000.) (1,  1000.) (1,  1000.) (1,  1000.)
+     (1,  1000.) (1,  1000.)]
 
 as well as with PyOpenCL's built-in operations:
 
@@ -99,7 +99,7 @@ as well as with PyOpenCL's built-in operations:
     ...    preamble=my_struct_c_decl)
     >>> evt = elwise(ary)
     >>> print(ary)
-    [(2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0)
-     (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0)
-     (2, 1000.0) (2, 12.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0)
-     (2, 1000.0) (2, 1000.0)]
+    [(2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.)
+     (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.)
+     (2,  1000.) (2,    12.) (2,  1000.) (2,  1000.) (2,  1000.) (2,  1000.)
+     (2,  1000.) (2,  1000.)]
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index afb48d619460943bb2a5245a27298f13da091cf2..a8a694cd3156c931cdfbc094e070914ac0ade510 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -218,7 +218,9 @@ _DEFAULT_BUILD_OPTIONS = []
 _DEFAULT_INCLUDE_OPTIONS = ["-I", _find_pyopencl_include_path()]
 
 # map of platform.name to build options list
-_PLAT_BUILD_OPTIONS = {}
+_PLAT_BUILD_OPTIONS = {
+        "Oclgrind": ["-D", "PYOPENCL_USING_OCLGRIND"],
+        }
 
 
 def enable_debugging(platform_or_context):
diff --git a/pyopencl/array.py b/pyopencl/array.py
index ac32eede8d277cd1a5d00c553027eda8e23ecdde..279705c25fffb3447b7757069dfe5a847bacb34a 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1189,6 +1189,8 @@ class Array(object):
             raise ValueError("The truth value of an array with "
                     "more than one element is ambiguous. Use a.any() or a.all()")
 
+    __bool__ = __nonzero__
+
     def any(self, queue=None, wait_for=None):
         from pyopencl.reduction import get_any_kernel
         krnl = get_any_kernel(self.context, self.dtype)
diff --git a/pyopencl/cache.py b/pyopencl/cache.py
index 95aeb0eba885f997b98b71818cd1a1670d68176d..3d8f8fa98db087e019b531c89413b8ba16104d57 100644
--- a/pyopencl/cache.py
+++ b/pyopencl/cache.py
@@ -32,6 +32,10 @@ import sys
 import os
 from pytools import Record
 
+import logging
+logger = logging.getLogger(__name__)
+
+
 try:
     import hashlib
     new_hash = hashlib.md5
@@ -352,10 +356,14 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
         cache_result = retrieve_from_cache(cache_dir, cache_key)
 
         if cache_result is None:
+            logger.info("build program: binary cache miss (key: %s)" % cache_key)
+
             to_be_built_indices.append(i)
             binaries.append(None)
             logs.append(None)
         else:
+            logger.debug("build program: binary cache hit (key: %s)" % cache_key)
+
             binary, log = cache_result
             binaries.append(binary)
             logs.append(log)
@@ -382,9 +390,14 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes,
         src = src + "\n\n__constant int pyopencl_defeat_cache_%s = 0;" % (
                 uuid4().hex)
 
+        logger.info("build program: start building program from source on %s"
+                % ", ".join(str(devices[i]) for i in to_be_built_indices))
+
         prg = _cl._Program(ctx, src)
         prg.build(options_bytes, [devices[i] for i in to_be_built_indices])
 
+        logger.info("build program: from-source build complete")
+
         prg_devs = prg.get_info(_cl.program_info.DEVICES)
         prg_bins = prg.get_info(_cl.program_info.BINARIES)
         prg_logs = prg._get_build_logs()
diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py
index 16f18f1f4e8507ce56e12b344b9e19f5d68ebd13..c2e87ca1035bfa1e6363ea43a2e6c5205c218a72 100644
--- a/pyopencl/cffi_cl.py
+++ b/pyopencl/cffi_cl.py
@@ -33,6 +33,7 @@ import warnings
 from warnings import warn
 import numpy as np
 import sys
+import re
 
 from pytools import memoize_method
 
@@ -42,6 +43,9 @@ from .compyte.array import f_contiguous_strides, c_contiguous_strides
 
 from pyopencl._cffi import lib as _lib
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 class _CLKernelArg(object):
     pass
@@ -176,6 +180,9 @@ def _generic_info_to_python(info):
     elif type_ == 'cl_device_topology_amd*':
         ret = DeviceTopologyAmd(
                 value.pcie.bus, value.pcie.device, value.pcie.function)
+    elif type_ == 'cl_image_format*':
+        ret = ImageFormat(value.image_channel_order,
+                               value.image_channel_data_type)
     elif type_.startswith('char*['):
         ret = list(map(_ffi_pystr, value))
         _lib.free_pointer_array(info.value, len(value))
@@ -643,17 +650,6 @@ class Platform(_Common):
     def __repr__(self):
         return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr)
 
-    def _get_cl_version(self):
-        import re
-        version_string = self.version
-        match = re.match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", version_string)
-        if match is None:
-            raise RuntimeError("platform %s returned non-conformant "
-                               "platform version string '%s'" %
-                               (self, version_string))
-
-        return int(match.group(1)), int(match.group(2))
-
 
 def unload_platform_compiler(plat):
     _handle_error(_lib.platform__unload_compiler(plat.ptr))
@@ -692,6 +688,28 @@ class Device(_Common):
 # }}}
 
 
+# {{{ {Device,Platform}._get_cl_version
+
+_OPENCL_VERSION_STRING_RE = re.compile(r"^OpenCL ([0-9]+)\.([0-9]+) .*$")
+
+
+def _platdev_get_cl_version(self):
+    """Return the (major, minor) OpenCL version of a platform or device."""
+    version_string = self.version
+    match = _OPENCL_VERSION_STRING_RE.match(version_string)
+    if match is None:
+        raise RuntimeError("%s returned non-conformant OpenCL version "
+                           "string '%s'" % (self, version_string))
+
+    return int(match.group(1)), int(match.group(2))
+
+
+Platform._get_cl_version = _platdev_get_cl_version
+Device._get_cl_version = _platdev_get_cl_version
+
+# }}}
+
+
 # {{{ Context
 
 def _parse_context_properties(properties):
@@ -812,7 +830,7 @@ class CommandQueue(_Common):
         self.finish()
 
     def _get_cl_version(self):
-        return self.context._get_cl_version()
+        return self.device._get_cl_version()
 
 
 # }}}
@@ -1583,6 +1601,7 @@ class _Program(_Common):
         return build_logs
 
     def build(self, options_bytes, devices=None):
+        logger.debug("build program: start")
         err = None
         try:
             self._build(options=options_bytes, devices=devices)
@@ -1602,8 +1621,12 @@ class _Program(_Common):
         if err is not None:
             # Python 3.2 outputs the whole list of currently active exceptions
             # This serves to remove one (redundant) level from that nesting.
+
+            logger.debug("build program: completed, error")
             raise err
 
+        logger.debug("build program: completed, success")
+
         message = (75*"="+"\n").join(
                 "Build on %s succeeded, but said:\n\n%s" % (dev, log)
                 for dev, log in self._get_build_logs()
@@ -1839,7 +1862,7 @@ class Kernel(_Common):
         from pyopencl.characterize import has_struct_arg_count_bug
 
         count_bug_per_dev = [
-                has_struct_arg_count_bug(dev)
+                has_struct_arg_count_bug(dev, self.context)
                 for dev in self.context.devices]
 
         from pytools import single_valued
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
index b6ea1c53f4d812dd79389e17a11051d88a4be173..d03051897b8e35a1a114fa5e8f4cebd145bb1589 100644
--- a/pyopencl/characterize/__init__.py
+++ b/pyopencl/characterize/__init__.py
@@ -1,8 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -28,6 +24,8 @@ THE SOFTWARE.
 
 import pyopencl as cl
 from pytools import memoize
+import six
+from six.moves import range, zip
 
 
 class CLCharacterizationWarning(UserWarning):
@@ -322,14 +320,70 @@ def get_simd_group_size(dev, type_size):
     return None
 
 
-def has_struct_arg_count_bug(dev):
+def get_pocl_version(platform, fallback_value=None):
+    """Return pocl's (major, minor) version, or None if *platform* is not pocl."""
+    if platform.name != "Portable Computing Language":
+        return None
+
+    import re
+    match = re.match(
+            r"^OpenCL [0-9.]+ pocl ([0-9]+)\.([0-9]+)", platform.version)
+    if match is not None:
+        return (int(match.group(1)), int(match.group(2)))
+    # Unparseable version string: fail unless the caller supplied a fallback.
+    message = ("pocl version number did not have expected format: '%s'"
+                % platform.version)
+    if fallback_value is None:
+        raise ValueError(message)
+    from warnings import warn
+    warn(message)
+    return fallback_value
+
+
+_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE = {}
+
+
+def _check_for_pocl_arg_count_bug(dev, ctx=None):
+    """Build a probe kernel taking a single two-field struct and return whether
+    the implementation reports two arguments for it. Cached per *dev*.
+    """
+    try:
+        return _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev]
+    except KeyError:
+        pass
+
+    probe_ctx = cl.Context([dev]) if ctx is None else ctx
+
+    probe_prg = cl.Program(probe_ctx, """
+            struct two_things
+            {
+                long a;
+                long b;
+            };
+
+            __kernel void test_knl(struct two_things x)
+            {
+            }
+            """).build()
+
+    has_bug = probe_prg.test_knl.num_args == 2
+    _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = has_bug
+
+    return has_bug
+
+
+def has_struct_arg_count_bug(dev, ctx=None):
     """Checks whether the device is expected to have the
     `argument counting bug <https://github.com/pocl/pocl/issues/197>`_.
     """
 
     if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
         return "apple"
-    if (dev.platform.name == "Portable Computing Language"
-            and dev.address_bits == 64):
-        return "pocl"
+    if dev.platform.name == "Portable Computing Language":
+        # The fallback must be a (major, minor) tuple: it is compared to tuples.
+        pocl_version = get_pocl_version(dev.platform, fallback_value=(0, 14))
+        if pocl_version <= (0, 13):
+            return "pocl"
+        elif pocl_version <= (0, 14) and _check_for_pocl_arg_count_bug(dev, ctx):
+            return "pocl"
     return False
diff --git a/pyopencl/cl/pyopencl-complex.h b/pyopencl/cl/pyopencl-complex.h
index 7518e8108e1123fa157f943d05f833ba8615c28d..fe29aae86099fc1eecf1eba11fc14ce271852444 100644
--- a/pyopencl/cl/pyopencl-complex.h
+++ b/pyopencl/cl/pyopencl-complex.h
@@ -32,12 +32,12 @@
 
 #define PYOPENCL_DECLARE_COMPLEX_TYPE_INT(REAL_TP, REAL_3LTR, TPROOT, TP) \
   \
-  REAL_TP TPROOT##_real(TP a) { return a.real; } \
-  REAL_TP TPROOT##_imag(TP a) { return a.imag; } \
-  REAL_TP TPROOT##_abs(TP a) { return hypot(a.real, a.imag); } \
-  REAL_TP TPROOT##_abs_squared(TP a) { return a.real * a.real + a.imag * a.imag; } \
+  inline REAL_TP TPROOT##_real(TP a) { return a.real; } \
+  inline REAL_TP TPROOT##_imag(TP a) { return a.imag; }        \
+  inline REAL_TP TPROOT##_abs(TP a) { return hypot(a.real, a.imag); }   \
+  inline REAL_TP TPROOT##_abs_squared(TP a) { return a.real * a.real + a.imag * a.imag; } \
   \
-  TP TPROOT##_new(REAL_TP real, REAL_TP imag) \
+  inline TP TPROOT##_new(REAL_TP real, REAL_TP imag)  \
   { \
     TP result; \
     result.real = real; \
@@ -45,7 +45,7 @@
     return result; \
   } \
   \
-  TP TPROOT##_fromreal(REAL_TP real) \
+  inline TP TPROOT##_fromreal(REAL_TP real)     \
   { \
     TP result; \
     result.real = real; \
@@ -54,47 +54,47 @@
   } \
   \
   \
-  TP TPROOT##_neg(TP a) { return TPROOT##_new(-a.real, -a.imag); } \
-  TP TPROOT##_conj(TP a) { return TPROOT##_new(a.real, -a.imag); } \
+  inline TP TPROOT##_neg(TP a) { return TPROOT##_new(-a.real, -a.imag); } \
+  inline TP TPROOT##_conj(TP a) { return TPROOT##_new(a.real, -a.imag); } \
   \
-  TP TPROOT##_add(TP a, TP b) \
+  inline TP TPROOT##_add(TP a, TP b)            \
   { \
     return TPROOT##_new(a.real + b.real, a.imag + b.imag); \
     ; \
   } \
-  TP TPROOT##_addr(TP a, REAL_TP b) \
+  inline TP TPROOT##_addr(TP a, REAL_TP b)      \
   { \
     return TPROOT##_new(b+a.real, a.imag); \
   } \
-  TP TPROOT##_radd(REAL_TP a, TP b) \
+  inline TP TPROOT##_radd(REAL_TP a, TP b)      \
   { \
     return TPROOT##_new(a+b.real, b.imag); \
   } \
   \
-  TP TPROOT##_sub(TP a, TP b) \
+  inline TP TPROOT##_sub(TP a, TP b)            \
   { \
     return TPROOT##_new(a.real - b.real, a.imag - b.imag); \
     ; \
   } \
   \
-  TP TPROOT##_mul(TP a, TP b) \
+  inline TP TPROOT##_mul(TP a, TP b)            \
   { \
     return TPROOT##_new( \
         a.real*b.real - a.imag*b.imag, \
         a.real*b.imag + a.imag*b.real); \
   } \
   \
-  TP TPROOT##_mulr(TP a, REAL_TP b) \
+  inline TP TPROOT##_mulr(TP a, REAL_TP b)      \
   { \
     return TPROOT##_new(a.real*b, a.imag*b); \
   } \
   \
-  TP TPROOT##_rmul(REAL_TP a, TP b) \
+  inline TP TPROOT##_rmul(REAL_TP a, TP b)      \
   { \
     return TPROOT##_new(a*b.real, a*b.imag); \
   } \
   \
-  TP TPROOT##_rdivide(REAL_TP z1, TP z2) \
+  inline TP TPROOT##_rdivide(REAL_TP z1, TP z2) \
   { \
     if (fabs(z2.real) <= fabs(z2.imag)) { \
       REAL_TP ratio = z2.real / z2.imag; \
@@ -108,7 +108,7 @@
     } \
   } \
   \
-  TP TPROOT##_divide(TP z1, TP z2) \
+  inline TP TPROOT##_divide(TP z1, TP z2)       \
   { \
     REAL_TP ratio, denom, a, b, c, d; \
     \
@@ -134,12 +134,12 @@
        (c + d * ratio) / denom); \
   } \
   \
-  TP TPROOT##_divider(TP a, REAL_TP b) \
+  inline TP TPROOT##_divider(TP a, REAL_TP b)   \
   { \
     return TPROOT##_new(a.real/b, a.imag/b); \
   } \
   \
-  TP TPROOT##_pow(TP a, TP b) \
+  inline TP TPROOT##_pow(TP a, TP b)            \
   { \
     REAL_TP logr = log(hypot(a.real, a.imag)); \
     REAL_TP logi = atan2(a.imag, a.real); \
@@ -151,7 +151,7 @@
     return TPROOT##_new(x*cosy, x*siny); \
   } \
   \
-  TP TPROOT##_powr(TP a, REAL_TP b) \
+  inline TP TPROOT##_powr(TP a, REAL_TP b)      \
   { \
     REAL_TP logr = log(hypot(a.real, a.imag)); \
     REAL_TP logi = atan2(a.imag, a.real); \
@@ -164,7 +164,7 @@
     return TPROOT##_new(x * cosy, x*siny); \
   } \
   \
-  TP TPROOT##_rpow(REAL_TP a, TP b) \
+  inline TP TPROOT##_rpow(REAL_TP a, TP b)      \
   { \
     REAL_TP logr = log(a); \
     REAL_TP x = exp(logr * b.real); \
@@ -175,7 +175,7 @@
     return TPROOT##_new(x * cosy, x * siny); \
   } \
   \
-  TP TPROOT##_sqrt(TP a) \
+  inline TP TPROOT##_sqrt(TP a)                 \
   { \
     REAL_TP re = a.real; \
     REAL_TP im = a.imag; \
@@ -196,7 +196,7 @@
     return result; \
   } \
   \
-  TP TPROOT##_exp(TP a) \
+  inline TP TPROOT##_exp(TP a) \
   { \
     REAL_TP expr = exp(a.real); \
     REAL_TP cosi; \
@@ -204,24 +204,24 @@
     return TPROOT##_new(expr * cosi, expr * sini); \
   } \
   \
-  TP TPROOT##_log(TP a) \
+  inline TP TPROOT##_log(TP a)                                                 \
   { return TPROOT##_new(log(hypot(a.real, a.imag)), atan2(a.imag, a.real)); } \
   \
-  TP TPROOT##_sin(TP a) \
+  inline TP TPROOT##_sin(TP a) \
   { \
     REAL_TP cosr; \
     REAL_TP sinr = sincos(a.real, &cosr); \
     return TPROOT##_new(sinr*cosh(a.imag), cosr*sinh(a.imag)); \
   } \
   \
-  TP TPROOT##_cos(TP a) \
+  inline TP TPROOT##_cos(TP a) \
   { \
     REAL_TP cosr; \
     REAL_TP sinr = sincos(a.real, &cosr); \
     return TPROOT##_new(cosr*cosh(a.imag), -sinr*sinh(a.imag)); \
   } \
   \
-  TP TPROOT##_tan(TP a) \
+  inline TP TPROOT##_tan(TP a) \
   { \
     REAL_TP re2 = 2.f * a.real; \
     REAL_TP im2 = 2.f * a.imag; \
@@ -237,21 +237,21 @@
     } \
   } \
   \
-  TP TPROOT##_sinh(TP a) \
+  inline TP TPROOT##_sinh(TP a) \
   { \
     REAL_TP cosi; \
     REAL_TP sini = sincos(a.imag, &cosi); \
     return TPROOT##_new(sinh(a.real)*cosi, cosh(a.real)*sini); \
   } \
   \
-  TP TPROOT##_cosh(TP a) \
+  inline TP TPROOT##_cosh(TP a) \
   { \
     REAL_TP cosi; \
     REAL_TP sini = sincos(a.imag, &cosi); \
     return TPROOT##_new(cosh(a.real)*cosi, sinh(a.real)*sini); \
   } \
   \
-  TP TPROOT##_tanh(TP a) \
+  inline TP TPROOT##_tanh(TP a) \
   { \
     REAL_TP re2 = 2.f * a.real; \
     REAL_TP im2 = 2.f * a.imag; \
diff --git a/pyopencl/cl/pyopencl-random123/openclfeatures.h b/pyopencl/cl/pyopencl-random123/openclfeatures.h
index af03d3092318c6c27f1a65ce8104c1609b1e66e1..8403706f9d5b16c3ac2a44a7c122c531e20bc9ec 100644
--- a/pyopencl/cl/pyopencl-random123/openclfeatures.h
+++ b/pyopencl/cl/pyopencl-random123/openclfeatures.h
@@ -69,8 +69,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifndef R123_USE_MULHILO64_OPENCL_INTRIN
+#ifdef PYOPENCL_USING_OCLGRIND
+#define R123_USE_MULHILO64_OPENCL_INTRIN 0
+#else
 #define R123_USE_MULHILO64_OPENCL_INTRIN 1
 #endif
+#endif
 
 #ifndef R123_USE_AES_NI
 #define R123_USE_AES_NI 0
diff --git a/pyopencl/mempool.py b/pyopencl/mempool.py
index 1139edab691e86fdf955fc68c86e266fbb9bb8e6..071bdb98ffba1b1fc6462530c6739847d84cf8ee 100644
--- a/pyopencl/mempool.py
+++ b/pyopencl/mempool.py
@@ -248,7 +248,6 @@ class MemoryPool(object):
         for bin_nr, bin_list in six.iteritems(self.bin_nr_to_bin):
             while bin_list:
                 self.allocator.free(bin_list.pop())
-                self.held_blocks -= 1
                 yield
 
 
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 0ea9e01e28fdbeca4058290ca0352639f4597880..1f8d5d2ca79b9ed2638c18bc60dc38934deba26c 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -1,10 +1,6 @@
 """Scan primitive."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = """
 Copyright 2011-2012 Andreas Kloeckner
@@ -28,6 +24,9 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/
 
 """
 
+import six
+from six.moves import range, zip
+
 import numpy as np
 
 import pyopencl as cl
@@ -296,8 +295,11 @@ void ${kernel_name}(
 
             if (LID_0 == 0 && unit_base != interval_begin)
             {
+                scan_type tmp = ldata[K][WG_SIZE - 1].value;
+                scan_type tmp_aux = ldata[0][0].value;
+
                 ldata[0][0].value = SCAN_EXPR(
-                    ldata[K][WG_SIZE - 1].value, ldata[0][0].value,
+                    tmp, tmp_aux,
                     %if is_segmented:
                         (l_segment_start_flags[0][0])
                     %else:
@@ -761,7 +763,7 @@ _PREFIX_WORDS = set("""
         group_base seg_end my_val DEBUG ARGS
         ints_to_store ints_per_wg scan_types_per_int linear_index
         linear_scan_data_idx dest src store_base wrapped_scan_type
-        dummy scan_tmp
+        dummy scan_tmp tmp_aux
 
         LID_2 LID_1 LID_0
         LDIM_0 LDIM_1 LDIM_2
@@ -1051,7 +1053,7 @@ class GenericScanKernel(_GenericScanKernelBase):
                 output_statement="ary[i+1] = item;")
 
         a = cl.array.arange(queue, 10000, dtype=np.int32)
-        scan_kernel(a, queue=queue)
+        knl(a, queue=queue)
 
     """
 
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 36fafbc381441139b0bc62bf185005678ba2856c..c7bd5ed00280840f78d751f895e6013ea2154601 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -458,7 +458,7 @@ class _CDeclList:
         if dtype in vec.type_to_scalar_and_count:
             return
 
-        for name, field_data in six.iteritems(dtype.fields):
+        for name, field_data in sorted(six.iteritems(dtype.fields)):
             field_dtype, offset = field_data[:2]
             self.add_dtype(field_dtype)
 
@@ -846,12 +846,12 @@ class _TemplateRenderer(object):
         if arguments is not None:
             cdl.visit_arguments(arguments)
 
-        for tv in six.itervalues(self.type_aliases):
+        for _, tv in sorted(six.iteritems(self.type_aliases)):
             cdl.add_dtype(tv)
 
         type_alias_decls = [
                 "typedef %s %s;" % (dtype_to_ctype(val), name)
-                for name, val in six.iteritems(self.type_aliases)
+                for name, val in sorted(six.iteritems(self.type_aliases))
                 ]
 
         return cdl.get_declarations() + "\n" + "\n".join(type_alias_decls)
diff --git a/pyopencl/version.py b/pyopencl/version.py
index 598079b12416759bd8e51e2af4ced9f48d52a096..29cfe8d37511d59834a25a29676ab6d79ac86d48 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2016, 2)
+VERSION = (2016, 2, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/setup.py b/setup.py
index 1d735904cc533f3f39405bd1a9d8285b3f2a59e3..3d2ddd0e9d359374abebb2ee2330381000dfa738 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ def get_config_schema():
             IncludeDir, LibraryDir, Libraries, \
             Switch, StringListOption
 
-    default_cxxflags = ['-std=c++0x']
+    default_cxxflags = ['-std=gnu++11']
 
     if 'darwin' in sys.platform:
         import platform
diff --git a/src/c_wrapper/clinfo_ext.h b/src/c_wrapper/clinfo_ext.h
index 6094c52c18057b81e09526f3576c11042163e942..9263981e743a947b895cfd1344f2432cbebdbbd6 100644
--- a/src/c_wrapper/clinfo_ext.h
+++ b/src/c_wrapper/clinfo_ext.h
@@ -4,7 +4,7 @@
 #ifndef _EXT_H
 #define _EXT_H
 
-#ifdef __APPLE__
+#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
 #include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
diff --git a/src/c_wrapper/context.cpp b/src/c_wrapper/context.cpp
index 0a453d0ba797d396e1cdd2cd2ede6195f3d498e6..f2478fd73a54c11dbed830f6eace3e57f9ace215 100644
--- a/src/c_wrapper/context.cpp
+++ b/src/c_wrapper/context.cpp
@@ -68,7 +68,7 @@ context::get_info(cl_uint param_name) const
                 break;
 
 #if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1)
-#if defined(__APPLE__) && defined(HAVE_GL)
+#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H)
             case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE:
 #else
             case CL_GL_CONTEXT_KHR:
diff --git a/src/c_wrapper/gl_obj.cpp b/src/c_wrapper/gl_obj.cpp
index 6dfbec8c5ea11b78072964fb088d83849aa04763..bd7edf31d8ce772adae21047ab34e3eb925f1482 100644
--- a/src/c_wrapper/gl_obj.cpp
+++ b/src/c_wrapper/gl_obj.cpp
@@ -137,7 +137,7 @@ have_gl()
 cl_context_properties
 get_apple_cgl_share_group()
 {
-#ifdef __APPLE__
+#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
     #ifdef HAVE_GL
         CGLContextObj kCGLContext = CGLGetCurrentContext();
         CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
diff --git a/src/c_wrapper/pyopencl_ext.h b/src/c_wrapper/pyopencl_ext.h
index 4b5e7871e57d7c26a89830e5bc5bec4bb1c8667c..cd5d7112ea6325d1770f2fbf4431c33ef2edec0d 100644
--- a/src/c_wrapper/pyopencl_ext.h
+++ b/src/c_wrapper/pyopencl_ext.h
@@ -7,7 +7,7 @@
 
 #else
 
-#ifdef __APPLE__
+#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
 
 #include <OpenCL/opencl.h>
 
diff --git a/src/c_wrapper/wrap_cl.h b/src/c_wrapper/wrap_cl.h
index b097d12d9ebb32c092333721eef32ea847ba8ac5..21ff9c086805056e701186adf00070ed1eee48ed 100644
--- a/src/c_wrapper/wrap_cl.h
+++ b/src/c_wrapper/wrap_cl.h
@@ -13,7 +13,7 @@
 
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 
-#ifdef __APPLE__
+#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H))
 
 // {{{ Mac
 
diff --git a/src/c_wrapper/wrap_constants.cpp b/src/c_wrapper/wrap_constants.cpp
index 701638b98b9eb415bd7da4cfe50ea45707d23b90..16b1d1f49ba156f10c5b19b63e36f9915b00b77d 100644
--- a/src/c_wrapper/wrap_constants.cpp
+++ b/src/c_wrapper/wrap_constants.cpp
@@ -421,7 +421,7 @@ void populate_constants(void(*add)(const char*, const char*, int64_t value))
     ADD_ATTR("context_properties",  ,WGL_HDC_KHR);
     ADD_ATTR("context_properties",  ,CGL_SHAREGROUP_KHR);
 #endif
-#if defined(__APPLE__) && defined(HAVE_GL)
+#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H)
     ADD_ATTR("context_properties",  ,CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE);
 #endif /* __APPLE__ */
 #ifdef CL_CONTEXT_OFFLINE_DEVICES_AMD
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 374381ede72ad66951d5146613f4f7a00cc9311a..676390bd751116c334aa3a9425b92f7a32500fe3 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -386,7 +386,8 @@ def test_dot(ctx_factory):
 
             vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()
 
-            assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4
+            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
+            assert rel_err < 1e-4, rel_err
 
 
 @memoize
@@ -939,6 +940,9 @@ def test_bitonic_argsort(ctx_factory, size, dtype):
     queue = cl.CommandQueue(ctx)
 
     dev = ctx.devices[0]
+    if (dev.platform.name == "Portable Computing Language"
+            and sys.platform == "darwin"):
+        pytest.xfail("Bitonic sort crashes on Apple POCL")
     if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
         pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
             "parallelism")
diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py
index 14f0bc7df3191531099b6077754c0c77870ffda8..564e833a4bc167644bd057c5fa2117d17bda2cda 100644
--- a/test/test_enqueue_copy.py
+++ b/test/test_enqueue_copy.py
@@ -29,6 +29,7 @@ import pytest
 
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+from pyopencl.characterize import get_pocl_version
 
 
 def generate_slice(start, shape):
@@ -42,7 +43,9 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    if honor_skip and ctx.devices[0].platform.name == "Portable Computing Language":
+    if (honor_skip
+            and ctx.devices[0].platform.name == "Portable Computing Language"
+            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
         # https://github.com/pocl/pocl/issues/353
         pytest.skip("POCL's rectangular copies crash")
 
@@ -127,7 +130,9 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    if honor_skip and ctx.devices[0].platform.name == "Portable Computing Language":
+    if (honor_skip
+            and ctx.devices[0].platform.name == "Portable Computing Language"
+            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
         # https://github.com/pocl/pocl/issues/353
         pytest.skip("POCL's rectangular copies crash")
 
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index b1a23c2a00eb2289fd98dd5a228953109bf3ffef..a281e3105c5d17941821de6c103609a298d99a86 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -33,6 +33,7 @@ import pyopencl.array as cl_array
 import pyopencl.clrandom
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+from pyopencl.characterize import get_pocl_version
 
 # Are CL implementations crashy? You be the judge. :)
 try:
@@ -43,10 +44,11 @@ else:
     faulthandler.enable()
 
 
-def _skip_if_pocl(plat, msg='unsupported by pocl'):
+def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'):
     if plat.vendor == "The pocl project":
-        import pytest
-        pytest.skip(msg)
+        if up_to_version is None or get_pocl_version(plat) <= up_to_version:
+            import pytest
+            pytest.skip(msg)
 
 
 def test_get_info(ctx_factory):
@@ -363,7 +365,7 @@ def test_image_2d(ctx_factory):
     if "Intel" in device.vendor and "31360.31426" in device.version:
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP')
+    _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP')
 
     prg = cl.Program(context, """
         __kernel void copy_image(
@@ -435,7 +437,7 @@ def test_image_3d(ctx_factory):
     if device.platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("images crashy on %s" % device)
-    _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP')
+    _skip_if_pocl(device.platform, None, 'pocl does not support CL_ADDRESS_CLAMP')
 
     prg = cl.Program(context, """
         __kernel void copy_image_plane(
@@ -626,7 +628,8 @@ def test_can_build_binary(ctx_factory):
 
 def test_enqueue_barrier_marker(ctx_factory):
     ctx = ctx_factory()
-    _skip_if_pocl(ctx.devices[0].platform, 'pocl crashes on enqueue_barrier')
+    # Still relevant on pocl 0.14.
+    _skip_if_pocl(ctx.devices[0].platform, None, 'pocl crashes on enqueue_barrier')
     queue = cl.CommandQueue(ctx)
     cl.enqueue_barrier(queue)
     evt1 = cl.enqueue_marker(queue)
@@ -647,7 +650,7 @@ def test_unload_compiler(platform):
             cl.get_cl_header_version() < (1, 2)):
         from pytest import skip
         skip("clUnloadPlatformCompiler is only available in OpenCL 1.2")
-    _skip_if_pocl(platform, 'pocl does not support unloading compiler')
+    _skip_if_pocl(platform, (0, 13), 'pocl does not support unloading compiler')
     if platform.vendor == "Intel(R) Corporation":
         from pytest import skip
         skip("Intel proprietary driver does not support unloading compiler")
@@ -954,7 +957,8 @@ def test_coarse_grain_svm(ctx_factory):
         # https://bitbucket.org/pypy/numpy/issues/52
         assert isinstance(svm_ary.mem.base, cl.SVMAllocation)
 
-    if dev.platform.name != "Portable Computing Language":
+    if (dev.platform.name != "Portable Computing Language"
+            or get_pocl_version(dev.platform) >= (0, 14)):
         # pocl 0.13 has a bug misinterpreting the size parameter
         cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype))
 
@@ -980,7 +984,7 @@ def test_coarse_grain_svm(ctx_factory):
 
     if ctx.devices[0].platform.name != "Portable Computing Language":
         # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
-        # in pocl 0.13.
+        # in pocl 0.13 and 0.14-pre.
 
         cl.enqueue_copy(queue, new_ary, svm_ary)
         assert np.array_equal(orig_ary*2, new_ary)